20 files changed, 686 insertions, 345 deletions
diff --git a/library/std/src/sys/windows/alloc.rs b/library/std/src/sys/windows/alloc.rs
index fdc81cdea..d53ea1600 100644
--- a/library/std/src/sys/windows/alloc.rs
+++ b/library/std/src/sys/windows/alloc.rs
@@ -16,6 +16,7 @@ mod tests;
 // Flag to indicate that the memory returned by `HeapAlloc` should be zeroed.
 const HEAP_ZERO_MEMORY: c::DWORD = 0x00000008;
 
+#[link(name = "kernel32")]
 extern "system" {
     // Get a handle to the default heap of the current process, or null if the operation fails.
     //
@@ -168,7 +169,7 @@ unsafe fn allocate(layout: Layout, zeroed: bool) -> *mut u8 {
         // SAFETY: Because the size and alignment of a header is <= `MIN_ALIGN` and `aligned`
         // is aligned to at least `MIN_ALIGN` and has at least `MIN_ALIGN` bytes of padding before
         // it, it is safe to write a header directly before it.
-        unsafe { ptr::write((aligned as *mut Header).offset(-1), Header(ptr)) };
+        unsafe { ptr::write((aligned as *mut Header).sub(1), Header(ptr)) };
 
         // SAFETY: The returned pointer does not point to the to the start of an allocated block,
         // but there is a header readable directly before it containing the location of the start
@@ -213,7 +214,7 @@ unsafe impl GlobalAlloc for System {
 
                 // SAFETY: Because of the contract of `System`, `ptr` is guaranteed to be non-null
                 // and have a header readable directly before it.
-                unsafe { ptr::read((ptr as *mut Header).offset(-1)).0 }
+                unsafe { ptr::read((ptr as *mut Header).sub(1)).0 }
             }
         };
 
diff --git a/library/std/src/sys/windows/c.rs b/library/std/src/sys/windows/c.rs
index 478068c73..be6fc2ebb 100644
--- a/library/std/src/sys/windows/c.rs
+++ b/library/std/src/sys/windows/c.rs
@@ -66,10 +66,12 @@ pub type LPSYSTEM_INFO = *mut SYSTEM_INFO;
 pub type LPWSABUF = *mut WSABUF;
 pub type LPWSAOVERLAPPED = *mut c_void;
 pub type LPWSAOVERLAPPED_COMPLETION_ROUTINE = *mut c_void;
+pub type BCRYPT_ALG_HANDLE = LPVOID;
 
 pub type PCONDITION_VARIABLE = *mut CONDITION_VARIABLE;
 pub type PLARGE_INTEGER = *mut c_longlong;
 pub type PSRWLOCK = *mut SRWLOCK;
+pub type LPINIT_ONCE = *mut INIT_ONCE;
 
 pub type SOCKET = crate::os::windows::raw::SOCKET;
 pub type socklen_t = c_int;
@@ -125,6 +127,10 @@ pub const SECURITY_SQOS_PRESENT: DWORD = 0x00100000;
 
 pub const FIONBIO: c_ulong = 0x8004667e;
 
+pub const MAX_PATH: usize = 260;
+
+pub const FILE_TYPE_PIPE: u32 = 3;
+
 #[repr(C)]
 #[derive(Copy)]
 pub struct WIN32_FIND_DATAW {
@@ -193,6 +199,9 @@ pub const DUPLICATE_SAME_ACCESS: DWORD = 0x00000002;
 
 pub const CONDITION_VARIABLE_INIT: CONDITION_VARIABLE = CONDITION_VARIABLE { ptr: ptr::null_mut() };
 pub const SRWLOCK_INIT: SRWLOCK = SRWLOCK { ptr: ptr::null_mut() };
+pub const INIT_ONCE_STATIC_INIT: INIT_ONCE = INIT_ONCE { ptr: ptr::null_mut() };
+
+pub const INIT_ONCE_INIT_FAILED: DWORD = 0x00000004;
 
 pub const DETACHED_PROCESS: DWORD = 0x00000008;
 pub const CREATE_NEW_PROCESS_GROUP: DWORD = 0x00000200;
@@ -285,6 +294,8 @@ pub fn nt_success(status: NTSTATUS) -> bool {
     status >= 0
 }
 
+// "RNG\0"
+pub const BCRYPT_RNG_ALGORITHM: &[u16] = &[b'R' as u16, b'N' as u16, b'G' as u16, 0];
 pub const BCRYPT_USE_SYSTEM_PREFERRED_RNG: DWORD = 0x00000002;
 
 #[repr(C)]
@@ -455,6 +466,12 @@ pub enum FILE_INFO_BY_HANDLE_CLASS {
 }
 
 #[repr(C)]
+pub struct FILE_ATTRIBUTE_TAG_INFO {
+    pub FileAttributes: DWORD,
+    pub ReparseTag: DWORD,
+}
+
+#[repr(C)]
 pub struct FILE_DISPOSITION_INFO {
     pub DeleteFile: BOOLEAN,
 }
@@ -501,6 +518,8 @@ pub struct FILE_END_OF_FILE_INFO {
     pub EndOfFile: LARGE_INTEGER,
 }
 
+/// NB: Use carefully! In general using this as a reference is likely to get the
+/// provenance wrong for the `rest` field!
 #[repr(C)]
 pub struct REPARSE_DATA_BUFFER {
     pub ReparseTag: c_uint,
@@ -509,6 +528,8 @@ pub struct REPARSE_DATA_BUFFER {
     pub rest: (),
 }
 
+/// NB: Use carefully! In general using this as a reference is likely to get the
+/// provenance wrong for the `PathBuffer` field!
 #[repr(C)]
 pub struct SYMBOLIC_LINK_REPARSE_BUFFER {
     pub SubstituteNameOffset: c_ushort,
@@ -519,6 +540,14 @@ pub struct SYMBOLIC_LINK_REPARSE_BUFFER {
     pub PathBuffer: WCHAR,
 }
 
+/// NB: Use carefully! In general using this as a reference is likely to get the
+/// provenance wrong for the `PathBuffer` field!
+#[repr(C)]
+pub struct FILE_NAME_INFO {
+    pub FileNameLength: DWORD,
+    pub FileName: [WCHAR; 1],
+}
+
 #[repr(C)]
 pub struct MOUNT_POINT_REPARSE_BUFFER {
     pub SubstituteNameOffset: c_ushort,
@@ -550,6 +579,10 @@ pub struct CONDITION_VARIABLE {
 pub struct SRWLOCK {
     pub ptr: LPVOID,
 }
+#[repr(C)]
+pub struct INIT_ONCE {
+    pub ptr: LPVOID,
+}
 
 #[repr(C)]
 pub struct REPARSE_MOUNTPOINT_DATA_BUFFER {
@@ -802,10 +835,6 @@ if #[cfg(not(target_vendor = "uwp"))] {
 
     #[link(name = "advapi32")]
     extern "system" {
-        // Forbidden when targeting UWP
-        #[link_name = "SystemFunction036"]
-        pub fn RtlGenRandom(RandomBuffer: *mut u8, RandomBufferLength: ULONG) -> BOOLEAN;
-
         // Allowed but unused by UWP
         pub fn OpenProcessToken(
             ProcessHandle: HANDLE,
@@ -944,6 +973,7 @@ extern "system" {
     pub fn TlsAlloc() -> DWORD;
     pub fn TlsGetValue(dwTlsIndex: DWORD) -> LPVOID;
     pub fn TlsSetValue(dwTlsIndex: DWORD, lpTlsvalue: LPVOID) -> BOOL;
+    pub fn TlsFree(dwTlsIndex: DWORD) -> BOOL;
     pub fn GetLastError() -> DWORD;
     pub fn QueryPerformanceFrequency(lpFrequency: *mut LARGE_INTEGER) -> BOOL;
     pub fn QueryPerformanceCounter(lpPerformanceCount: *mut LARGE_INTEGER) -> BOOL;
@@ -1086,6 +1116,7 @@ extern "system" {
         lpFileInformation: LPVOID,
         dwBufferSize: DWORD,
     ) -> BOOL;
+    pub fn GetFileType(hfile: HANDLE) -> DWORD;
     pub fn SleepConditionVariableSRW(
         ConditionVariable: PCONDITION_VARIABLE,
         SRWLock: PSRWLOCK,
@@ -1103,6 +1134,14 @@ extern "system" {
     pub fn TryAcquireSRWLockExclusive(SRWLock: PSRWLOCK) -> BOOLEAN;
     pub fn TryAcquireSRWLockShared(SRWLock: PSRWLOCK) -> BOOLEAN;
 
+    pub fn InitOnceBeginInitialize(
+        lpInitOnce: LPINIT_ONCE,
+        dwFlags: DWORD,
+        fPending: LPBOOL,
+        lpContext: *mut LPVOID,
+    ) -> BOOL;
+    pub fn InitOnceComplete(lpInitOnce: LPINIT_ONCE, dwFlags: DWORD, lpContext: LPVOID) -> BOOL;
+
     pub fn CompareStringOrdinal(
         lpString1: LPCWSTR,
         cchCount1: c_int,
@@ -1217,11 +1256,18 @@ extern "system" {
     // >= Vista / Server 2008
     // https://docs.microsoft.com/en-us/windows/win32/api/bcrypt/nf-bcrypt-bcryptgenrandom
     pub fn BCryptGenRandom(
-        hAlgorithm: LPVOID,
+        hAlgorithm: BCRYPT_ALG_HANDLE,
         pBuffer: *mut u8,
         cbBuffer: ULONG,
         dwFlags: ULONG,
     ) -> NTSTATUS;
+    pub fn BCryptOpenAlgorithmProvider(
+        phalgorithm: *mut BCRYPT_ALG_HANDLE,
+        pszAlgId: LPCWSTR,
+        pszimplementation: LPCWSTR,
+        dwflags: ULONG,
+    ) -> NTSTATUS;
+    pub fn BCryptCloseAlgorithmProvider(hAlgorithm: BCRYPT_ALG_HANDLE, dwFlags: ULONG) -> NTSTATUS;
 }
 
 // Functions that aren't available on every version of Windows that we support,
@@ -1251,17 +1297,14 @@ compat_fn_with_fallback! {
 }
 
 compat_fn_optional! {
-    pub static SYNCH_API: &CStr = ansi_str!("api-ms-win-core-synch-l1-2-0");
-
-    // >= Windows 8 / Server 2012
-    // https://docs.microsoft.com/en-us/windows/win32/api/synchapi/nf-synchapi-waitonaddress
+    crate::sys::compat::load_synch_functions();
     pub fn WaitOnAddress(
         Address: LPVOID,
         CompareAddress: LPVOID,
         AddressSize: SIZE_T,
         dwMilliseconds: DWORD
-    ) -> BOOL;
-    pub fn WakeByAddressSingle(Address: LPVOID) -> ();
+    );
+    pub fn WakeByAddressSingle(Address: LPVOID);
 }
 
 compat_fn_with_fallback! {
diff --git a/library/std/src/sys/windows/cmath.rs b/library/std/src/sys/windows/cmath.rs
index 1a5421fac..43ab8c7ee 100644
--- a/library/std/src/sys/windows/cmath.rs
+++ b/library/std/src/sys/windows/cmath.rs
@@ -44,7 +44,7 @@ mod shims {
 }
 
 // On 32-bit x86 MSVC these functions aren't defined, so we just define shims
-// which promote everything fo f64, perform the calculation, and then demote
+// which promote everything to f64, perform the calculation, and then demote
 // back to f32. While not precisely correct should be "correct enough" for now.
 #[cfg(all(target_env = "msvc", target_arch = "x86"))]
 mod shims {
diff --git a/library/std/src/sys/windows/compat.rs b/library/std/src/sys/windows/compat.rs
index ccc90177a..7dff81ecb 100644
--- a/library/std/src/sys/windows/compat.rs
+++ b/library/std/src/sys/windows/compat.rs
@@ -7,52 +7,66 @@
 //! `GetModuleHandle` and `GetProcAddress` to look up DLL entry points at
 //! runtime.
 //!
-//! This implementation uses a static initializer to look up the DLL entry
-//! points. The CRT (C runtime) executes static initializers before `main`
-//! is called (for binaries) and before `DllMain` is called (for DLLs).
-//! This is the ideal time to look up DLL imports, because we are guaranteed
-//! that no other threads will attempt to call these entry points. Thus,
-//! we can look up the imports and store them in `static mut` fields
-//! without any synchronization.
+//! This is implemented simply by storing a function pointer in an atomic.
+//! Loading and calling this function will have little or no overhead
+//! compared with calling any other dynamically imported function.
 //!
-//! This has an additional advantage: Because the DLL import lookup happens
-//! at module initialization, the cost of these lookups is deterministic,
-//! and is removed from the code paths that actually call the DLL imports.
-//! That is, there is no unpredictable "cache miss" that occurs when calling
-//! a DLL import. For applications that benefit from predictable delays,
-//! this is a benefit. This also eliminates the comparison-and-branch
-//! from the hot path.
-//!
-//! Currently, the standard library uses only a small number of dynamic
-//! DLL imports. If this number grows substantially, then the cost of
-//! performing all of the lookups at initialization time might become
-//! substantial.
-//!
-//! The mechanism of registering a static initializer with the CRT is
-//! documented in
-//! [CRT Initialization](https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-initialization?view=msvc-160).
-//! It works by contributing a global symbol to the `.CRT$XCU` section.
-//! The linker builds a table of all static initializer functions.
-//! The CRT startup code then iterates that table, calling each
-//! initializer function.
-//!
-//! # **WARNING!!*
-//! The environment that a static initializer function runs in is highly
-//! constrained. There are **many** restrictions on what static initializers
-//! can safely do. Static initializer functions **MUST NOT** do any of the
-//! following (this list is not comprehensive):
-//! * touch any other static field that is used by a different static
-//!   initializer, because the order that static initializers run in
-//!   is not defined.
-//! * call `LoadLibrary` or any other function that acquires the DLL
-//!   loader lock.
-//! * call any Rust function or CRT function that touches any static
-//!   (global) state.
+//! The stored function pointer starts out as an importer function which will
+//! swap itself with the real function when it's called for the first time. If
+//! the real function can't be imported then a fallback function is used in its
+//! place. While this is low cost for the happy path (where the function is
+//! already loaded) it does mean there's some overhead the first time the
+//! function is called. In the worst case, multiple threads may all end up
+//! importing the same function unnecessarily.
 
 use crate::ffi::{c_void, CStr};
 use crate::ptr::NonNull;
+use crate::sync::atomic::Ordering;
 use crate::sys::c;
 
+// This uses a static initializer to preload some imported functions.
+// The CRT (C runtime) executes static initializers before `main`
+// is called (for binaries) and before `DllMain` is called (for DLLs).
+//
+// It works by contributing a global symbol to the `.CRT$XCT` section.
+// The linker builds a table of all static initializer functions.
+// The CRT startup code then iterates that table, calling each
+// initializer function.
+//
+// NOTE: User code should instead use .CRT$XCU to reliably run after std's initializer.
+// If you're reading this and would like a guarantee here, please
+// file an issue for discussion; currently we don't guarantee any functionality
+// before main.
+// See https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-initialization?view=msvc-170
+#[used]
+#[link_section = ".CRT$XCT"]
+static INIT_TABLE_ENTRY: unsafe extern "C" fn() = init;
+
+/// Preload some imported functions.
+///
+/// Note that any functions included here will be unconditionally loaded in
+/// the final binary, regardless of whether or not they're actually used.
+///
+/// Therefore, this should be limited to `compat_fn_optional` functions which
+/// must be preloaded or any functions where lazier loading demonstrates a
+/// negative performance impact in practical situations.
+///
+/// Currently we only preload `WaitOnAddress` and `WakeByAddressSingle`.
+unsafe extern "C" fn init() {
+    // In an exe this code is executed before main() so is single threaded.
+    // In a DLL the system's loader lock will be held thereby synchronizing
+    // access. So the same best practices apply here as they do to running in DllMain:
+    // https://docs.microsoft.com/en-us/windows/win32/dlls/dynamic-link-library-best-practices
+    //
+    // DO NOT do anything interesting or complicated in this function! DO NOT call
+    // any Rust functions or CRT functions if those functions touch any global state,
+    // because this function runs during global initialization. For example, DO NOT
+    // do any dynamic allocation, don't call LoadLibrary, etc.
+
+    // Attempt to preload the synch functions.
+    load_synch_functions();
+}
+
 /// Helper macro for creating CStrs from literals and symbol names.
 macro_rules! ansi_str {
     (sym $ident:ident) => {{
@@ -85,39 +99,6 @@ pub(crate) const fn const_cstr_from_bytes(bytes: &'static [u8]) -> &'static CStr
     unsafe { crate::ffi::CStr::from_bytes_with_nul_unchecked(bytes) }
 }
 
-#[used]
-#[link_section = ".CRT$XCU"]
-static INIT_TABLE_ENTRY: unsafe extern "C" fn() = init;
-
-/// This is where the magic preloading of symbols happens.
-///
-/// Note that any functions included here will be unconditionally included in
-/// the final binary, regardless of whether or not they're actually used.
-///
-/// Therefore, this is limited to `compat_fn_optional` functions which must be
-/// preloaded and any functions which may be more time sensitive, even for the first call.
-unsafe extern "C" fn init() {
-    // There is no locking here. This code is executed before main() is entered, and
-    // is guaranteed to be single-threaded.
-    //
-    // DO NOT do anything interesting or complicated in this function! DO NOT call
-    // any Rust functions or CRT functions if those functions touch any global state,
-    // because this function runs during global initialization. For example, DO NOT
-    // do any dynamic allocation, don't call LoadLibrary, etc.
-
-    if let Some(synch) = Module::new(c::SYNCH_API) {
-        // These are optional and so we must manually attempt to load them
-        // before they can be used.
-        c::WaitOnAddress::preload(synch);
-        c::WakeByAddressSingle::preload(synch);
-    }
-
-    if let Some(kernel32) = Module::new(c::KERNEL32) {
-        // Preloading this means getting a precise time will be as fast as possible.
-        c::GetSystemTimePreciseAsFileTime::preload(kernel32);
-    }
-}
-
 /// Represents a loaded module.
 ///
 /// Note that the modules std depends on must not be unloaded.
@@ -151,7 +132,7 @@ impl Module {
 macro_rules! compat_fn_with_fallback {
     (pub static $module:ident: &CStr = $name:expr; $(
         $(#[$meta:meta])*
-        pub fn $symbol:ident($($argname:ident: $argtype:ty),*) -> $rettype:ty $fallback_body:block
+        $vis:vis fn $symbol:ident($($argname:ident: $argtype:ty),*) -> $rettype:ty $fallback_body:block
     )*) => (
         pub static $module: &CStr = $name;
     $(
@@ -196,11 +177,6 @@ macro_rules! compat_fn_with_fallback {
                 $fallback_body
             }
 
-            #[allow(unused)]
-            pub(in crate::sys) fn preload(module: Module) {
-                load_from_module(Some(module));
-            }
-
             #[inline(always)]
             pub unsafe fn call($($argname: $argtype),*) -> $rettype {
                 let func: F = mem::transmute(PTR.load(Ordering::Relaxed));
@@ -208,66 +184,60 @@ macro_rules! compat_fn_with_fallback {
             }
         }
         $(#[$meta])*
-        pub use $symbol::call as $symbol;
+        $vis use $symbol::call as $symbol;
     )*)
 }
 
-/// A function that either exists or doesn't.
+/// Optionally loaded functions.
 ///
-/// NOTE: Optional functions must be preloaded in the `init` function above, or they will always be None.
+/// Actual loading of the function defers to $load_functions.
 macro_rules! compat_fn_optional {
-    (pub static $module:ident: &CStr = $name:expr; $(
-        $(#[$meta:meta])*
-        pub fn $symbol:ident($($argname:ident: $argtype:ty),*) -> $rettype:ty;
-    )*) => (
-        pub static $module: &CStr = $name;
+    ($load_functions:expr;
     $(
-        $(#[$meta])*
-        pub mod $symbol {
-            #[allow(unused_imports)]
-            use super::*;
-            use crate::mem;
-            use crate::sync::atomic::{AtomicPtr, Ordering};
-            use crate::sys::compat::Module;
-            use crate::ptr::{self, NonNull};
-
-            type F = unsafe extern "system" fn($($argtype),*) -> $rettype;
-
-            /// `PTR` will either be `null()` or set to the loaded function.
-            static PTR: AtomicPtr<c_void> = AtomicPtr::new(ptr::null_mut());
-
-            /// Only allow access to the function if it has loaded successfully.
-            #[inline(always)]
-            #[cfg(not(miri))]
-            pub fn option() -> Option<F> {
-                unsafe {
-                    NonNull::new(PTR.load(Ordering::Relaxed)).map(|f| mem::transmute(f))
+        $(#[$meta:meta])*
+        $vis:vis fn $symbol:ident($($argname:ident: $argtype:ty),*) $(-> $rettype:ty)?;
+    )+) => (
+        $(
+            pub mod $symbol {
+                use super::*;
+                use crate::ffi::c_void;
+                use crate::mem;
+                use crate::ptr::{self, NonNull};
+                use crate::sync::atomic::{AtomicPtr, Ordering};
+
+                pub(in crate::sys) static PTR: AtomicPtr<c_void> = AtomicPtr::new(ptr::null_mut());
+
+                type F = unsafe extern "system" fn($($argtype),*) $(-> $rettype)?;
+
+                #[inline(always)]
+                pub fn option() -> Option<F> {
+                    // Miri does not understand the way we do preloading
+                    // therefore load the function here instead.
+                    #[cfg(miri)] $load_functions;
+                    NonNull::new(PTR.load(Ordering::Relaxed)).map(|f| unsafe { mem::transmute(f) })
                 }
             }
+        )+
+    )
+}
 
-            // Miri does not understand the way we do preloading
-            // therefore load the function here instead.
-            #[cfg(miri)]
-            pub fn option() -> Option<F> {
-                let mut func = NonNull::new(PTR.load(Ordering::Relaxed));
-                if func.is_none() {
-                    unsafe { Module::new($module).map(preload) };
-                    func = NonNull::new(PTR.load(Ordering::Relaxed));
-                }
-                unsafe {
-                    func.map(|f| mem::transmute(f))
-                }
-            }
+/// Load all needed functions from "api-ms-win-core-synch-l1-2-0".
+pub(super) fn load_synch_functions() {
+    fn try_load() -> Option<()> {
+        const MODULE_NAME: &CStr = ansi_str!("api-ms-win-core-synch-l1-2-0");
+        const WAIT_ON_ADDRESS: &CStr = ansi_str!("WaitOnAddress");
+        const WAKE_BY_ADDRESS_SINGLE: &CStr = ansi_str!("WakeByAddressSingle");
+
+        // Try loading the library and all the required functions.
+        // If any step fails, then they all fail.
+        let library = unsafe { Module::new(MODULE_NAME) }?;
+        let wait_on_address = library.proc_address(WAIT_ON_ADDRESS)?;
+        let wake_by_address_single = library.proc_address(WAKE_BY_ADDRESS_SINGLE)?;
+
+        c::WaitOnAddress::PTR.store(wait_on_address.as_ptr(), Ordering::Relaxed);
+        c::WakeByAddressSingle::PTR.store(wake_by_address_single.as_ptr(), Ordering::Relaxed);
+        Some(())
+    }
 
-            #[allow(unused)]
-            pub(in crate::sys) fn preload(module: Module) {
-                unsafe {
-                    static SYMBOL_NAME: &CStr = ansi_str!(sym $symbol);
-                    if let Some(f) = module.proc_address(SYMBOL_NAME) {
-                        PTR.store(f.as_ptr(), Ordering::Relaxed);
-                    }
-                }
-            }
-        }
-    )*)
+    try_load();
 }
diff --git a/library/std/src/sys/windows/fs.rs b/library/std/src/sys/windows/fs.rs
index aed082b3e..378098038 100644
--- a/library/std/src/sys/windows/fs.rs
+++ b/library/std/src/sys/windows/fs.rs
@@ -1,9 +1,10 @@
 use crate::os::windows::prelude::*;
 
+use crate::borrow::Cow;
 use crate::ffi::OsString;
 use crate::fmt;
-use crate::io::{self, Error, IoSlice, IoSliceMut, ReadBuf, SeekFrom};
-use crate::mem;
+use crate::io::{self, BorrowedCursor, Error, IoSlice, IoSliceMut, SeekFrom};
+use crate::mem::{self, MaybeUninit};
 use crate::os::windows::io::{AsHandle, BorrowedHandle};
 use crate::path::{Path, PathBuf};
 use crate::ptr;
@@ -11,7 +12,7 @@ use crate::slice;
 use crate::sync::Arc;
 use crate::sys::handle::Handle;
 use crate::sys::time::SystemTime;
-use crate::sys::{c, cvt};
+use crate::sys::{c, cvt, Align8};
 use crate::sys_common::{AsInner, FromInner, IntoInner};
 use crate::thread;
 
@@ -326,9 +327,15 @@ impl File {
             cvt(c::GetFileInformationByHandle(self.handle.as_raw_handle(), &mut info))?;
             let mut reparse_tag = 0;
             if info.dwFileAttributes & c::FILE_ATTRIBUTE_REPARSE_POINT != 0 {
-                let mut b = [0; c::MAXIMUM_REPARSE_DATA_BUFFER_SIZE];
-                if let Ok((_, buf)) = self.reparse_point(&mut b) {
-                    reparse_tag = buf.ReparseTag;
+                let mut attr_tag: c::FILE_ATTRIBUTE_TAG_INFO = mem::zeroed();
+                cvt(c::GetFileInformationByHandleEx(
+                    self.handle.as_raw_handle(),
+                    c::FileAttributeTagInfo,
+                    ptr::addr_of_mut!(attr_tag).cast(),
+                    mem::size_of::<c::FILE_ATTRIBUTE_TAG_INFO>().try_into().unwrap(),
+                ))?;
+                if attr_tag.FileAttributes & c::FILE_ATTRIBUTE_REPARSE_POINT != 0 {
+                    reparse_tag = attr_tag.ReparseTag;
                 }
             }
             Ok(FileAttr {
@@ -389,9 +396,15 @@ impl File {
             attr.file_size = info.AllocationSize as u64;
             attr.number_of_links = Some(info.NumberOfLinks);
             if attr.file_type().is_reparse_point() {
-                let mut b = [0; c::MAXIMUM_REPARSE_DATA_BUFFER_SIZE];
-                if let Ok((_, buf)) = self.reparse_point(&mut b) {
-                    attr.reparse_tag = buf.ReparseTag;
+                let mut attr_tag: c::FILE_ATTRIBUTE_TAG_INFO = mem::zeroed();
+                cvt(c::GetFileInformationByHandleEx(
+                    self.handle.as_raw_handle(),
+                    c::FileAttributeTagInfo,
+                    ptr::addr_of_mut!(attr_tag).cast(),
+                    mem::size_of::<c::FILE_ATTRIBUTE_TAG_INFO>().try_into().unwrap(),
+                ))?;
+                if attr_tag.FileAttributes & c::FILE_ATTRIBUTE_REPARSE_POINT != 0 {
+                    attr.reparse_tag = attr_tag.ReparseTag;
                 }
             }
             Ok(attr)
@@ -415,8 +428,8 @@ impl File {
         self.handle.read_at(buf, offset)
     }
 
-    pub fn read_buf(&self, buf: &mut ReadBuf<'_>) -> io::Result<()> {
-        self.handle.read_buf(buf)
+    pub fn read_buf(&self, cursor: BorrowedCursor<'_>) -> io::Result<()> {
+        self.handle.read_buf(cursor)
     }
 
     pub fn write(&self, buf: &[u8]) -> io::Result<usize> {
@@ -458,38 +471,46 @@ impl File {
         Ok(Self { handle: self.handle.try_clone()? })
     }
 
-    fn reparse_point<'a>(
+    // NB: returned pointer is derived from `space`, and has provenance to
+    // match. A raw pointer is returned rather than a reference in order to
+    // avoid narrowing provenance to the actual `REPARSE_DATA_BUFFER`.
+    fn reparse_point(
         &self,
-        space: &'a mut [u8; c::MAXIMUM_REPARSE_DATA_BUFFER_SIZE],
-    ) -> io::Result<(c::DWORD, &'a c::REPARSE_DATA_BUFFER)> {
+        space: &mut Align8<[MaybeUninit<u8>]>,
+    ) -> io::Result<(c::DWORD, *const c::REPARSE_DATA_BUFFER)> {
         unsafe {
             let mut bytes = 0;
             cvt({
+                // Grab this in advance to avoid it invalidating the pointer
+                // we get from `space.0.as_mut_ptr()`.
+                let len = space.0.len();
                 c::DeviceIoControl(
                     self.handle.as_raw_handle(),
                     c::FSCTL_GET_REPARSE_POINT,
                     ptr::null_mut(),
                     0,
-                    space.as_mut_ptr() as *mut _,
-                    space.len() as c::DWORD,
+                    space.0.as_mut_ptr().cast(),
+                    len as c::DWORD,
                     &mut bytes,
                     ptr::null_mut(),
                 )
             })?;
-            Ok((bytes, &*(space.as_ptr() as *const c::REPARSE_DATA_BUFFER)))
+            const _: () = assert!(core::mem::align_of::<c::REPARSE_DATA_BUFFER>() <= 8);
+            Ok((bytes, space.0.as_ptr().cast::<c::REPARSE_DATA_BUFFER>()))
         }
     }
 
     fn readlink(&self) -> io::Result<PathBuf> {
-        let mut space = [0u8; c::MAXIMUM_REPARSE_DATA_BUFFER_SIZE];
+        let mut space = Align8([MaybeUninit::<u8>::uninit(); c::MAXIMUM_REPARSE_DATA_BUFFER_SIZE]);
         let (_bytes, buf) = self.reparse_point(&mut space)?;
         unsafe {
-            let (path_buffer, subst_off, subst_len, relative) = match buf.ReparseTag {
+            let (path_buffer, subst_off, subst_len, relative) = match (*buf).ReparseTag {
                 c::IO_REPARSE_TAG_SYMLINK => {
                     let info: *const c::SYMBOLIC_LINK_REPARSE_BUFFER =
-                        &buf.rest as *const _ as *const _;
+                        ptr::addr_of!((*buf).rest).cast();
+                    assert!(info.is_aligned());
                     (
-                        &(*info).PathBuffer as *const _ as *const u16,
+                        ptr::addr_of!((*info).PathBuffer).cast::<u16>(),
                         (*info).SubstituteNameOffset / 2,
                         (*info).SubstituteNameLength / 2,
                         (*info).Flags & c::SYMLINK_FLAG_RELATIVE != 0,
@@ -497,9 +518,10 @@ impl File {
                 }
                 c::IO_REPARSE_TAG_MOUNT_POINT => {
                     let info: *const c::MOUNT_POINT_REPARSE_BUFFER =
-                        &buf.rest as *const _ as *const _;
+                        ptr::addr_of!((*buf).rest).cast();
+                    assert!(info.is_aligned());
                     (
-                        &(*info).PathBuffer as *const _ as *const u16,
+                        ptr::addr_of!((*info).PathBuffer).cast::<u16>(),
                         (*info).SubstituteNameOffset / 2,
                         (*info).SubstituteNameLength / 2,
                         false,
@@ -512,7 +534,7 @@ impl File {
                     ));
                 }
             };
-            let subst_ptr = path_buffer.offset(subst_off as isize);
+            let subst_ptr = path_buffer.add(subst_off.into());
             let mut subst = slice::from_raw_parts(subst_ptr, subst_len as usize);
             // Absolute paths start with an NT internal namespace prefix `\??\`
             // We should not let it leak through.
@@ -551,6 +573,14 @@ impl File {
                 "Cannot set file timestamp to 0",
             ));
         }
+        let is_max =
+            |t: c::FILETIME| t.dwLowDateTime == c::DWORD::MAX && t.dwHighDateTime == c::DWORD::MAX;
+        if times.accessed.map_or(false, is_max) || times.modified.map_or(false, is_max) {
+            return Err(io::const_io_error!(
+                io::ErrorKind::InvalidInput,
+                "Cannot set file timestamp to 0xFFFF_FFFF_FFFF_FFFF",
+            ));
+        }
         cvt(unsafe {
             c::SetFileTime(self.as_handle(), None, times.accessed.as_ref(), times.modified.as_ref())
         })?;
@@ -649,27 +679,31 @@ impl File {
 
 /// A buffer for holding directory entries.
 struct DirBuff {
-    buffer: Vec<u8>,
+    buffer: Box<Align8<[MaybeUninit<u8>; Self::BUFFER_SIZE]>>,
 }
 impl DirBuff {
+    const BUFFER_SIZE: usize = 1024;
     fn new() -> Self {
-        const BUFFER_SIZE: usize = 1024;
-        Self { buffer: vec![0_u8; BUFFER_SIZE] }
+        Self {
+            // Safety: `Align8<[MaybeUninit<u8>; N]>` does not need
+            // initialization.
+            buffer: unsafe { Box::new_uninit().assume_init() },
+        }
     }
     fn capacity(&self) -> usize {
-        self.buffer.len()
+        self.buffer.0.len()
     }
     fn as_mut_ptr(&mut self) -> *mut u8 {
-        self.buffer.as_mut_ptr().cast()
+        self.buffer.0.as_mut_ptr().cast()
     }
     /// Returns a `DirBuffIter`.
     fn iter(&self) -> DirBuffIter<'_> {
         DirBuffIter::new(self)
     }
 }
-impl AsRef<[u8]> for DirBuff {
-    fn as_ref(&self) -> &[u8] {
-        &self.buffer
+impl AsRef<[MaybeUninit<u8>]> for DirBuff {
+    fn as_ref(&self) -> &[MaybeUninit<u8>] {
+        &self.buffer.0
     }
 }
 
@@ -677,7 +711,7 @@ impl AsRef<[u8]> for DirBuff {
 ///
 /// Currently only returns file names (UTF-16 encoded).
 struct DirBuffIter<'a> {
-    buffer: Option<&'a [u8]>,
+    buffer: Option<&'a [MaybeUninit<u8>]>,
     cursor: usize,
 }
 impl<'a> DirBuffIter<'a> {
@@ -686,23 +720,34 @@ impl<'a> DirBuffIter<'a> {
     }
 }
 impl<'a> Iterator for DirBuffIter<'a> {
-    type Item = (&'a [u16], bool);
+    type Item = (Cow<'a, [u16]>, bool);
     fn next(&mut self) -> Option<Self::Item> {
         use crate::mem::size_of;
         let buffer = &self.buffer?[self.cursor..];
 
         // Get the name and next entry from the buffer.
-        // SAFETY: The buffer contains a `FILE_ID_BOTH_DIR_INFO` struct but the
-        // last field (the file name) is unsized. So an offset has to be
-        // used to get the file name slice.
+        // SAFETY:
+        // - The buffer contains a `FILE_ID_BOTH_DIR_INFO` struct but the last
+        //   field (the file name) is unsized. So an offset has to be used to
+        //   get the file name slice.
+        // - The OS has guaranteed initialization of the fields of
+        //   `FILE_ID_BOTH_DIR_INFO` and the trailing filename (for at least
+        //   `FileNameLength` bytes)
         let (name, is_directory, next_entry) = unsafe {
             let info = buffer.as_ptr().cast::<c::FILE_ID_BOTH_DIR_INFO>();
-            let next_entry = (*info).NextEntryOffset as usize;
-            let name = crate::slice::from_raw_parts(
-                (*info).FileName.as_ptr().cast::<u16>(),
-                (*info).FileNameLength as usize / size_of::<u16>(),
+            // While this is guaranteed to be aligned in documentation for
+            // https://docs.microsoft.com/en-us/windows/win32/api/winbase/ns-winbase-file_id_both_dir_info
+            // it does not seem that reality is so kind, and assuming this
+            // caused crashes in some cases (https://github.com/rust-lang/rust/issues/104530)
+            // presumably, this can be blamed on buggy filesystem drivers, but who knows.
+            let next_entry = ptr::addr_of!((*info).NextEntryOffset).read_unaligned() as usize;
+            let length = ptr::addr_of!((*info).FileNameLength).read_unaligned() as usize;
+            let attrs = ptr::addr_of!((*info).FileAttributes).read_unaligned();
+            let name = from_maybe_unaligned(
+                ptr::addr_of!((*info).FileName).cast::<u16>(),
+                length / size_of::<u16>(),
             );
-            let is_directory = ((*info).FileAttributes & c::FILE_ATTRIBUTE_DIRECTORY) != 0;
+            let is_directory = (attrs & c::FILE_ATTRIBUTE_DIRECTORY) != 0;
 
             (name, is_directory, next_entry)
         };
@@ -715,13 +760,21 @@ impl<'a> Iterator for DirBuffIter<'a> {
 
         // Skip `.` and `..` pseudo entries.
         const DOT: u16 = b'.' as u16;
-        match name {
+        match &name[..] {
             [DOT] | [DOT, DOT] => self.next(),
             _ => Some((name, is_directory)),
         }
     }
 }
 
+unsafe fn from_maybe_unaligned<'a>(p: *const u16, len: usize) -> Cow<'a, [u16]> {
+    if p.is_aligned() {
+        Cow::Borrowed(crate::slice::from_raw_parts(p, len))
+    } else {
+        Cow::Owned((0..len).map(|i| p.add(i).read_unaligned()).collect())
+    }
+}
+
 /// Open a link relative to the parent directory, ensure no symlinks are followed.
 fn open_link_no_reparse(parent: &File, name: &[u16], access: u32) -> io::Result<File> {
     // This is implemented using the lower level `NtCreateFile` function as
@@ -1077,13 +1130,13 @@ fn remove_dir_all_iterative(f: &File, delete: fn(&File) -> io::Result<()>) -> io
             if is_directory {
                 let child_dir = open_link_no_reparse(
                     &dir,
-                    name,
+                    &name,
                     c::SYNCHRONIZE | c::DELETE | c::FILE_LIST_DIRECTORY,
                 )?;
                 dirlist.push(child_dir);
             } else {
                 for i in 1..=MAX_RETRIES {
-                    let result = open_link_no_reparse(&dir, name, c::SYNCHRONIZE | c::DELETE);
+                    let result = open_link_no_reparse(&dir, &name, c::SYNCHRONIZE | c::DELETE);
                     match result {
                         Ok(f) => delete(&f)?,
                         // Already deleted, so skip.
@@ -1337,18 +1390,19 @@ fn symlink_junction_inner(original: &Path, junction: &Path) -> io::Result<()> {
     let h = f.as_inner().as_raw_handle();
 
     unsafe {
-        let mut data = [0u8; c::MAXIMUM_REPARSE_DATA_BUFFER_SIZE];
-        let db = data.as_mut_ptr() as *mut c::REPARSE_MOUNTPOINT_DATA_BUFFER;
-        let buf = &mut (*db).ReparseTarget as *mut c::WCHAR;
+        let mut data = Align8([MaybeUninit::<u8>::uninit(); c::MAXIMUM_REPARSE_DATA_BUFFER_SIZE]);
+        let data_ptr = data.0.as_mut_ptr();
+        let db = data_ptr.cast::<c::REPARSE_MOUNTPOINT_DATA_BUFFER>();
+        let buf = ptr::addr_of_mut!((*db).ReparseTarget).cast::<c::WCHAR>();
         let mut i = 0;
         // FIXME: this conversion is very hacky
         let v = br"\??\";
         let v = v.iter().map(|x| *x as u16);
         for c in v.chain(original.as_os_str().encode_wide()) {
-            *buf.offset(i) = c;
+            *buf.add(i) = c;
             i += 1;
         }
-        *buf.offset(i) = 0;
+        *buf.add(i) = 0;
         i += 1;
         (*db).ReparseTag = c::IO_REPARSE_TAG_MOUNT_POINT;
         (*db).ReparseTargetMaximumLength = (i * 2) as c::WORD;
@@ -1359,7 +1413,7 @@ fn symlink_junction_inner(original: &Path, junction: &Path) -> io::Result<()> {
         cvt(c::DeviceIoControl(
             h as *mut _,
             c::FSCTL_SET_REPARSE_POINT,
-            data.as_ptr() as *mut _,
+            data_ptr.cast(),
             (*db).ReparseDataLength + 8,
             ptr::null_mut(),
             0,
diff --git a/library/std/src/sys/windows/handle.rs b/library/std/src/sys/windows/handle.rs
index e24b09cc9..ae33d48c6 100644
--- a/library/std/src/sys/windows/handle.rs
+++ b/library/std/src/sys/windows/handle.rs
@@ -4,7 +4,7 @@
 mod tests;
 
 use crate::cmp;
-use crate::io::{self, ErrorKind, IoSlice, IoSliceMut, Read, ReadBuf};
+use crate::io::{self, BorrowedCursor, ErrorKind, IoSlice, IoSliceMut, Read};
 use crate::mem;
 use crate::os::windows::io::{
     AsHandle, AsRawHandle, BorrowedHandle, FromRawHandle, IntoRawHandle, OwnedHandle, RawHandle,
@@ -112,18 +112,16 @@ impl Handle {
         }
     }
 
-    pub fn read_buf(&self, buf: &mut ReadBuf<'_>) -> io::Result<()> {
-        let res = unsafe {
-            self.synchronous_read(buf.unfilled_mut().as_mut_ptr(), buf.remaining(), None)
-        };
+    pub fn read_buf(&self, mut cursor: BorrowedCursor<'_>) -> io::Result<()> {
+        let res =
+            unsafe { self.synchronous_read(cursor.as_mut().as_mut_ptr(), cursor.capacity(), None) };
 
         match res {
             Ok(read) => {
                 // Safety: `read` bytes were written to the initialized portion of the buffer
                 unsafe {
-                    buf.assume_init(read as usize);
+                    cursor.advance(read as usize);
                 }
-                buf.add_filled(read as usize);
                 Ok(())
             }
 
diff --git a/library/std/src/sys/windows/io.rs b/library/std/src/sys/windows/io.rs
index fb06df1f8..2cc34c986 100644
--- a/library/std/src/sys/windows/io.rs
+++ b/library/std/src/sys/windows/io.rs
@@ -1,6 +1,10 @@
 use crate::marker::PhantomData;
+use crate::mem::size_of;
+use crate::os::windows::io::{AsHandle, AsRawHandle, BorrowedHandle};
 use crate::slice;
-use crate::sys::c;
+use crate::sys::{c, Align8};
+use core;
+use libc;
 
 #[derive(Copy, Clone)]
 #[repr(transparent)]
@@ -78,3 +82,73 @@ impl<'a> IoSliceMut<'a> {
         unsafe { slice::from_raw_parts_mut(self.vec.buf as *mut u8, self.vec.len as usize) }
     }
 }
+
+pub fn is_terminal(h: &impl AsHandle) -> bool {
+    unsafe { handle_is_console(h.as_handle()) }
+}
+
+unsafe fn handle_is_console(handle: BorrowedHandle<'_>) -> bool {
+    let handle = handle.as_raw_handle();
+
+    // A null handle means the process has no console.
+    if handle.is_null() {
+        return false;
+    }
+
+    let mut out = 0;
+    if c::GetConsoleMode(handle, &mut out) != 0 {
+        // False positives aren't possible. If we got a console then we definitely have a console.
+        return true;
+    }
+
+    // At this point, we *could* have a false negative. We can determine that this is a true
+    // negative if we can detect the presence of a console on any of the standard I/O streams. If
+    // another stream has a console, then we know we're in a Windows console and can therefore
+    // trust the negative.
+    for std_handle in [c::STD_INPUT_HANDLE, c::STD_OUTPUT_HANDLE, c::STD_ERROR_HANDLE] {
+        let std_handle = c::GetStdHandle(std_handle);
+        if !std_handle.is_null()
+            && std_handle != handle
+            && c::GetConsoleMode(std_handle, &mut out) != 0
+        {
+            return false;
+        }
+    }
+
+    // Otherwise, we fall back to an msys hack to see if we can detect the presence of a pty.
+    msys_tty_on(handle)
+}
+
+unsafe fn msys_tty_on(handle: c::HANDLE) -> bool {
+    // Early return if the handle is not a pipe.
+    if c::GetFileType(handle) != c::FILE_TYPE_PIPE {
+        return false;
+    }
+
+    const SIZE: usize = size_of::<c::FILE_NAME_INFO>() + c::MAX_PATH * size_of::<c::WCHAR>();
+    let mut name_info_bytes = Align8([0u8; SIZE]);
+    let res = c::GetFileInformationByHandleEx(
+        handle,
+        c::FileNameInfo,
+        name_info_bytes.0.as_mut_ptr() as *mut libc::c_void,
+        SIZE as u32,
+    );
+    if res == 0 {
+        return false;
+    }
+    let name_info: &c::FILE_NAME_INFO = &*(name_info_bytes.0.as_ptr() as *const c::FILE_NAME_INFO);
+    let name_len = name_info.FileNameLength as usize / 2;
+    // Offset to get the `FileName` field.
+    let name_ptr = name_info_bytes.0.as_ptr().offset(size_of::<c::DWORD>() as isize).cast::<u16>();
+    let s = core::slice::from_raw_parts(name_ptr, name_len);
+    let name = String::from_utf16_lossy(s);
+    // Get the file name only.
+    let name = name.rsplit('\\').next().unwrap_or(&name);
+    // This checks whether 'pty' exists in the file name, which indicates that
+    // a pseudo-terminal is attached. To mitigate against false positives
+    // (e.g., an actual file name that contains 'pty'), we also require that
+    // the file name begins with either the strings 'msys-' or 'cygwin-'.)
+    let is_msys = name.starts_with("msys-") || name.starts_with("cygwin-");
+    let is_pty = name.contains("-pty");
+    is_msys && is_pty
+}
diff --git a/library/std/src/sys/windows/locks/mod.rs b/library/std/src/sys/windows/locks/mod.rs
index d412ff152..602a2d623 100644
--- a/library/std/src/sys/windows/locks/mod.rs
+++ b/library/std/src/sys/windows/locks/mod.rs
@@ -3,4 +3,4 @@ mod mutex;
 mod rwlock;
 pub use condvar::{Condvar, MovableCondvar};
 pub use mutex::{MovableMutex, Mutex};
-pub use rwlock::{MovableRwLock, RwLock};
+pub use rwlock::MovableRwLock;
diff --git a/library/std/src/sys/windows/locks/mutex.rs b/library/std/src/sys/windows/locks/mutex.rs
index f91e8f9f5..91207f5f4 100644
--- a/library/std/src/sys/windows/locks/mutex.rs
+++ b/library/std/src/sys/windows/locks/mutex.rs
@@ -37,8 +37,6 @@ impl Mutex {
     pub const fn new() -> Mutex {
         Mutex { srwlock: UnsafeCell::new(c::SRWLOCK_INIT) }
     }
-    #[inline]
-    pub unsafe fn init(&mut self) {}
 
     #[inline]
     pub unsafe fn lock(&self) {
diff --git a/library/std/src/sys/windows/mod.rs b/library/std/src/sys/windows/mod.rs
index b3f6d2d0a..eab9b9612 100644
--- a/library/std/src/sys/windows/mod.rs
+++ b/library/std/src/sys/windows/mod.rs
@@ -2,6 +2,7 @@
 
 use crate::ffi::{CStr, OsStr, OsString};
 use crate::io::ErrorKind;
+use crate::mem::MaybeUninit;
 use crate::os::windows::ffi::{OsStrExt, OsStringExt};
 use crate::path::PathBuf;
 use crate::time::Duration;
@@ -47,7 +48,7 @@ cfg_if::cfg_if! {
 
 // SAFETY: must be called only once during runtime initialization.
 // NOTE: this is not guaranteed to run, for example when Rust code is called externally.
-pub unsafe fn init(_argc: isize, _argv: *const *const u8) {
+pub unsafe fn init(_argc: isize, _argv: *const *const u8, _sigpipe: u8) {
     stack_overflow::init();
 
     // Normally, `thread::spawn` will call `Thread::set_name` but since this thread already
@@ -204,8 +205,8 @@ where
     // This initial size also works around `GetFullPathNameW` returning
     // incorrect size hints for some short paths:
     // https://github.com/dylni/normpath/issues/5
-    let mut stack_buf = [0u16; 512];
-    let mut heap_buf = Vec::new();
+    let mut stack_buf: [MaybeUninit<u16>; 512] = MaybeUninit::uninit_array();
+    let mut heap_buf: Vec<MaybeUninit<u16>> = Vec::new();
     unsafe {
         let mut n = stack_buf.len();
         loop {
@@ -214,6 +215,11 @@ where
             } else {
                 let extra = n - heap_buf.len();
                 heap_buf.reserve(extra);
+                // We used `reserve` and not `reserve_exact`, so in theory we
+                // may have gotten more than requested. If so, we'd like to use
+                // it... so long as we won't cause overflow.
+                n = heap_buf.capacity().min(c::DWORD::MAX as usize);
+                // Safety: MaybeUninit<u16> does not need initialization
                 heap_buf.set_len(n);
                 &mut heap_buf[..]
             };
@@ -228,13 +234,13 @@ where
             // error" is still 0 then we interpret it as a 0 length buffer and
             // not an actual error.
             c::SetLastError(0);
-            let k = match f1(buf.as_mut_ptr(), n as c::DWORD) {
+            let k = match f1(buf.as_mut_ptr().cast::<u16>(), n as c::DWORD) {
                 0 if c::GetLastError() == 0 => 0,
                 0 => return Err(crate::io::Error::last_os_error()),
                 n => n,
             } as usize;
             if k == n && c::GetLastError() == c::ERROR_INSUFFICIENT_BUFFER {
-                n *= 2;
+                n = n.saturating_mul(2).min(c::DWORD::MAX as usize);
             } else if k > n {
                 n = k;
             } else if k == n {
@@ -244,7 +250,9 @@ where
                 // Therefore k never equals n.
                 unreachable!();
             } else {
-                return Ok(f2(&buf[..k]));
+                // Safety: First `k` values are initialized.
+                let slice: &[u16] = MaybeUninit::slice_assume_init_ref(&buf[..k]);
+                return Ok(f2(slice));
             }
         }
     }
@@ -321,3 +329,11 @@ pub fn abort_internal() -> ! {
     }
     crate::intrinsics::abort();
 }
+
+/// Align the inner value to 8 bytes.
+///
+/// This is enough for almost all of the buffers we're likely to work with in
+/// the Windows APIs we use.
+#[repr(C, align(8))]
+#[derive(Copy, Clone)]
+pub(crate) struct Align8<T: ?Sized>(pub T);
diff --git a/library/std/src/sys/windows/os.rs b/library/std/src/sys/windows/os.rs
index bcac996c0..352337ba3 100644
--- a/library/std/src/sys/windows/os.rs
+++ b/library/std/src/sys/windows/os.rs
@@ -99,11 +99,11 @@ impl Iterator for Env {
                 }
                 let p = self.cur as *const u16;
                 let mut len = 0;
-                while *p.offset(len) != 0 {
+                while *p.add(len) != 0 {
                     len += 1;
                 }
-                let s = slice::from_raw_parts(p, len as usize);
-                self.cur = self.cur.offset(len + 1);
+                let s = slice::from_raw_parts(p, len);
+                self.cur = self.cur.add(len + 1);
 
                 // Windows allows environment variables to start with an equals
                 // symbol (in any other position, this is the separator between
diff --git a/library/std/src/sys/windows/os_str.rs b/library/std/src/sys/windows/os_str.rs
index 11883f150..4bdd8c505 100644
--- a/library/std/src/sys/windows/os_str.rs
+++ b/library/std/src/sys/windows/os_str.rs
@@ -164,9 +164,7 @@ impl Slice {
     }
 
     pub fn to_owned(&self) -> Buf {
-        let mut buf = Wtf8Buf::with_capacity(self.inner.len());
-        buf.push_wtf8(&self.inner);
-        Buf { inner: buf }
+        Buf { inner: self.inner.to_owned() }
     }
 
     pub fn clone_into(&self, buf: &mut Buf) {
diff --git a/library/std/src/sys/windows/path/tests.rs b/library/std/src/sys/windows/path/tests.rs
index 6eab38cab..623c62361 100644
--- a/library/std/src/sys/windows/path/tests.rs
+++ b/library/std/src/sys/windows/path/tests.rs
@@ -105,7 +105,7 @@ fn test_parse_prefix_verbatim_device() {
     assert_eq!(prefix, parse_prefix(r"\\?/C:\windows\system32\notepad.exe"));
 }
 
-// See #93586 for more infomation.
+// See #93586 for more information.
 #[test]
 fn test_windows_prefix_components() {
     use crate::path::Path;
diff --git a/library/std/src/sys/windows/process.rs b/library/std/src/sys/windows/process.rs
index 02d5af471..9cbb4ef19 100644
--- a/library/std/src/sys/windows/process.rs
+++ b/library/std/src/sys/windows/process.rs
@@ -16,6 +16,7 @@ use crate::os::windows::ffi::{OsStrExt, OsStringExt};
 use crate::os::windows::io::{AsHandle, AsRawHandle, BorrowedHandle, FromRawHandle, IntoRawHandle};
 use crate::path::{Path, PathBuf};
 use crate::ptr;
+use crate::sync::Mutex;
 use crate::sys::args::{self, Arg};
 use crate::sys::c;
 use crate::sys::c::NonZeroDWORD;
@@ -25,7 +26,6 @@ use crate::sys::handle::Handle;
 use crate::sys::path;
 use crate::sys::pipe::{self, AnonPipe};
 use crate::sys::stdio;
-use crate::sys_common::mutex::StaticMutex;
 use crate::sys_common::process::{CommandEnv, CommandEnvs};
 use crate::sys_common::IntoInner;
 
@@ -301,9 +301,9 @@ impl Command {
         //
         // For more information, msdn also has an article about this race:
         // https://support.microsoft.com/kb/315939
-        static CREATE_PROCESS_LOCK: StaticMutex = StaticMutex::new();
+        static CREATE_PROCESS_LOCK: Mutex<()> = Mutex::new(());
 
-        let _guard = unsafe { CREATE_PROCESS_LOCK.lock() };
+        let _guard = CREATE_PROCESS_LOCK.lock();
 
         let mut pipes = StdioPipes { stdin: None, stdout: None, stderr: None };
         let null = Stdio::Null;
diff --git a/library/std/src/sys/windows/rand.rs b/library/std/src/sys/windows/rand.rs
index f8fd93a73..b5a49489d 100644
--- a/library/std/src/sys/windows/rand.rs
+++ b/library/std/src/sys/windows/rand.rs
@@ -1,35 +1,106 @@
-use crate::io;
+//! # Random key generation
+//!
+//! This module wraps the RNG provided by the OS. There are a few different
+//! ways to interface with the OS RNG so it's worth exploring each of the options.
+//! Note that at the time of writing these all go through the (undocumented)
+//! `bcryptPrimitives.dll` but they use different route to get there.
+//!
+//! Originally we were using [`RtlGenRandom`], however that function is
+//! deprecated and warns it "may be altered or unavailable in subsequent versions".
+//!
+//! So we switched to [`BCryptGenRandom`] with the `BCRYPT_USE_SYSTEM_PREFERRED_RNG`
+//! flag to query and find the system configured RNG. However, this change caused a small
+//! but significant number of users to experience panics caused by a failure of
+//! this function. See [#94098].
+//!
+//! The current version falls back to using `BCryptOpenAlgorithmProvider` if
+//! `BCRYPT_USE_SYSTEM_PREFERRED_RNG` fails for any reason.
+//!
+//! [#94098]: https://github.com/rust-lang/rust/issues/94098
+//! [`RtlGenRandom`]: https://docs.microsoft.com/en-us/windows/win32/api/ntsecapi/nf-ntsecapi-rtlgenrandom
+//! [`BCryptGenRandom`]: https://docs.microsoft.com/en-us/windows/win32/api/bcrypt/nf-bcrypt-bcryptgenrandom
 use crate::mem;
 use crate::ptr;
 use crate::sys::c;
 
+/// Generates high quality secure random keys for use by [`HashMap`].
+///
+/// This is used to seed the default [`RandomState`].
+///
+/// [`HashMap`]: crate::collections::HashMap
+/// [`RandomState`]: crate::collections::hash_map::RandomState
 pub fn hashmap_random_keys() -> (u64, u64) {
-    let mut v = (0, 0);
-    let ret = unsafe {
-        c::BCryptGenRandom(
-            ptr::null_mut(),
-            &mut v as *mut _ as *mut u8,
-            mem::size_of_val(&v) as c::ULONG,
-            c::BCRYPT_USE_SYSTEM_PREFERRED_RNG,
-        )
-    };
-    if ret != 0 { fallback_rng() } else { v }
+    Rng::SYSTEM.gen_random_keys().unwrap_or_else(fallback_rng)
 }
 
-/// Generate random numbers using the fallback RNG function (RtlGenRandom)
-#[cfg(not(target_vendor = "uwp"))]
-#[inline(never)]
-fn fallback_rng() -> (u64, u64) {
-    let mut v = (0, 0);
-    let ret =
-        unsafe { c::RtlGenRandom(&mut v as *mut _ as *mut u8, mem::size_of_val(&v) as c::ULONG) };
+struct Rng {
+    algorithm: c::BCRYPT_ALG_HANDLE,
+    flags: u32,
+}
+impl Rng {
+    const SYSTEM: Self = unsafe { Self::new(ptr::null_mut(), c::BCRYPT_USE_SYSTEM_PREFERRED_RNG) };
+
+    /// Create the RNG from an existing algorithm handle.
+    ///
+    /// # Safety
+    ///
+    /// The handle must either be null or a valid algorithm handle.
+    const unsafe fn new(algorithm: c::BCRYPT_ALG_HANDLE, flags: u32) -> Self {
+        Self { algorithm, flags }
+    }
+
+    /// Open a handle to the RNG algorithm.
+    fn open() -> Result<Self, c::NTSTATUS> {
+        use crate::sync::atomic::AtomicPtr;
+        use crate::sync::atomic::Ordering::{Acquire, Release};
+
+        // An atomic is used so we don't need to reopen the handle every time.
+        static HANDLE: AtomicPtr<crate::ffi::c_void> = AtomicPtr::new(ptr::null_mut());
+
+        let mut handle = HANDLE.load(Acquire);
+        if handle.is_null() {
+            let status = unsafe {
+                c::BCryptOpenAlgorithmProvider(
+                    &mut handle,
+                    c::BCRYPT_RNG_ALGORITHM.as_ptr(),
+                    ptr::null(),
+                    0,
+                )
+            };
+            if c::nt_success(status) {
+                // If another thread opens a handle first then use that handle instead.
+                let result = HANDLE.compare_exchange(ptr::null_mut(), handle, Release, Acquire);
+                if let Err(previous_handle) = result {
+                    // Close our handle and return the previous one.
+                    unsafe { c::BCryptCloseAlgorithmProvider(handle, 0) };
+                    handle = previous_handle;
+                }
+                Ok(unsafe { Self::new(handle, 0) })
+            } else {
+                Err(status)
+            }
+        } else {
+            Ok(unsafe { Self::new(handle, 0) })
+        }
+    }
 
-    if ret != 0 { v } else { panic!("fallback RNG broken: {}", io::Error::last_os_error()) }
+    fn gen_random_keys(self) -> Result<(u64, u64), c::NTSTATUS> {
+        let mut v = (0, 0);
+        let status = unsafe {
+            let size = mem::size_of_val(&v).try_into().unwrap();
+            c::BCryptGenRandom(self.algorithm, ptr::addr_of_mut!(v).cast(), size, self.flags)
+        };
+        if c::nt_success(status) { Ok(v) } else { Err(status) }
+    }
 }
 
-/// We can't use RtlGenRandom with UWP, so there is no fallback
-#[cfg(target_vendor = "uwp")]
+/// Generate random numbers using the fallback RNG function
 #[inline(never)]
-fn fallback_rng() -> (u64, u64) {
-    panic!("fallback RNG broken: RtlGenRandom() not supported on UWP");
+fn fallback_rng(rng_status: c::NTSTATUS) -> (u64, u64) {
+    match Rng::open().and_then(|rng| rng.gen_random_keys()) {
+        Ok(keys) => keys,
+        Err(status) => {
+            panic!("RNG broken: {rng_status:#x}, fallback RNG broken: {status:#x}")
+        }
+    }
 }
diff --git a/library/std/src/sys/windows/stdio.rs b/library/std/src/sys/windows/stdio.rs
index a001d6b98..70c9b14a0 100644
--- a/library/std/src/sys/windows/stdio.rs
+++ b/library/std/src/sys/windows/stdio.rs
@@ -3,6 +3,7 @@
 use crate::char::decode_utf16;
 use crate::cmp;
 use crate::io;
+use crate::mem::MaybeUninit;
 use crate::os::windows::io::{FromRawHandle, IntoRawHandle};
 use crate::ptr;
 use crate::str;
@@ -169,13 +170,14 @@ fn write(
 }
 
 fn write_valid_utf8_to_console(handle: c::HANDLE, utf8: &str) -> io::Result<usize> {
-    let mut utf16 = [0u16; MAX_BUFFER_SIZE / 2];
+    let mut utf16 = [MaybeUninit::<u16>::uninit(); MAX_BUFFER_SIZE / 2];
     let mut len_utf16 = 0;
     for (chr, dest) in utf8.encode_utf16().zip(utf16.iter_mut()) {
-        *dest = chr;
+        *dest = MaybeUninit::new(chr);
         len_utf16 += 1;
     }
-    let utf16 = &utf16[..len_utf16];
+    // Safety: We've initialized `len_utf16` values.
+    let utf16: &[u16] = unsafe { MaybeUninit::slice_assume_init_ref(&utf16[..len_utf16]) };
 
     let mut written = write_u16s(handle, &utf16)?;
 
@@ -250,11 +252,14 @@ impl io::Read for Stdin {
             return Ok(bytes_copied);
         } else if buf.len() - bytes_copied < 4 {
             // Not enough space to get a UTF-8 byte. We will use the incomplete UTF8.
-            let mut utf16_buf = [0u16; 1];
+            let mut utf16_buf = [MaybeUninit::new(0); 1];
             // Read one u16 character.
             let read = read_u16s_fixup_surrogates(handle, &mut utf16_buf, 1, &mut self.surrogate)?;
             // Read bytes, using the (now-empty) self.incomplete_utf8 as extra space.
-            let read_bytes = utf16_to_utf8(&utf16_buf[..read], &mut self.incomplete_utf8.bytes)?;
+            let read_bytes = utf16_to_utf8(
+                unsafe { MaybeUninit::slice_assume_init_ref(&utf16_buf[..read]) },
+                &mut self.incomplete_utf8.bytes,
+            )?;
 
             // Read in the bytes from incomplete_utf8 until the buffer is full.
             self.incomplete_utf8.len = read_bytes as u8;
@@ -262,15 +267,18 @@ impl io::Read for Stdin {
             bytes_copied += self.incomplete_utf8.read(&mut buf[bytes_copied..]);
             Ok(bytes_copied)
         } else {
-            let mut utf16_buf = [0u16; MAX_BUFFER_SIZE / 2];
+            let mut utf16_buf = [MaybeUninit::<u16>::uninit(); MAX_BUFFER_SIZE / 2];
+
             // In the worst case, a UTF-8 string can take 3 bytes for every `u16` of a UTF-16. So
             // we can read at most a third of `buf.len()` chars and uphold the guarantee no data gets
             // lost.
             let amount = cmp::min(buf.len() / 3, utf16_buf.len());
             let read =
                 read_u16s_fixup_surrogates(handle, &mut utf16_buf, amount, &mut self.surrogate)?;
-
-            match utf16_to_utf8(&utf16_buf[..read], buf) {
+            // Safety `read_u16s_fixup_surrogates` returns the number of items
+            // initialized.
+            let utf16s = unsafe { MaybeUninit::slice_assume_init_ref(&utf16_buf[..read]) };
+            match utf16_to_utf8(utf16s, buf) {
                 Ok(value) => return Ok(bytes_copied + value),
                 Err(e) => return Err(e),
             }
@@ -283,14 +291,14 @@ impl io::Read for Stdin {
 // This is a best effort, and might not work if we are not the only reader on Stdin.
 fn read_u16s_fixup_surrogates(
     handle: c::HANDLE,
-    buf: &mut [u16],
+    buf: &mut [MaybeUninit<u16>],
     mut amount: usize,
     surrogate: &mut u16,
 ) -> io::Result<usize> {
     // Insert possibly remaining unpaired surrogate from last read.
     let mut start = 0;
     if *surrogate != 0 {
-        buf[0] = *surrogate;
+        buf[0] = MaybeUninit::new(*surrogate);
         *surrogate = 0;
         start = 1;
         if amount == 1 {
@@ -303,7 +311,10 @@ fn read_u16s_fixup_surrogates(
     let mut amount = read_u16s(handle, &mut buf[start..amount])? + start;
 
     if amount > 0 {
-        let last_char = buf[amount - 1];
+        // Safety: The returned `amount` is the number of values initialized,
+        // and it is not 0, so we know that `buf[amount - 1]` have been
+        // initialized.
+        let last_char = unsafe { buf[amount - 1].assume_init() };
         if last_char >= 0xD800 && last_char <= 0xDBFF {
             // high surrogate
             *surrogate = last_char;
@@ -313,7 +324,8 @@ fn read_u16s_fixup_surrogates(
     Ok(amount)
 }
 
-fn read_u16s(handle: c::HANDLE, buf: &mut [u16]) -> io::Result<usize> {
+// Returns `Ok(n)` if it initialized `n` values in `buf`.
+fn read_u16s(handle: c::HANDLE, buf: &mut [MaybeUninit<u16>]) -> io::Result<usize> {
     // Configure the `pInputControl` parameter to not only return on `\r\n` but also Ctrl-Z, the
     // traditional DOS method to indicate end of character stream / user input (SUB).
     // See #38274 and https://stackoverflow.com/questions/43836040/win-api-readconsole.
@@ -346,8 +358,9 @@ fn read_u16s(handle: c::HANDLE, buf: &mut [u16]) -> io::Result<usize> {
         }
         break;
     }
-
-    if amount > 0 && buf[amount as usize - 1] == CTRL_Z {
+    // Safety: if `amount > 0`, then that many bytes were written, so
+    // `buf[amount as usize - 1]` has been initialized.
+    if amount > 0 && unsafe { buf[amount as usize - 1].assume_init() } == CTRL_Z {
         amount -= 1;
     }
     Ok(amount as usize)
diff --git a/library/std/src/sys/windows/thread_local_dtor.rs b/library/std/src/sys/windows/thread_local_dtor.rs
index 25d1c6e8e..9707a95df 100644
--- a/library/std/src/sys/windows/thread_local_dtor.rs
+++ b/library/std/src/sys/windows/thread_local_dtor.rs
@@ -8,10 +8,14 @@
 #[thread_local]
 static mut DESTRUCTORS: Vec<(*mut u8, unsafe extern "C" fn(*mut u8))> = Vec::new();
 
+// Ensure this can never be inlined because otherwise this may break in dylibs.
+// See #44391.
+#[inline(never)]
 pub unsafe fn register_dtor(t: *mut u8, dtor: unsafe extern "C" fn(*mut u8)) {
     DESTRUCTORS.push((t, dtor));
 }
 
+#[inline(never)] // See comment above
 /// Runs destructors. This should not be called until thread exit.
 pub unsafe fn run_keyless_dtors() {
     // Drop all the destructors.
diff --git a/library/std/src/sys/windows/thread_local_key.rs b/library/std/src/sys/windows/thread_local_key.rs
index ec670238e..17628b757 100644
--- a/library/std/src/sys/windows/thread_local_key.rs
+++ b/library/std/src/sys/windows/thread_local_key.rs
@@ -1,11 +1,16 @@
-use crate::mem::ManuallyDrop;
+use crate::cell::UnsafeCell;
 use crate::ptr;
-use crate::sync::atomic::AtomicPtr;
-use crate::sync::atomic::Ordering::SeqCst;
+use crate::sync::atomic::{
+    AtomicPtr, AtomicU32,
+    Ordering::{AcqRel, Acquire, Relaxed, Release},
+};
 use crate::sys::c;
 
-pub type Key = c::DWORD;
-pub type Dtor = unsafe extern "C" fn(*mut u8);
+#[cfg(test)]
+mod tests;
+
+type Key = c::DWORD;
+type Dtor = unsafe extern "C" fn(*mut u8);
 
 // Turns out, like pretty much everything, Windows is pretty close the
 // functionality that Unix provides, but slightly different! In the case of
@@ -22,60 +27,109 @@ pub type Dtor = unsafe extern "C" fn(*mut u8);
 // To accomplish this feat, we perform a number of threads, all contained
 // within this module:
 //
-// * All TLS destructors are tracked by *us*, not the windows runtime. This
+// * All TLS destructors are tracked by *us*, not the Windows runtime. This
 //   means that we have a global list of destructors for each TLS key that
 //   we know about.
 // * When a thread exits, we run over the entire list and run dtors for all
 //   non-null keys. This attempts to match Unix semantics in this regard.
 //
-// This ends up having the overhead of using a global list, having some
-// locks here and there, and in general just adding some more code bloat. We
-// attempt to optimize runtime by forgetting keys that don't have
-// destructors, but this only gets us so far.
-//
 // For more details and nitty-gritty, see the code sections below!
 //
 // [1]: https://www.codeproject.com/Articles/8113/Thread-Local-Storage-The-C-Way
-// [2]: https://github.com/ChromiumWebApps/chromium/blob/master/base
-//                        /threading/thread_local_storage_win.cc#L42
+// [2]: https://github.com/ChromiumWebApps/chromium/blob/master/base/threading/thread_local_storage_win.cc#L42
 
-// -------------------------------------------------------------------------
-// Native bindings
-//
-// This section is just raw bindings to the native functions that Windows
-// provides, There's a few extra calls to deal with destructors.
+pub struct StaticKey {
+    /// The key value shifted up by one. Since TLS_OUT_OF_INDEXES == DWORD::MAX
+    /// is not a valid key value, this allows us to use zero as sentinel value
+    /// without risking overflow.
+    key: AtomicU32,
+    dtor: Option<Dtor>,
+    next: AtomicPtr<StaticKey>,
+    /// Currently, destructors cannot be unregistered, so we cannot use racy
+    /// initialization for keys. Instead, we need synchronize initialization.
+    /// Use the Windows-provided `Once` since it does not require TLS.
+    once: UnsafeCell<c::INIT_ONCE>,
+}
 
-#[inline]
-pub unsafe fn create(dtor: Option<Dtor>) -> Key {
-    let key = c::TlsAlloc();
-    assert!(key != c::TLS_OUT_OF_INDEXES);
-    if let Some(f) = dtor {
-        register_dtor(key, f);
+impl StaticKey {
+    #[inline]
+    pub const fn new(dtor: Option<Dtor>) -> StaticKey {
+        StaticKey {
+            key: AtomicU32::new(0),
+            dtor,
+            next: AtomicPtr::new(ptr::null_mut()),
+            once: UnsafeCell::new(c::INIT_ONCE_STATIC_INIT),
+        }
     }
-    key
-}
 
-#[inline]
-pub unsafe fn set(key: Key, value: *mut u8) {
-    let r = c::TlsSetValue(key, value as c::LPVOID);
-    debug_assert!(r != 0);
-}
+    #[inline]
+    pub unsafe fn set(&'static self, val: *mut u8) {
+        let r = c::TlsSetValue(self.key(), val.cast());
+        debug_assert_eq!(r, c::TRUE);
+    }
 
-#[inline]
-pub unsafe fn get(key: Key) -> *mut u8 {
-    c::TlsGetValue(key) as *mut u8
-}
+    #[inline]
+    pub unsafe fn get(&'static self) -> *mut u8 {
+        c::TlsGetValue(self.key()).cast()
+    }
 
-#[inline]
-pub unsafe fn destroy(_key: Key) {
-    rtabort!("can't destroy tls keys on windows")
-}
+    #[inline]
+    unsafe fn key(&'static self) -> Key {
+        match self.key.load(Acquire) {
+            0 => self.init(),
+            key => key - 1,
+        }
+    }
+
+    #[cold]
+    unsafe fn init(&'static self) -> Key {
+        if self.dtor.is_some() {
+            let mut pending = c::FALSE;
+            let r = c::InitOnceBeginInitialize(self.once.get(), 0, &mut pending, ptr::null_mut());
+            assert_eq!(r, c::TRUE);
 
-#[inline]
-pub fn requires_synchronized_create() -> bool {
-    true
+            if pending == c::FALSE {
+                // Some other thread initialized the key, load it.
+                self.key.load(Relaxed) - 1
+            } else {
+                let key = c::TlsAlloc();
+                if key == c::TLS_OUT_OF_INDEXES {
+                    // Wakeup the waiting threads before panicking to avoid deadlock.
+                    c::InitOnceComplete(self.once.get(), c::INIT_ONCE_INIT_FAILED, ptr::null_mut());
+                    panic!("out of TLS indexes");
+                }
+
+                self.key.store(key + 1, Release);
+                register_dtor(self);
+
+                let r = c::InitOnceComplete(self.once.get(), 0, ptr::null_mut());
+                debug_assert_eq!(r, c::TRUE);
+
+                key
+            }
+        } else {
+            // If there is no destructor to clean up, we can use racy initialization.
+
+            let key = c::TlsAlloc();
+            assert_ne!(key, c::TLS_OUT_OF_INDEXES, "out of TLS indexes");
+
+            match self.key.compare_exchange(0, key + 1, AcqRel, Acquire) {
+                Ok(_) => key,
+                Err(new) => {
+                    // Some other thread completed initialization first, so destroy
+                    // our key and use theirs.
+                    let r = c::TlsFree(key);
+                    debug_assert_eq!(r, c::TRUE);
+                    new - 1
+                }
+            }
+        }
+    }
 }
 
+unsafe impl Send for StaticKey {}
+unsafe impl Sync for StaticKey {}
+
 // -------------------------------------------------------------------------
 // Dtor registration
 //
@@ -96,29 +150,21 @@ pub fn requires_synchronized_create() -> bool {
 // Typically processes have a statically known set of TLS keys which is pretty
 // small, and we'd want to keep this memory alive for the whole process anyway
 // really.
-//
-// Perhaps one day we can fold the `Box` here into a static allocation,
-// expanding the `StaticKey` structure to contain not only a slot for the TLS
-// key but also a slot for the destructor queue on windows. An optimization for
-// another day!
-
-static DTORS: AtomicPtr<Node> = AtomicPtr::new(ptr::null_mut());
-
-struct Node {
-    dtor: Dtor,
-    key: Key,
-    next: *mut Node,
-}
 
-unsafe fn register_dtor(key: Key, dtor: Dtor) {
-    let mut node = ManuallyDrop::new(Box::new(Node { key, dtor, next: ptr::null_mut() }));
+static DTORS: AtomicPtr<StaticKey> = AtomicPtr::new(ptr::null_mut());
 
-    let mut head = DTORS.load(SeqCst);
+/// Should only be called once per key, otherwise loops or breaks may occur in
+/// the linked list.
+unsafe fn register_dtor(key: &'static StaticKey) {
+    let this = <*const StaticKey>::cast_mut(key);
+    // Use acquire ordering to pass along the changes done by the previously
+    // registered keys when we store the new head with release ordering.
+    let mut head = DTORS.load(Acquire);
     loop {
-        node.next = head;
-        match DTORS.compare_exchange(head, &mut **node, SeqCst, SeqCst) {
-            Ok(_) => return, // nothing to drop, we successfully added the node to the list
-            Err(cur) => head = cur,
+        key.next.store(head, Relaxed);
+        match DTORS.compare_exchange_weak(head, this, Release, Acquire) {
+            Ok(_) => break,
+            Err(new) => head = new,
         }
     }
 }
@@ -214,25 +260,29 @@ unsafe extern "system" fn on_tls_callback(h: c::LPVOID, dwReason: c::DWORD, pv:
     unsafe fn reference_tls_used() {}
 }
 
-#[allow(dead_code)] // actually called above
+#[allow(dead_code)] // actually called below
 unsafe fn run_dtors() {
-    let mut any_run = true;
     for _ in 0..5 {
-        if !any_run {
-            break;
-        }
-        any_run = false;
-        let mut cur = DTORS.load(SeqCst);
+        let mut any_run = false;
+
+        // Use acquire ordering to observe key initialization.
+        let mut cur = DTORS.load(Acquire);
         while !cur.is_null() {
-            let ptr = c::TlsGetValue((*cur).key);
+            let key = (*cur).key.load(Relaxed) - 1;
+            let dtor = (*cur).dtor.unwrap();
 
+            let ptr = c::TlsGetValue(key);
             if !ptr.is_null() {
-                c::TlsSetValue((*cur).key, ptr::null_mut());
-                ((*cur).dtor)(ptr as *mut _);
+                c::TlsSetValue(key, ptr::null_mut());
+                dtor(ptr as *mut _);
                 any_run = true;
             }
 
-            cur = (*cur).next;
+            cur = (*cur).next.load(Relaxed);
+        }
+
+        if !any_run {
+            break;
         }
     }
 }
diff --git a/library/std/src/sys/windows/thread_local_key/tests.rs b/library/std/src/sys/windows/thread_local_key/tests.rs
new file mode 100644
index 000000000..c95f383fb
--- /dev/null
+++ b/library/std/src/sys/windows/thread_local_key/tests.rs
@@ -0,0 +1,53 @@
+use super::StaticKey;
+use crate::ptr;
+
+#[test]
+fn smoke() {
+    static K1: StaticKey = StaticKey::new(None);
+    static K2: StaticKey = StaticKey::new(None);
+
+    unsafe {
+        assert!(K1.get().is_null());
+        assert!(K2.get().is_null());
+        K1.set(ptr::invalid_mut(1));
+        K2.set(ptr::invalid_mut(2));
+        assert_eq!(K1.get() as usize, 1);
+        assert_eq!(K2.get() as usize, 2);
+    }
+}
+
+#[test]
+fn destructors() {
+    use crate::mem::ManuallyDrop;
+    use crate::sync::Arc;
+    use crate::thread;
+
+    unsafe extern "C" fn destruct(ptr: *mut u8) {
+        drop(Arc::from_raw(ptr as *const ()));
+    }
+
+    static KEY: StaticKey = StaticKey::new(Some(destruct));
+
+    let shared1 = Arc::new(());
+    let shared2 = Arc::clone(&shared1);
+
+    unsafe {
+        assert!(KEY.get().is_null());
+        KEY.set(Arc::into_raw(shared1) as *mut u8);
+    }
+
+    thread::spawn(move || unsafe {
+        assert!(KEY.get().is_null());
+        KEY.set(Arc::into_raw(shared2) as *mut u8);
+    })
+    .join()
+    .unwrap();
+
+    // Leak the Arc, let the TLS destructor clean it up.
+    let shared1 = unsafe { ManuallyDrop::new(Arc::from_raw(KEY.get() as *const ())) };
+    assert_eq!(
+        Arc::strong_count(&shared1),
+        1,
+        "destructor should have dropped the other reference on thread exit"
+    );
+}
diff --git a/library/std/src/sys/windows/thread_parker.rs b/library/std/src/sys/windows/thread_parker.rs
index d876e0f6f..2f7ae863b 100644
--- a/library/std/src/sys/windows/thread_parker.rs
+++ b/library/std/src/sys/windows/thread_parker.rs
@@ -197,19 +197,17 @@ impl Parker {
         // purpose, to make sure every unpark() has a release-acquire ordering
         // with park().
         if self.state.swap(NOTIFIED, Release) == PARKED {
-            if let Some(wake_by_address_single) = c::WakeByAddressSingle::option() {
-                unsafe {
+            unsafe {
+                if let Some(wake_by_address_single) = c::WakeByAddressSingle::option() {
                     wake_by_address_single(self.ptr());
-                }
-            } else {
-                // If we run NtReleaseKeyedEvent before the waiting thread runs
-                // NtWaitForKeyedEvent, this (shortly) blocks until we can wake it up.
-                // If the waiting thread wakes up before we run NtReleaseKeyedEvent
-                // (e.g. due to a timeout), this blocks until we do wake up a thread.
-                // To prevent this thread from blocking indefinitely in that case,
-                // park_impl() will, after seeing the state set to NOTIFIED after
-                // waking up, call NtWaitForKeyedEvent again to unblock us.
-                unsafe {
+                } else {
+                    // If we run NtReleaseKeyedEvent before the waiting thread runs
+                    // NtWaitForKeyedEvent, this (shortly) blocks until we can wake it up.
+                    // If the waiting thread wakes up before we run NtReleaseKeyedEvent
+                    // (e.g. due to a timeout), this blocks until we do wake up a thread.
+                    // To prevent this thread from blocking indefinitely in that case,
+                    // park_impl() will, after seeing the state set to NOTIFIED after
+                    // waking up, call NtWaitForKeyedEvent again to unblock us.
                     c::NtReleaseKeyedEvent(keyed_event_handle(), self.ptr(), 0, ptr::null_mut());
                 }
             }