use crate::cell::UnsafeCell; use crate::ptr; use crate::sync::atomic::{ AtomicPtr, AtomicU32, Ordering::{AcqRel, Acquire, Relaxed, Release}, }; use crate::sys::c; #[cfg(test)] mod tests; type Key = c::DWORD; type Dtor = unsafe extern "C" fn(*mut u8); // Turns out, like pretty much everything, Windows is pretty close the // functionality that Unix provides, but slightly different! In the case of // TLS, Windows does not provide an API to provide a destructor for a TLS // variable. This ends up being pretty crucial to this implementation, so we // need a way around this. // // The solution here ended up being a little obscure, but fear not, the // internet has informed me [1][2] that this solution is not unique (no way // I could have thought of it as well!). The key idea is to insert some hook // somewhere to run arbitrary code on thread termination. With this in place // we'll be able to run anything we like, including all TLS destructors! // // To accomplish this feat, we perform a number of threads, all contained // within this module: // // * All TLS destructors are tracked by *us*, not the Windows runtime. This // means that we have a global list of destructors for each TLS key that // we know about. // * When a thread exits, we run over the entire list and run dtors for all // non-null keys. This attempts to match Unix semantics in this regard. // // For more details and nitty-gritty, see the code sections below! // // [1]: https://www.codeproject.com/Articles/8113/Thread-Local-Storage-The-C-Way // [2]: https://github.com/ChromiumWebApps/chromium/blob/master/base/threading/thread_local_storage_win.cc#L42 pub struct StaticKey { /// The key value shifted up by one. Since TLS_OUT_OF_INDEXES == DWORD::MAX /// is not a valid key value, this allows us to use zero as sentinel value /// without risking overflow. key: AtomicU32, dtor: Option, next: AtomicPtr, /// Currently, destructors cannot be unregistered, so we cannot use racy /// initialization for keys. Instead, we need synchronize initialization. /// Use the Windows-provided `Once` since it does not require TLS. once: UnsafeCell, } impl StaticKey { #[inline] pub const fn new(dtor: Option) -> StaticKey { StaticKey { key: AtomicU32::new(0), dtor, next: AtomicPtr::new(ptr::null_mut()), once: UnsafeCell::new(c::INIT_ONCE_STATIC_INIT), } } #[inline] pub unsafe fn set(&'static self, val: *mut u8) { let r = c::TlsSetValue(self.key(), val.cast()); debug_assert_eq!(r, c::TRUE); } #[inline] pub unsafe fn get(&'static self) -> *mut u8 { c::TlsGetValue(self.key()).cast() } #[inline] unsafe fn key(&'static self) -> Key { match self.key.load(Acquire) { 0 => self.init(), key => key - 1, } } #[cold] unsafe fn init(&'static self) -> Key { if self.dtor.is_some() { let mut pending = c::FALSE; let r = c::InitOnceBeginInitialize(self.once.get(), 0, &mut pending, ptr::null_mut()); assert_eq!(r, c::TRUE); if pending == c::FALSE { // Some other thread initialized the key, load it. self.key.load(Relaxed) - 1 } else { let key = c::TlsAlloc(); if key == c::TLS_OUT_OF_INDEXES { // Wakeup the waiting threads before panicking to avoid deadlock. c::InitOnceComplete(self.once.get(), c::INIT_ONCE_INIT_FAILED, ptr::null_mut()); panic!("out of TLS indexes"); } self.key.store(key + 1, Release); register_dtor(self); let r = c::InitOnceComplete(self.once.get(), 0, ptr::null_mut()); debug_assert_eq!(r, c::TRUE); key } } else { // If there is no destructor to clean up, we can use racy initialization. let key = c::TlsAlloc(); assert_ne!(key, c::TLS_OUT_OF_INDEXES, "out of TLS indexes"); match self.key.compare_exchange(0, key + 1, AcqRel, Acquire) { Ok(_) => key, Err(new) => { // Some other thread completed initialization first, so destroy // our key and use theirs. let r = c::TlsFree(key); debug_assert_eq!(r, c::TRUE); new - 1 } } } } } unsafe impl Send for StaticKey {} unsafe impl Sync for StaticKey {} // ------------------------------------------------------------------------- // Dtor registration // // Windows has no native support for running destructors so we manage our own // list of destructors to keep track of how to destroy keys. We then install a // callback later to get invoked whenever a thread exits, running all // appropriate destructors. // // Currently unregistration from this list is not supported. A destructor can be // registered but cannot be unregistered. There's various simplifying reasons // for doing this, the big ones being: // // 1. Currently we don't even support deallocating TLS keys, so normal operation // doesn't need to deallocate a destructor. // 2. There is no point in time where we know we can unregister a destructor // because it could always be getting run by some remote thread. // // Typically processes have a statically known set of TLS keys which is pretty // small, and we'd want to keep this memory alive for the whole process anyway // really. static DTORS: AtomicPtr = AtomicPtr::new(ptr::null_mut()); /// Should only be called once per key, otherwise loops or breaks may occur in /// the linked list. unsafe fn register_dtor(key: &'static StaticKey) { let this = <*const StaticKey>::cast_mut(key); // Use acquire ordering to pass along the changes done by the previously // registered keys when we store the new head with release ordering. let mut head = DTORS.load(Acquire); loop { key.next.store(head, Relaxed); match DTORS.compare_exchange_weak(head, this, Release, Acquire) { Ok(_) => break, Err(new) => head = new, } } } // ------------------------------------------------------------------------- // Where the Magic (TM) Happens // // If you're looking at this code, and wondering "what is this doing?", // you're not alone! I'll try to break this down step by step: // // # What's up with CRT$XLB? // // For anything about TLS destructors to work on Windows, we have to be able // to run *something* when a thread exits. To do so, we place a very special // static in a very special location. If this is encoded in just the right // way, the kernel's loader is apparently nice enough to run some function // of ours whenever a thread exits! How nice of the kernel! // // Lots of detailed information can be found in source [1] above, but the // gist of it is that this is leveraging a feature of Microsoft's PE format // (executable format) which is not actually used by any compilers today. // This apparently translates to any callbacks in the ".CRT$XLB" section // being run on certain events. // // So after all that, we use the compiler's #[link_section] feature to place // a callback pointer into the magic section so it ends up being called. // // # What's up with this callback? // // The callback specified receives a number of parameters from... someone! // (the kernel? the runtime? I'm not quite sure!) There are a few events that // this gets invoked for, but we're currently only interested on when a // thread or a process "detaches" (exits). The process part happens for the // last thread and the thread part happens for any normal thread. // // # Ok, what's up with running all these destructors? // // This will likely need to be improved over time, but this function // attempts a "poor man's" destructor callback system. Once we've got a list // of what to run, we iterate over all keys, check their values, and then run // destructors if the values turn out to be non null (setting them to null just // beforehand). We do this a few times in a loop to basically match Unix // semantics. If we don't reach a fixed point after a short while then we just // inevitably leak something most likely. // // # The article mentions weird stuff about "/INCLUDE"? // // It sure does! Specifically we're talking about this quote: // // The Microsoft run-time library facilitates this process by defining a // memory image of the TLS Directory and giving it the special name // “__tls_used” (Intel x86 platforms) or “_tls_used” (other platforms). The // linker looks for this memory image and uses the data there to create the // TLS Directory. Other compilers that support TLS and work with the // Microsoft linker must use this same technique. // // Basically what this means is that if we want support for our TLS // destructors/our hook being called then we need to make sure the linker does // not omit this symbol. Otherwise it will omit it and our callback won't be // wired up. // // We don't actually use the `/INCLUDE` linker flag here like the article // mentions because the Rust compiler doesn't propagate linker flags, but // instead we use a shim function which performs a volatile 1-byte load from // the address of the symbol to ensure it sticks around. #[link_section = ".CRT$XLB"] #[allow(dead_code, unused_variables)] #[used] // we don't want LLVM eliminating this symbol for any reason, and // when the symbol makes it to the linker the linker will take over pub static p_thread_callback: unsafe extern "system" fn(c::LPVOID, c::DWORD, c::LPVOID) = on_tls_callback; #[allow(dead_code, unused_variables)] unsafe extern "system" fn on_tls_callback(h: c::LPVOID, dwReason: c::DWORD, pv: c::LPVOID) { if dwReason == c::DLL_THREAD_DETACH || dwReason == c::DLL_PROCESS_DETACH { run_dtors(); #[cfg(target_thread_local)] super::thread_local_dtor::run_keyless_dtors(); } // See comments above for what this is doing. Note that we don't need this // trickery on GNU windows, just on MSVC. reference_tls_used(); #[cfg(target_env = "msvc")] unsafe fn reference_tls_used() { extern "C" { static _tls_used: u8; } crate::intrinsics::volatile_load(&_tls_used); } #[cfg(not(target_env = "msvc"))] unsafe fn reference_tls_used() {} } #[allow(dead_code)] // actually called below unsafe fn run_dtors() { for _ in 0..5 { let mut any_run = false; // Use acquire ordering to observe key initialization. let mut cur = DTORS.load(Acquire); while !cur.is_null() { let key = (*cur).key.load(Relaxed) - 1; let dtor = (*cur).dtor.unwrap(); let ptr = c::TlsGetValue(key); if !ptr.is_null() { c::TlsSetValue(key, ptr::null_mut()); dtor(ptr as *mut _); any_run = true; } cur = (*cur).next.load(Relaxed); } if !any_run { break; } } }