#ifndef JEMALLOC_INTERNAL_TSD_H
#define JEMALLOC_INTERNAL_TSD_H

#include "jemalloc/internal/activity_callback.h"
#include "jemalloc/internal/arena_types.h"
#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/bin_types.h"
#include "jemalloc/internal/jemalloc_internal_externs.h"
#include "jemalloc/internal/peak.h"
#include "jemalloc/internal/prof_types.h"
#include "jemalloc/internal/ql.h"
#include "jemalloc/internal/rtree_tsd.h"
#include "jemalloc/internal/tcache_types.h"
#include "jemalloc/internal/tcache_structs.h"
#include "jemalloc/internal/util.h"
#include "jemalloc/internal/witness.h"

/*
 * Thread-Specific-Data layout
 *
 * At least some thread-local data gets touched on the fast path of almost all
 * malloc operations.  But much of it is only needed down slow paths, or for
 * testing.  We want to colocate the fast-path data so that it can live on the
 * same cacheline if possible.  So we define three tiers of hotness:
 * TSD_DATA_FAST: Touched on the alloc/dalloc fast paths.
 * TSD_DATA_SLOW: Touched down slow paths.  "Slow" here is sort of general;
 *     there are "semi-slow" paths like "not a sized deallocation, but can
 *     still live in the tcache".  We'll want to keep these closer to the
 *     fast-path data.
 * TSD_DATA_SLOWER: Only touched in test or debug modes, or not touched at all.
 *
 * An additional concern is that the larger tcache bins won't be used (we have
 * a bin per size class, but by default only cache relatively small objects).
 * So the earlier bins are in the TSD_DATA_FAST tier, but the later ones are in
 * the TSD_DATA_SLOWER tier.
 *
 * As a result of all this, we put the slow data first, then the fast data,
 * then the slower data, while keeping the tcache as the last element of the
 * fast data (so that the fast -> slower transition happens midway through the
 * tcache).  While we don't yet play alignment tricks to guarantee it, this
 * increases our odds of getting some cache/page locality on fast paths.
 */
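
/*
 * To make the tiering concrete, here is a rough, non-compiled sketch of how
 * the struct defined further down ends up ordered.  Field and type names are
 * taken from the TSD_DATA_* lists below; name mangling (TSD_MANGLE) and most
 * fields are omitted, and the exact layout is of course up to the compiler:
 *
 *     struct tsd_s {
 *         ... TSD_DATA_SLOW fields (prof_tdata, arena, rtree_ctx, ...) ...
 *         tsd_state_t   state;
 *         uint64_t      thread_allocated;        // start of TSD_DATA_FAST
 *         ...
 *         tcache_t      tcache;                  // last of the fast tier
 *         witness_tsd_t witness_tsd;             // TSD_DATA_SLOWER
 *     };
 *
 * so the state byte and the hot counters sit next to the front of the tcache.
 */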

#ifdef JEMALLOC_JET
typedef void (*test_callback_t)(int *);
#  define MALLOC_TSD_TEST_DATA_INIT 0x72b65c10
#  define MALLOC_TEST_TSD \
	O(test_data,			int,			int) \
	O(test_callback,		test_callback_t,	int)
#  define MALLOC_TEST_TSD_INITIALIZER , MALLOC_TSD_TEST_DATA_INIT, NULL
#else
#  define MALLOC_TEST_TSD
#  define MALLOC_TEST_TSD_INITIALIZER
#endif

typedef ql_elm(tsd_t) tsd_link_t;

/*  O(name,				type,			nullable type) */
#define TSD_DATA_SLOW \
	O(tcache_enabled,		bool,			bool) \
	O(reentrancy_level,		int8_t,			int8_t) \
	O(thread_allocated_last_event,	uint64_t,		uint64_t) \
	O(thread_allocated_next_event,	uint64_t,		uint64_t) \
	O(thread_deallocated_last_event, uint64_t,		uint64_t) \
	O(thread_deallocated_next_event, uint64_t,		uint64_t) \
	O(tcache_gc_event_wait,		uint64_t,		uint64_t) \
	O(tcache_gc_dalloc_event_wait,	uint64_t,		uint64_t) \
	O(prof_sample_event_wait,	uint64_t,		uint64_t) \
	O(prof_sample_last_event,	uint64_t,		uint64_t) \
	O(stats_interval_event_wait,	uint64_t,		uint64_t) \
	O(stats_interval_last_event,	uint64_t,		uint64_t) \
	O(peak_alloc_event_wait,	uint64_t,		uint64_t) \
	O(peak_dalloc_event_wait,	uint64_t,		uint64_t) \
	O(prof_tdata,			prof_tdata_t *,		prof_tdata_t *) \
	O(prng_state,			uint64_t,		uint64_t) \
	O(san_extents_until_guard_small, uint64_t,		uint64_t) \
	O(san_extents_until_guard_large, uint64_t,		uint64_t) \
	O(iarena,			arena_t *,		arena_t *) \
	O(arena,			arena_t *,		arena_t *) \
	O(arena_decay_ticker,		ticker_geom_t,		ticker_geom_t) \
	O(sec_shard,			uint8_t,		uint8_t) \
	O(binshards,			tsd_binshards_t,	tsd_binshards_t) \
	O(tsd_link,			tsd_link_t,		tsd_link_t) \
	O(in_hook,			bool,			bool) \
	O(peak,				peak_t,			peak_t) \
	O(activity_callback_thunk,	activity_callback_thunk_t, \
	    activity_callback_thunk_t) \
	O(tcache_slow,			tcache_slow_t,		tcache_slow_t) \
	O(rtree_ctx,			rtree_ctx_t,		rtree_ctx_t)

#define TSD_DATA_SLOW_INITIALIZER \
	/* tcache_enabled */		TCACHE_ENABLED_ZERO_INITIALIZER, \
	/* reentrancy_level */		0, \
	/* thread_allocated_last_event */	0, \
	/* thread_allocated_next_event */	0, \
	/* thread_deallocated_last_event */	0, \
	/* thread_deallocated_next_event */	0, \
	/* tcache_gc_event_wait */		0, \
	/* tcache_gc_dalloc_event_wait */	0, \
	/* prof_sample_event_wait */		0, \
	/* prof_sample_last_event */		0, \
	/* stats_interval_event_wait */		0, \
	/* stats_interval_last_event */		0, \
	/* peak_alloc_event_wait */		0, \
	/* peak_dalloc_event_wait */		0, \
	/* prof_tdata */		NULL, \
	/* prng_state */		0, \
	/* san_extents_until_guard_small */	0, \
	/* san_extents_until_guard_large */	0, \
	/* iarena */			NULL, \
	/* arena */			NULL, \
	/* arena_decay_ticker */ \
	    TICKER_GEOM_INIT(ARENA_DECAY_NTICKS_PER_UPDATE), \
	/* sec_shard */			(uint8_t)-1, \
	/* binshards */			TSD_BINSHARDS_ZERO_INITIALIZER, \
	/* tsd_link */			{NULL}, \
	/* in_hook */			false, \
	/* peak */			PEAK_INITIALIZER, \
	/* activity_callback_thunk */ \
	    ACTIVITY_CALLBACK_THUNK_INITIALIZER, \
	/* tcache_slow */		TCACHE_SLOW_ZERO_INITIALIZER, \
	/* rtree_ctx */			RTREE_CTX_INITIALIZER,

/*  O(name,				type,			nullable type) */
#define TSD_DATA_FAST \
	O(thread_allocated,		uint64_t,		uint64_t) \
	O(thread_allocated_next_event_fast, uint64_t,		uint64_t) \
	O(thread_deallocated,		uint64_t,		uint64_t) \
	O(thread_deallocated_next_event_fast, uint64_t,	uint64_t) \
	O(tcache,			tcache_t,		tcache_t)

#define TSD_DATA_FAST_INITIALIZER \
	/* thread_allocated */			0, \
	/* thread_allocated_next_event_fast */	0, \
	/* thread_deallocated */		0, \
	/* thread_deallocated_next_event_fast */ 0, \
	/* tcache */			TCACHE_ZERO_INITIALIZER,

/*  O(name,				type,			nullable type) */
#define TSD_DATA_SLOWER \
	O(witness_tsd,			witness_tsd_t,		witness_tsdn_t) \
	MALLOC_TEST_TSD
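
/*
 * Illustrative note (a sketch, not authoritative): each O(...) list above is
 * paired with a *_INITIALIZER list that supplies one value per field, in the
 * same order.  TSD_INITIALIZER (defined below) concatenates the three tiers
 * around the state field, so that, roughly,
 *
 *     tsd_t tsd = TSD_INITIALIZER;
 *
 * expands to a brace initializer whose entries line up with TSD_DATA_SLOW,
 * then state, then TSD_DATA_FAST, then TSD_DATA_SLOWER.  This is also why
 * MALLOC_TEST_TSD_INITIALIZER begins with a comma: it splices extra values
 * onto the end of the slower-tier initializer only when JEMALLOC_JET is
 * defined.
 */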

#define TSD_DATA_SLOWER_INITIALIZER \
	/* witness */		WITNESS_TSD_INITIALIZER \
	/* test data */		MALLOC_TEST_TSD_INITIALIZER

#define TSD_INITIALIZER { \
	TSD_DATA_SLOW_INITIALIZER \
	/* state */		ATOMIC_INIT(tsd_state_uninitialized), \
	TSD_DATA_FAST_INITIALIZER \
	TSD_DATA_SLOWER_INITIALIZER \
}

#if defined(JEMALLOC_MALLOC_THREAD_CLEANUP) || defined(_WIN32)
void _malloc_tsd_cleanup_register(bool (*f)(void));
#endif

void *malloc_tsd_malloc(size_t size);
void malloc_tsd_dalloc(void *wrapper);
tsd_t *malloc_tsd_boot0(void);
void malloc_tsd_boot1(void);
void tsd_cleanup(void *arg);
tsd_t *tsd_fetch_slow(tsd_t *tsd, bool internal);
void tsd_state_set(tsd_t *tsd, uint8_t new_state);
void tsd_slow_update(tsd_t *tsd);
void tsd_prefork(tsd_t *tsd);
void tsd_postfork_parent(tsd_t *tsd);
void tsd_postfork_child(tsd_t *tsd);

/*
 * Call ..._inc when your module wants to take all threads down the slow paths,
 * and ..._dec when it no longer needs to.
 */
void tsd_global_slow_inc(tsdn_t *tsdn);
void tsd_global_slow_dec(tsdn_t *tsdn);
bool tsd_global_slow(void);

enum {
	/* Common case --> jnz. */
	tsd_state_nominal = 0,
	/* Initialized but on slow path. */
	tsd_state_nominal_slow = 1,
	/*
	 * Some thread has changed global state in such a way that all nominal
	 * threads need to recompute their fast / slow status the next time
	 * they get a chance.
	 *
	 * Any thread can change another thread's status *to* recompute, but
	 * threads are the only ones who can change their status *from*
	 * recompute.
	 */
	tsd_state_nominal_recompute = 2,
	/*
	 * The above nominal states should be lower values.  We use
	 * tsd_state_nominal_max to separate nominal states from threads in
	 * the process of being born / dying.
	 */
	tsd_state_nominal_max = 2,
	/*
	 * A thread might free() during its death as its only allocator action;
	 * in such scenarios, we need tsd, but set up in such a way that no
	 * cleanup is necessary.
	 */
	tsd_state_minimal_initialized = 3,
	/* States during which we know we're in thread death. */
	tsd_state_purgatory = 4,
	tsd_state_reincarnated = 5,
	/*
	 * What it says on the tin; tsd that hasn't been initialized.  Note
	 * that even when the tsd struct lives in TLS, we need to keep track
	 * of stuff like whether or not our pthread destructors have been
	 * scheduled, so this really truly is different than the nominal state.
	 */
	tsd_state_uninitialized = 6
};

/*
 * Some TSD accesses can only be done in a nominal state.  To enforce this, we
 * wrap TSD member access in a function that asserts on TSD state, and mangle
 * field names to prevent touching them accidentally.
 */
#define TSD_MANGLE(n) cant_access_tsd_items_directly_use_a_getter_or_setter_##n

#ifdef JEMALLOC_U8_ATOMICS
#  define tsd_state_t atomic_u8_t
#  define tsd_atomic_load atomic_load_u8
#  define tsd_atomic_store atomic_store_u8
#  define tsd_atomic_exchange atomic_exchange_u8
#else
#  define tsd_state_t atomic_u32_t
#  define tsd_atomic_load atomic_load_u32
#  define tsd_atomic_store atomic_store_u32
#  define tsd_atomic_exchange atomic_exchange_u32
#endif

/* The actual tsd. */
struct tsd_s {
	/*
	 * The contents should be treated as totally opaque outside the tsd
	 * module.  Access any thread-local state through the getters and
	 * setters below.
	 */

#define O(n, t, nt) \
	t TSD_MANGLE(n);

	TSD_DATA_SLOW
	/*
	 * We manually limit the state to just a single byte, unless the 8-bit
	 * atomics are unavailable (which is rare).
	 */
	tsd_state_t state;
	TSD_DATA_FAST
	TSD_DATA_SLOWER
#undef O
};
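
/*
 * For illustration only (a sketch; the field and accessor names are taken from
 * the lists and macros in this file): because of TSD_MANGLE, a direct access
 * such as
 *
 *     tsd->arena
 *
 * does not compile -- the member is actually named
 * cant_access_tsd_items_directly_use_a_getter_or_setter_arena.  Callers are
 * expected to go through the generated accessors instead, e.g.
 *
 *     arena_t *a = tsd_arena_get(tsd);
 *     tsd_arena_set(tsd, a);
 */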

JEMALLOC_ALWAYS_INLINE uint8_t
tsd_state_get(tsd_t *tsd) {
	/*
	 * This should be atomic.  Unfortunately, compilers right now can't
	 * tell that this can be done as a memory comparison, and force a load
	 * into a register that hurts fast-path performance.
	 */
	/* return atomic_load_u8(&tsd->state, ATOMIC_RELAXED); */
	return *(uint8_t *)&tsd->state;
}

/*
 * Wrapper around tsd_t that makes it possible to avoid implicit conversion
 * between tsd_t and tsdn_t, where tsdn_t is "nullable" and has to be
 * explicitly converted to tsd_t, which is non-nullable.
 */
struct tsdn_s {
	tsd_t tsd;
};
#define TSDN_NULL ((tsdn_t *)0)

JEMALLOC_ALWAYS_INLINE tsdn_t *
tsd_tsdn(tsd_t *tsd) {
	return (tsdn_t *)tsd;
}

JEMALLOC_ALWAYS_INLINE bool
tsdn_null(const tsdn_t *tsdn) {
	return tsdn == NULL;
}

JEMALLOC_ALWAYS_INLINE tsd_t *
tsdn_tsd(tsdn_t *tsdn) {
	assert(!tsdn_null(tsdn));

	return &tsdn->tsd;
}

/*
 * We put the platform-specific data declarations and inlines into their own
 * header files to avoid cluttering this file.  They define tsd_boot0,
 * tsd_boot1, tsd_boot, tsd_booted_get, tsd_get_allocates, tsd_get, and
 * tsd_set.
 */
#ifdef JEMALLOC_MALLOC_THREAD_CLEANUP
#include "jemalloc/internal/tsd_malloc_thread_cleanup.h"
#elif (defined(JEMALLOC_TLS))
#include "jemalloc/internal/tsd_tls.h"
#elif (defined(_WIN32))
#include "jemalloc/internal/tsd_win.h"
#else
#include "jemalloc/internal/tsd_generic.h"
#endif

/*
 * tsd_foop_get_unsafe(tsd) returns a pointer to the thread-local instance of
 * foo.  This omits some safety checks, and so can be used during tsd
 * initialization and cleanup.
 */
#define O(n, t, nt) \
JEMALLOC_ALWAYS_INLINE t * \
tsd_##n##p_get_unsafe(tsd_t *tsd) { \
	return &tsd->TSD_MANGLE(n); \
}
TSD_DATA_SLOW
TSD_DATA_FAST
TSD_DATA_SLOWER
#undef O

/* tsd_foop_get(tsd) returns a pointer to the thread-local instance of foo. */
#define O(n, t, nt) \
JEMALLOC_ALWAYS_INLINE t * \
tsd_##n##p_get(tsd_t *tsd) { \
	/* \
	 * Because the state might change asynchronously if it's \
	 * nominal, we need to make sure that we only read it once. \
	 */ \
	uint8_t state = tsd_state_get(tsd); \
	assert(state == tsd_state_nominal || \
	    state == tsd_state_nominal_slow || \
	    state == tsd_state_nominal_recompute || \
	    state == tsd_state_reincarnated || \
	    state == tsd_state_minimal_initialized); \
	return tsd_##n##p_get_unsafe(tsd); \
}
TSD_DATA_SLOW
TSD_DATA_FAST
TSD_DATA_SLOWER
#undef O

/*
 * tsdn_foop_get(tsdn) returns either the thread-local instance of foo (if tsdn
 * isn't NULL), or NULL (if tsdn is NULL), cast to the nullable pointer type.
 */
#define O(n, t, nt) \
JEMALLOC_ALWAYS_INLINE nt * \
tsdn_##n##p_get(tsdn_t *tsdn) { \
	if (tsdn_null(tsdn)) { \
		return NULL; \
	} \
	tsd_t *tsd = tsdn_tsd(tsdn); \
	return (nt *)tsd_##n##p_get(tsd); \
}
TSD_DATA_SLOW
TSD_DATA_FAST
TSD_DATA_SLOWER
#undef O

/* tsd_foo_get(tsd) returns the value of the thread-local instance of foo. */
#define O(n, t, nt) \
JEMALLOC_ALWAYS_INLINE t \
tsd_##n##_get(tsd_t *tsd) { \
	return *tsd_##n##p_get(tsd); \
}
TSD_DATA_SLOW
TSD_DATA_FAST
TSD_DATA_SLOWER
#undef O

/* tsd_foo_set(tsd, val) updates the thread-local instance of foo to be val. */
#define O(n, t, nt) \
JEMALLOC_ALWAYS_INLINE void \
tsd_##n##_set(tsd_t *tsd, t val) { \
	assert(tsd_state_get(tsd) != tsd_state_reincarnated && \
	    tsd_state_get(tsd) != tsd_state_minimal_initialized); \
	*tsd_##n##p_get(tsd) = val; \
}
TSD_DATA_SLOW
TSD_DATA_FAST
TSD_DATA_SLOWER
#undef O
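
/*
 * As a concrete, illustrative example (not compiled here): for the entry
 * O(thread_allocated, uint64_t, uint64_t), the blocks above generate the
 * following always-inline accessor family:
 *
 *     uint64_t *tsd_thread_allocatedp_get_unsafe(tsd_t *tsd);
 *     uint64_t *tsd_thread_allocatedp_get(tsd_t *tsd);
 *     uint64_t *tsdn_thread_allocatedp_get(tsdn_t *tsdn);  (NULL if tsdn is
 *         NULL)
 *     uint64_t tsd_thread_allocated_get(tsd_t *tsd);
 *     void tsd_thread_allocated_set(tsd_t *tsd, uint64_t val);
 */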

JEMALLOC_ALWAYS_INLINE void
tsd_assert_fast(tsd_t *tsd) {
	/*
	 * Note that our fastness assertion does *not* include global slowness
	 * counters; it's not in general possible to ensure that they won't
	 * change asynchronously from underneath us.
	 */
	assert(!malloc_slow && tsd_tcache_enabled_get(tsd) &&
	    tsd_reentrancy_level_get(tsd) == 0);
}

JEMALLOC_ALWAYS_INLINE bool
tsd_fast(tsd_t *tsd) {
	bool fast = (tsd_state_get(tsd) == tsd_state_nominal);
	if (fast) {
		tsd_assert_fast(tsd);
	}

	return fast;
}

JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_fetch_impl(bool init, bool minimal) {
	tsd_t *tsd = tsd_get(init);

	if (!init && tsd_get_allocates() && tsd == NULL) {
		return NULL;
	}
	assert(tsd != NULL);

	if (unlikely(tsd_state_get(tsd) != tsd_state_nominal)) {
		return tsd_fetch_slow(tsd, minimal);
	}
	assert(tsd_fast(tsd));
	tsd_assert_fast(tsd);

	return tsd;
}

/* Get a minimal TSD that requires no cleanup.  See comments in free(). */
JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_fetch_min(void) {
	return tsd_fetch_impl(true, true);
}

/* For internal background thread use only. */
JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_internal_fetch(void) {
	tsd_t *tsd = tsd_fetch_min();
	/* Use reincarnated state to prevent full initialization. */
	tsd_state_set(tsd, tsd_state_reincarnated);

	return tsd;
}

JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_fetch(void) {
	return tsd_fetch_impl(true, false);
}

static inline bool
tsd_nominal(tsd_t *tsd) {
	bool nominal = tsd_state_get(tsd) <= tsd_state_nominal_max;
	assert(nominal || tsd_reentrancy_level_get(tsd) > 0);

	return nominal;
}

JEMALLOC_ALWAYS_INLINE tsdn_t *
tsdn_fetch(void) {
	if (!tsd_booted_get()) {
		return NULL;
	}

	return tsd_tsdn(tsd_fetch_impl(false, false));
}

JEMALLOC_ALWAYS_INLINE rtree_ctx_t *
tsd_rtree_ctx(tsd_t *tsd) {
	return tsd_rtree_ctxp_get(tsd);
}

JEMALLOC_ALWAYS_INLINE rtree_ctx_t *
tsdn_rtree_ctx(tsdn_t *tsdn, rtree_ctx_t *fallback) {
	/*
	 * If tsd cannot be accessed, initialize the fallback rtree_ctx and
	 * return a pointer to it.
	 */
	if (unlikely(tsdn_null(tsdn))) {
		rtree_ctx_data_init(fallback);
		return fallback;
	}
	return tsd_rtree_ctx(tsdn_tsd(tsdn));
}

static inline bool
tsd_state_nocleanup(tsd_t *tsd) {
	return tsd_state_get(tsd) == tsd_state_reincarnated ||
	    tsd_state_get(tsd) == tsd_state_minimal_initialized;
}

/*
 * These "raw" tsd reentrancy functions don't have any debug checking to make
 * sure that we're not touching arena 0.  Prefer pre_reentrancy and
 * post_reentrancy where possible.
 */
static inline void
tsd_pre_reentrancy_raw(tsd_t *tsd) {
	bool fast = tsd_fast(tsd);
	assert(tsd_reentrancy_level_get(tsd) < INT8_MAX);
	++*tsd_reentrancy_levelp_get(tsd);
	if (fast) {
		/* Prepare slow path for reentrancy. */
		tsd_slow_update(tsd);
		assert(tsd_state_get(tsd) == tsd_state_nominal_slow);
	}
}

static inline void
tsd_post_reentrancy_raw(tsd_t *tsd) {
	int8_t *reentrancy_level = tsd_reentrancy_levelp_get(tsd);
	assert(*reentrancy_level > 0);
	if (--*reentrancy_level == 0) {
		tsd_slow_update(tsd);
	}
}

#endif /* JEMALLOC_INTERNAL_TSD_H */