From 34996e42f82bfd60bc2c191e5cae3c6ab233ec6c Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 7 Aug 2024 15:11:27 +0200 Subject: Merging upstream version 6.9.7. Signed-off-by: Daniel Baumann --- mm/zswap.c | 2093 +++++++++++++++++++++++++++++------------------------------- 1 file changed, 1013 insertions(+), 1080 deletions(-) (limited to 'mm/zswap.c') diff --git a/mm/zswap.c b/mm/zswap.c index 69766f2c5a..6f8850c44b 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -71,8 +71,6 @@ static u64 zswap_reject_compress_poor; static u64 zswap_reject_alloc_fail; /* Store failed because the entry metadata could not be allocated (rare) */ static u64 zswap_reject_kmemcache_fail; -/* Duplicate store was encountered (rare) */ -static u64 zswap_duplicate_entry; /* Shrinker work queue */ static struct workqueue_struct *shrink_wq; @@ -141,10 +139,6 @@ static bool zswap_non_same_filled_pages_enabled = true; module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled, bool, 0644); -static bool zswap_exclusive_loads_enabled = IS_ENABLED( - CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON); -module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644); - /* Number of zpools in zswap_pool (empirically determined for scalability) */ #define ZSWAP_NR_ZPOOLS 32 @@ -168,6 +162,7 @@ struct crypto_acomp_ctx { struct crypto_wait wait; u8 *buffer; struct mutex mutex; + bool is_sleepable; }; /* @@ -179,18 +174,24 @@ struct crypto_acomp_ctx { struct zswap_pool { struct zpool *zpools[ZSWAP_NR_ZPOOLS]; struct crypto_acomp_ctx __percpu *acomp_ctx; - struct kref kref; + struct percpu_ref ref; struct list_head list; struct work_struct release_work; - struct work_struct shrink_work; struct hlist_node node; char tfm_name[CRYPTO_MAX_ALG_NAME]; - struct list_lru list_lru; - struct mem_cgroup *next_shrink; - struct shrinker *shrinker; - atomic_t nr_stored; }; +/* Global LRU lists shared by all zswap pools. */ +static struct list_lru zswap_list_lru; +/* counter of pages stored in all zswap pools. */ +static atomic_t zswap_nr_stored = ATOMIC_INIT(0); + +/* The lock protects zswap_next_shrink updates. */ +static DEFINE_SPINLOCK(zswap_shrink_lock); +static struct mem_cgroup *zswap_next_shrink; +static struct work_struct zswap_shrink_work; +static struct shrinker *zswap_shrinker; + /* * struct zswap_entry * @@ -199,12 +200,6 @@ struct zswap_pool { * * rbnode - links the entry into red-black tree for the appropriate swap type * swpentry - associated swap entry, the offset indexes into the red-black tree - * refcount - the number of outstanding reference to the entry. This is needed - * to protect against premature freeing of the entry by code - * concurrent calls to load, invalidate, and writeback. The lock - * for the zswap_tree structure that contains the entry must - * be held while changing the refcount. Since the lock must - * be held, there is no reason to also make refcount atomic. * length - the length in bytes of the compressed page data. Needed during * decompression. For a same value filled page length is 0, and both * pool and lru are invalid and must be ignored. @@ -217,7 +212,6 @@ struct zswap_pool { struct zswap_entry { struct rb_node rbnode; swp_entry_t swpentry; - int refcount; unsigned int length; struct zswap_pool *pool; union { @@ -228,17 +222,13 @@ struct zswap_entry { struct list_head lru; }; -/* - * The tree lock in the zswap_tree struct protects a few things: - * - the rbtree - * - the refcount field of each entry in the tree - */ struct zswap_tree { struct rb_root rbroot; spinlock_t lock; }; static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; +static unsigned int nr_zswap_trees[MAX_SWAPFILES]; /* RCU-protected iteration */ static LIST_HEAD(zswap_pools); @@ -265,15 +255,16 @@ static bool zswap_has_pool; * helpers and fwd declarations **********************************/ +static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp) +{ + return &zswap_trees[swp_type(swp)][swp_offset(swp) + >> SWAP_ADDRESS_SPACE_SHIFT]; +} + #define zswap_pool_debug(msg, p) \ pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ zpool_get_type((p)->zpools[0])) -static int zswap_writeback_entry(struct zswap_entry *entry, - struct zswap_tree *tree); -static int zswap_pool_get(struct zswap_pool *pool); -static void zswap_pool_put(struct zswap_pool *pool); - static bool zswap_is_full(void) { return totalram_pages() * zswap_max_pool_percent / 100 < @@ -313,400 +304,631 @@ static void zswap_update_total_size(void) zswap_pool_total_size = total; } -/* should be called under RCU */ -#ifdef CONFIG_MEMCG -static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) +/********************************* +* pool functions +**********************************/ +static void __zswap_pool_empty(struct percpu_ref *ref); + +static struct zswap_pool *zswap_pool_create(char *type, char *compressor) { - return entry->objcg ? obj_cgroup_memcg(entry->objcg) : NULL; + int i; + struct zswap_pool *pool; + char name[38]; /* 'zswap' + 32 char (max) num + \0 */ + gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; + int ret; + + if (!zswap_has_pool) { + /* if either are unset, pool initialization failed, and we + * need both params to be set correctly before trying to + * create a pool. + */ + if (!strcmp(type, ZSWAP_PARAM_UNSET)) + return NULL; + if (!strcmp(compressor, ZSWAP_PARAM_UNSET)) + return NULL; + } + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return NULL; + + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) { + /* unique name for each pool specifically required by zsmalloc */ + snprintf(name, 38, "zswap%x", + atomic_inc_return(&zswap_pools_count)); + + pool->zpools[i] = zpool_create_pool(type, name, gfp); + if (!pool->zpools[i]) { + pr_err("%s zpool not available\n", type); + goto error; + } + } + pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0])); + + strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); + + pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx); + if (!pool->acomp_ctx) { + pr_err("percpu alloc failed\n"); + goto error; + } + + ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, + &pool->node); + if (ret) + goto error; + + /* being the current pool takes 1 ref; this func expects the + * caller to always add the new pool as the current pool + */ + ret = percpu_ref_init(&pool->ref, __zswap_pool_empty, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL); + if (ret) + goto ref_fail; + INIT_LIST_HEAD(&pool->list); + + zswap_pool_debug("created", pool); + + return pool; + +ref_fail: + cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); +error: + if (pool->acomp_ctx) + free_percpu(pool->acomp_ctx); + while (i--) + zpool_destroy_pool(pool->zpools[i]); + kfree(pool); + return NULL; } -#else -static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) + +static struct zswap_pool *__zswap_pool_create_fallback(void) { - return NULL; + bool has_comp, has_zpool; + + has_comp = crypto_has_acomp(zswap_compressor, 0, 0); + if (!has_comp && strcmp(zswap_compressor, + CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) { + pr_err("compressor %s not available, using default %s\n", + zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT); + param_free_charp(&zswap_compressor); + zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; + has_comp = crypto_has_acomp(zswap_compressor, 0, 0); + } + if (!has_comp) { + pr_err("default compressor %s not available\n", + zswap_compressor); + param_free_charp(&zswap_compressor); + zswap_compressor = ZSWAP_PARAM_UNSET; + } + + has_zpool = zpool_has_pool(zswap_zpool_type); + if (!has_zpool && strcmp(zswap_zpool_type, + CONFIG_ZSWAP_ZPOOL_DEFAULT)) { + pr_err("zpool %s not available, using default %s\n", + zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT); + param_free_charp(&zswap_zpool_type); + zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; + has_zpool = zpool_has_pool(zswap_zpool_type); + } + if (!has_zpool) { + pr_err("default zpool %s not available\n", + zswap_zpool_type); + param_free_charp(&zswap_zpool_type); + zswap_zpool_type = ZSWAP_PARAM_UNSET; + } + + if (!has_comp || !has_zpool) + return NULL; + + return zswap_pool_create(zswap_zpool_type, zswap_compressor); } -#endif -static inline int entry_to_nid(struct zswap_entry *entry) +static void zswap_pool_destroy(struct zswap_pool *pool) { - return page_to_nid(virt_to_page(entry)); + int i; + + zswap_pool_debug("destroying", pool); + + cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); + free_percpu(pool->acomp_ctx); + + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) + zpool_destroy_pool(pool->zpools[i]); + kfree(pool); } -void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) +static void __zswap_pool_release(struct work_struct *work) { - struct zswap_pool *pool; + struct zswap_pool *pool = container_of(work, typeof(*pool), + release_work); - /* lock out zswap pools list modification */ - spin_lock(&zswap_pools_lock); - list_for_each_entry(pool, &zswap_pools, list) { - if (pool->next_shrink == memcg) - pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); - } - spin_unlock(&zswap_pools_lock); + synchronize_rcu(); + + /* nobody should have been able to get a ref... */ + WARN_ON(!percpu_ref_is_zero(&pool->ref)); + percpu_ref_exit(&pool->ref); + + /* pool is now off zswap_pools list and has no references. */ + zswap_pool_destroy(pool); } -/********************************* -* zswap entry functions -**********************************/ -static struct kmem_cache *zswap_entry_cache; +static struct zswap_pool *zswap_pool_current(void); -static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) +static void __zswap_pool_empty(struct percpu_ref *ref) { - struct zswap_entry *entry; - entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); - if (!entry) - return NULL; - entry->refcount = 1; - RB_CLEAR_NODE(&entry->rbnode); - return entry; + struct zswap_pool *pool; + + pool = container_of(ref, typeof(*pool), ref); + + spin_lock_bh(&zswap_pools_lock); + + WARN_ON(pool == zswap_pool_current()); + + list_del_rcu(&pool->list); + + INIT_WORK(&pool->release_work, __zswap_pool_release); + schedule_work(&pool->release_work); + + spin_unlock_bh(&zswap_pools_lock); } -static void zswap_entry_cache_free(struct zswap_entry *entry) +static int __must_check zswap_pool_get(struct zswap_pool *pool) { - kmem_cache_free(zswap_entry_cache, entry); + if (!pool) + return 0; + + return percpu_ref_tryget(&pool->ref); } -/********************************* -* zswap lruvec functions -**********************************/ -void zswap_lruvec_state_init(struct lruvec *lruvec) +static void zswap_pool_put(struct zswap_pool *pool) { - atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); + percpu_ref_put(&pool->ref); } -void zswap_folio_swapin(struct folio *folio) +static struct zswap_pool *__zswap_pool_current(void) { - struct lruvec *lruvec; + struct zswap_pool *pool; - VM_WARN_ON_ONCE(!folio_test_locked(folio)); - lruvec = folio_lruvec(folio); - atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); + pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list); + WARN_ONCE(!pool && zswap_has_pool, + "%s: no page storage pool!\n", __func__); + + return pool; } -/********************************* -* lru functions -**********************************/ -static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) +static struct zswap_pool *zswap_pool_current(void) { - atomic_long_t *nr_zswap_protected; - unsigned long lru_size, old, new; - int nid = entry_to_nid(entry); - struct mem_cgroup *memcg; - struct lruvec *lruvec; - - /* - * Note that it is safe to use rcu_read_lock() here, even in the face of - * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection - * used in list_lru lookup, only two scenarios are possible: - * - * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The - * new entry will be reparented to memcg's parent's list_lru. - * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The - * new entry will be added directly to memcg's parent's list_lru. - * - * Similar reasoning holds for list_lru_del() and list_lru_putback(). - */ - rcu_read_lock(); - memcg = mem_cgroup_from_entry(entry); - /* will always succeed */ - list_lru_add(list_lru, &entry->lru, nid, memcg); + assert_spin_locked(&zswap_pools_lock); - /* Update the protection area */ - lru_size = list_lru_count_one(list_lru, nid, memcg); - lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); - nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected; - old = atomic_long_inc_return(nr_zswap_protected); - /* - * Decay to avoid overflow and adapt to changing workloads. - * This is based on LRU reclaim cost decaying heuristics. - */ - do { - new = old > lru_size / 4 ? old / 2 : old; - } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new)); - rcu_read_unlock(); + return __zswap_pool_current(); } -static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) +static struct zswap_pool *zswap_pool_current_get(void) { - int nid = entry_to_nid(entry); - struct mem_cgroup *memcg; + struct zswap_pool *pool; rcu_read_lock(); - memcg = mem_cgroup_from_entry(entry); - /* will always succeed */ - list_lru_del(list_lru, &entry->lru, nid, memcg); + + pool = __zswap_pool_current(); + if (!zswap_pool_get(pool)) + pool = NULL; + rcu_read_unlock(); + + return pool; } -static void zswap_lru_putback(struct list_lru *list_lru, - struct zswap_entry *entry) +/* type and compressor must be null-terminated */ +static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) { - int nid = entry_to_nid(entry); - spinlock_t *lock = &list_lru->node[nid].lock; - struct mem_cgroup *memcg; - struct lruvec *lruvec; + struct zswap_pool *pool; - rcu_read_lock(); - memcg = mem_cgroup_from_entry(entry); - spin_lock(lock); - /* we cannot use list_lru_add here, because it increments node's lru count */ - list_lru_putback(list_lru, &entry->lru, nid, memcg); - spin_unlock(lock); + assert_spin_locked(&zswap_pools_lock); - lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(entry_to_nid(entry))); - /* increment the protection area to account for the LRU rotation. */ - atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); - rcu_read_unlock(); + list_for_each_entry_rcu(pool, &zswap_pools, list) { + if (strcmp(pool->tfm_name, compressor)) + continue; + /* all zpools share the same type */ + if (strcmp(zpool_get_type(pool->zpools[0]), type)) + continue; + /* if we can't get it, it's about to be destroyed */ + if (!zswap_pool_get(pool)) + continue; + return pool; + } + + return NULL; } /********************************* -* rbtree functions +* param callbacks **********************************/ -static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) -{ - struct rb_node *node = root->rb_node; - struct zswap_entry *entry; - pgoff_t entry_offset; - while (node) { - entry = rb_entry(node, struct zswap_entry, rbnode); - entry_offset = swp_offset(entry->swpentry); - if (entry_offset > offset) - node = node->rb_left; - else if (entry_offset < offset) - node = node->rb_right; - else - return entry; - } - return NULL; +static bool zswap_pool_changed(const char *s, const struct kernel_param *kp) +{ + /* no change required */ + if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool) + return false; + return true; } -/* - * In the case that a entry with the same offset is found, a pointer to - * the existing entry is stored in dupentry and the function returns -EEXIST - */ -static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, - struct zswap_entry **dupentry) +/* val must be a null-terminated string */ +static int __zswap_param_set(const char *val, const struct kernel_param *kp, + char *type, char *compressor) { - struct rb_node **link = &root->rb_node, *parent = NULL; - struct zswap_entry *myentry; - pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry); + struct zswap_pool *pool, *put_pool = NULL; + char *s = strstrip((char *)val); + int ret = 0; + bool new_pool = false; - while (*link) { - parent = *link; - myentry = rb_entry(parent, struct zswap_entry, rbnode); - myentry_offset = swp_offset(myentry->swpentry); - if (myentry_offset > entry_offset) - link = &(*link)->rb_left; - else if (myentry_offset < entry_offset) - link = &(*link)->rb_right; - else { - *dupentry = myentry; - return -EEXIST; - } + mutex_lock(&zswap_init_lock); + switch (zswap_init_state) { + case ZSWAP_UNINIT: + /* if this is load-time (pre-init) param setting, + * don't create a pool; that's done during init. + */ + ret = param_set_charp(s, kp); + break; + case ZSWAP_INIT_SUCCEED: + new_pool = zswap_pool_changed(s, kp); + break; + case ZSWAP_INIT_FAILED: + pr_err("can't set param, initialization failed\n"); + ret = -ENODEV; } - rb_link_node(&entry->rbnode, parent, link); - rb_insert_color(&entry->rbnode, root); - return 0; -} + mutex_unlock(&zswap_init_lock); -static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) -{ - if (!RB_EMPTY_NODE(&entry->rbnode)) { - rb_erase(&entry->rbnode, root); - RB_CLEAR_NODE(&entry->rbnode); - return true; + /* no need to create a new pool, return directly */ + if (!new_pool) + return ret; + + if (!type) { + if (!zpool_has_pool(s)) { + pr_err("zpool %s not available\n", s); + return -ENOENT; + } + type = s; + } else if (!compressor) { + if (!crypto_has_acomp(s, 0, 0)) { + pr_err("compressor %s not available\n", s); + return -ENOENT; + } + compressor = s; + } else { + WARN_ON(1); + return -EINVAL; } - return false; -} -static struct zpool *zswap_find_zpool(struct zswap_entry *entry) -{ - int i = 0; + spin_lock_bh(&zswap_pools_lock); - if (ZSWAP_NR_ZPOOLS > 1) - i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS)); + pool = zswap_pool_find_get(type, compressor); + if (pool) { + zswap_pool_debug("using existing", pool); + WARN_ON(pool == zswap_pool_current()); + list_del_rcu(&pool->list); + } - return entry->pool->zpools[i]; -} + spin_unlock_bh(&zswap_pools_lock); -/* - * Carries out the common pattern of freeing and entry's zpool allocation, - * freeing the entry itself, and decrementing the number of stored pages. - */ -static void zswap_free_entry(struct zswap_entry *entry) -{ - if (!entry->length) - atomic_dec(&zswap_same_filled_pages); + if (!pool) + pool = zswap_pool_create(type, compressor); else { - zswap_lru_del(&entry->pool->list_lru, entry); - zpool_free(zswap_find_zpool(entry), entry->handle); - atomic_dec(&entry->pool->nr_stored); - zswap_pool_put(entry->pool); + /* + * Restore the initial ref dropped by percpu_ref_kill() + * when the pool was decommissioned and switch it again + * to percpu mode. + */ + percpu_ref_resurrect(&pool->ref); + + /* Drop the ref from zswap_pool_find_get(). */ + zswap_pool_put(pool); } - if (entry->objcg) { - obj_cgroup_uncharge_zswap(entry->objcg, entry->length); - obj_cgroup_put(entry->objcg); + + if (pool) + ret = param_set_charp(s, kp); + else + ret = -EINVAL; + + spin_lock_bh(&zswap_pools_lock); + + if (!ret) { + put_pool = zswap_pool_current(); + list_add_rcu(&pool->list, &zswap_pools); + zswap_has_pool = true; + } else if (pool) { + /* add the possibly pre-existing pool to the end of the pools + * list; if it's new (and empty) then it'll be removed and + * destroyed by the put after we drop the lock + */ + list_add_tail_rcu(&pool->list, &zswap_pools); + put_pool = pool; } - zswap_entry_cache_free(entry); - atomic_dec(&zswap_stored_pages); - zswap_update_total_size(); + + spin_unlock_bh(&zswap_pools_lock); + + if (!zswap_has_pool && !pool) { + /* if initial pool creation failed, and this pool creation also + * failed, maybe both compressor and zpool params were bad. + * Allow changing this param, so pool creation will succeed + * when the other param is changed. We already verified this + * param is ok in the zpool_has_pool() or crypto_has_acomp() + * checks above. + */ + ret = param_set_charp(s, kp); + } + + /* drop the ref from either the old current pool, + * or the new pool we failed to add + */ + if (put_pool) + percpu_ref_kill(&put_pool->ref); + + return ret; } -/* caller must hold the tree lock */ -static void zswap_entry_get(struct zswap_entry *entry) +static int zswap_compressor_param_set(const char *val, + const struct kernel_param *kp) { - entry->refcount++; + return __zswap_param_set(val, kp, zswap_zpool_type, NULL); } -/* caller must hold the tree lock -* remove from the tree and free it, if nobody reference the entry -*/ -static void zswap_entry_put(struct zswap_tree *tree, - struct zswap_entry *entry) +static int zswap_zpool_param_set(const char *val, + const struct kernel_param *kp) { - int refcount = --entry->refcount; - - WARN_ON_ONCE(refcount < 0); - if (refcount == 0) { - WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode)); - zswap_free_entry(entry); - } + return __zswap_param_set(val, kp, NULL, zswap_compressor); } -/* caller must hold the tree lock */ -static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, - pgoff_t offset) +static int zswap_enabled_param_set(const char *val, + const struct kernel_param *kp) { - struct zswap_entry *entry; + int ret = -ENODEV; - entry = zswap_rb_search(root, offset); - if (entry) - zswap_entry_get(entry); + /* if this is load-time (pre-init) param setting, only set param. */ + if (system_state != SYSTEM_RUNNING) + return param_set_bool(val, kp); - return entry; + mutex_lock(&zswap_init_lock); + switch (zswap_init_state) { + case ZSWAP_UNINIT: + if (zswap_setup()) + break; + fallthrough; + case ZSWAP_INIT_SUCCEED: + if (!zswap_has_pool) + pr_err("can't enable, no pool configured\n"); + else + ret = param_set_bool(val, kp); + break; + case ZSWAP_INIT_FAILED: + pr_err("can't enable, initialization failed\n"); + } + mutex_unlock(&zswap_init_lock); + + return ret; } /********************************* -* shrinker functions +* lru functions **********************************/ -static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, - spinlock_t *lock, void *arg); -static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, - struct shrink_control *sc) +/* should be called under RCU */ +#ifdef CONFIG_MEMCG +static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) { - struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); - unsigned long shrink_ret, nr_protected, lru_size; - struct zswap_pool *pool = shrinker->private_data; - bool encountered_page_in_swapcache = false; - - if (!zswap_shrinker_enabled || - !mem_cgroup_zswap_writeback_enabled(sc->memcg)) { - sc->nr_scanned = 0; - return SHRINK_STOP; - } + return entry->objcg ? obj_cgroup_memcg(entry->objcg) : NULL; +} +#else +static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) +{ + return NULL; +} +#endif - nr_protected = - atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); - lru_size = list_lru_shrink_count(&pool->list_lru, sc); +static inline int entry_to_nid(struct zswap_entry *entry) +{ + return page_to_nid(virt_to_page(entry)); +} - /* - * Abort if we are shrinking into the protected region. - * - * This short-circuiting is necessary because if we have too many multiple - * concurrent reclaimers getting the freeable zswap object counts at the - * same time (before any of them made reasonable progress), the total - * number of reclaimed objects might be more than the number of unprotected - * objects (i.e the reclaimers will reclaim into the protected area of the - * zswap LRU). +static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) +{ + atomic_long_t *nr_zswap_protected; + unsigned long lru_size, old, new; + int nid = entry_to_nid(entry); + struct mem_cgroup *memcg; + struct lruvec *lruvec; + + /* + * Note that it is safe to use rcu_read_lock() here, even in the face of + * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection + * used in list_lru lookup, only two scenarios are possible: + * + * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The + * new entry will be reparented to memcg's parent's list_lru. + * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The + * new entry will be added directly to memcg's parent's list_lru. + * + * Similar reasoning holds for list_lru_del(). */ - if (nr_protected >= lru_size - sc->nr_to_scan) { - sc->nr_scanned = 0; - return SHRINK_STOP; - } + rcu_read_lock(); + memcg = mem_cgroup_from_entry(entry); + /* will always succeed */ + list_lru_add(list_lru, &entry->lru, nid, memcg); - shrink_ret = list_lru_shrink_walk(&pool->list_lru, sc, &shrink_memcg_cb, - &encountered_page_in_swapcache); + /* Update the protection area */ + lru_size = list_lru_count_one(list_lru, nid, memcg); + lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); + nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected; + old = atomic_long_inc_return(nr_zswap_protected); + /* + * Decay to avoid overflow and adapt to changing workloads. + * This is based on LRU reclaim cost decaying heuristics. + */ + do { + new = old > lru_size / 4 ? old / 2 : old; + } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new)); + rcu_read_unlock(); +} - if (encountered_page_in_swapcache) - return SHRINK_STOP; +static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) +{ + int nid = entry_to_nid(entry); + struct mem_cgroup *memcg; - return shrink_ret ? shrink_ret : SHRINK_STOP; + rcu_read_lock(); + memcg = mem_cgroup_from_entry(entry); + /* will always succeed */ + list_lru_del(list_lru, &entry->lru, nid, memcg); + rcu_read_unlock(); } -static unsigned long zswap_shrinker_count(struct shrinker *shrinker, - struct shrink_control *sc) +void zswap_lruvec_state_init(struct lruvec *lruvec) { - struct zswap_pool *pool = shrinker->private_data; - struct mem_cgroup *memcg = sc->memcg; - struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid)); - unsigned long nr_backing, nr_stored, nr_freeable, nr_protected; + atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); +} - if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg)) - return 0; +void zswap_folio_swapin(struct folio *folio) +{ + struct lruvec *lruvec; - /* - * The shrinker resumes swap writeback, which will enter block - * and may enter fs. XXX: Harmonize with vmscan.c __GFP_FS - * rules (may_enter_fs()), which apply on a per-folio basis. - */ - if (!gfp_has_io_fs(sc->gfp_mask)) - return 0; + if (folio) { + lruvec = folio_lruvec(folio); + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); + } +} - /* - * For memcg, use the cgroup-wide ZSWAP stats since we don't - * have them per-node and thus per-lruvec. Careful if memcg is - * runtime-disabled: we can get sc->memcg == NULL, which is ok - * for the lruvec, but not for memcg_page_state(). - * - * Without memcg, use the zswap pool-wide metrics. - */ - if (!mem_cgroup_disabled()) { - mem_cgroup_flush_stats(memcg); - nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; - nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); - } else { - nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT; - nr_stored = atomic_read(&pool->nr_stored); +void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) +{ + /* lock out zswap shrinker walking memcg tree */ + spin_lock(&zswap_shrink_lock); + if (zswap_next_shrink == memcg) + zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); + spin_unlock(&zswap_shrink_lock); +} + +/********************************* +* rbtree functions +**********************************/ +static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) +{ + struct rb_node *node = root->rb_node; + struct zswap_entry *entry; + pgoff_t entry_offset; + + while (node) { + entry = rb_entry(node, struct zswap_entry, rbnode); + entry_offset = swp_offset(entry->swpentry); + if (entry_offset > offset) + node = node->rb_left; + else if (entry_offset < offset) + node = node->rb_right; + else + return entry; } + return NULL; +} - if (!nr_stored) - return 0; +/* + * In the case that a entry with the same offset is found, a pointer to + * the existing entry is stored in dupentry and the function returns -EEXIST + */ +static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, + struct zswap_entry **dupentry) +{ + struct rb_node **link = &root->rb_node, *parent = NULL; + struct zswap_entry *myentry; + pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry); - nr_protected = - atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); - nr_freeable = list_lru_shrink_count(&pool->list_lru, sc); - /* - * Subtract the lru size by an estimate of the number of pages - * that should be protected. - */ - nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0; + while (*link) { + parent = *link; + myentry = rb_entry(parent, struct zswap_entry, rbnode); + myentry_offset = swp_offset(myentry->swpentry); + if (myentry_offset > entry_offset) + link = &(*link)->rb_left; + else if (myentry_offset < entry_offset) + link = &(*link)->rb_right; + else { + *dupentry = myentry; + return -EEXIST; + } + } + rb_link_node(&entry->rbnode, parent, link); + rb_insert_color(&entry->rbnode, root); + return 0; +} - /* - * Scale the number of freeable pages by the memory saving factor. - * This ensures that the better zswap compresses memory, the fewer - * pages we will evict to swap (as it will otherwise incur IO for - * relatively small memory saving). - */ - return mult_frac(nr_freeable, nr_backing, nr_stored); +static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) +{ + rb_erase(&entry->rbnode, root); + RB_CLEAR_NODE(&entry->rbnode); } -static void zswap_alloc_shrinker(struct zswap_pool *pool) +/********************************* +* zswap entry functions +**********************************/ +static struct kmem_cache *zswap_entry_cache; + +static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) { - pool->shrinker = - shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap"); - if (!pool->shrinker) - return; + struct zswap_entry *entry; + entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); + if (!entry) + return NULL; + RB_CLEAR_NODE(&entry->rbnode); + return entry; +} + +static void zswap_entry_cache_free(struct zswap_entry *entry) +{ + kmem_cache_free(zswap_entry_cache, entry); +} + +static struct zpool *zswap_find_zpool(struct zswap_entry *entry) +{ + int i = 0; + + if (ZSWAP_NR_ZPOOLS > 1) + i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS)); + + return entry->pool->zpools[i]; +} - pool->shrinker->private_data = pool; - pool->shrinker->scan_objects = zswap_shrinker_scan; - pool->shrinker->count_objects = zswap_shrinker_count; - pool->shrinker->batch = 0; - pool->shrinker->seeks = DEFAULT_SEEKS; +/* + * Carries out the common pattern of freeing and entry's zpool allocation, + * freeing the entry itself, and decrementing the number of stored pages. + */ +static void zswap_entry_free(struct zswap_entry *entry) +{ + if (!entry->length) + atomic_dec(&zswap_same_filled_pages); + else { + zswap_lru_del(&zswap_list_lru, entry); + zpool_free(zswap_find_zpool(entry), entry->handle); + atomic_dec(&zswap_nr_stored); + zswap_pool_put(entry->pool); + } + if (entry->objcg) { + obj_cgroup_uncharge_zswap(entry->objcg, entry->length); + obj_cgroup_put(entry->objcg); + } + zswap_entry_cache_free(entry); + atomic_dec(&zswap_stored_pages); + zswap_update_total_size(); +} + +/* + * The caller hold the tree lock and search the entry from the tree, + * so it must be on the tree, remove it from the tree and free it. + */ +static void zswap_invalidate_entry(struct zswap_tree *tree, + struct zswap_entry *entry) +{ + zswap_rb_erase(&tree->rbroot, entry); + zswap_entry_free(entry); } /********************************* -* per-cpu code +* compressed storage functions **********************************/ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) { @@ -730,6 +952,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) goto acomp_fail; } acomp_ctx->acomp = acomp; + acomp_ctx->is_sleepable = acomp_is_async(acomp); req = acomp_request_alloc(acomp_ctx->acomp); if (!req) { @@ -774,134 +997,262 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) return 0; } -/********************************* -* pool functions -**********************************/ - -static struct zswap_pool *__zswap_pool_current(void) +static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) { - struct zswap_pool *pool; - - pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list); - WARN_ONCE(!pool && zswap_has_pool, - "%s: no page storage pool!\n", __func__); + struct crypto_acomp_ctx *acomp_ctx; + struct scatterlist input, output; + int comp_ret = 0, alloc_ret = 0; + unsigned int dlen = PAGE_SIZE; + unsigned long handle; + struct zpool *zpool; + char *buf; + gfp_t gfp; + u8 *dst; - return pool; -} + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); -static struct zswap_pool *zswap_pool_current(void) -{ - assert_spin_locked(&zswap_pools_lock); + mutex_lock(&acomp_ctx->mutex); - return __zswap_pool_current(); -} + dst = acomp_ctx->buffer; + sg_init_table(&input, 1); + sg_set_page(&input, &folio->page, PAGE_SIZE, 0); -static struct zswap_pool *zswap_pool_current_get(void) -{ - struct zswap_pool *pool; + /* + * We need PAGE_SIZE * 2 here since there maybe over-compression case, + * and hardware-accelerators may won't check the dst buffer size, so + * giving the dst buffer with enough length to avoid buffer overflow. + */ + sg_init_one(&output, dst, PAGE_SIZE * 2); + acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); - rcu_read_lock(); + /* + * it maybe looks a little bit silly that we send an asynchronous request, + * then wait for its completion synchronously. This makes the process look + * synchronous in fact. + * Theoretically, acomp supports users send multiple acomp requests in one + * acomp instance, then get those requests done simultaneously. but in this + * case, zswap actually does store and load page by page, there is no + * existing method to send the second page before the first page is done + * in one thread doing zwap. + * but in different threads running on different cpu, we have different + * acomp instance, so multiple threads can do (de)compression in parallel. + */ + comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); + dlen = acomp_ctx->req->dlen; + if (comp_ret) + goto unlock; - pool = __zswap_pool_current(); - if (!zswap_pool_get(pool)) - pool = NULL; + zpool = zswap_find_zpool(entry); + gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; + if (zpool_malloc_support_movable(zpool)) + gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; + alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle); + if (alloc_ret) + goto unlock; - rcu_read_unlock(); + buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); + memcpy(buf, dst, dlen); + zpool_unmap_handle(zpool, handle); - return pool; + entry->handle = handle; + entry->length = dlen; + +unlock: + if (comp_ret == -ENOSPC || alloc_ret == -ENOSPC) + zswap_reject_compress_poor++; + else if (comp_ret) + zswap_reject_compress_fail++; + else if (alloc_ret) + zswap_reject_alloc_fail++; + + mutex_unlock(&acomp_ctx->mutex); + return comp_ret == 0 && alloc_ret == 0; } -static struct zswap_pool *zswap_pool_last_get(void) +static void zswap_decompress(struct zswap_entry *entry, struct page *page) { - struct zswap_pool *pool, *last = NULL; + struct zpool *zpool = zswap_find_zpool(entry); + struct scatterlist input, output; + struct crypto_acomp_ctx *acomp_ctx; + u8 *src; - rcu_read_lock(); + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + mutex_lock(&acomp_ctx->mutex); - list_for_each_entry_rcu(pool, &zswap_pools, list) - last = pool; - WARN_ONCE(!last && zswap_has_pool, - "%s: no page storage pool!\n", __func__); - if (!zswap_pool_get(last)) - last = NULL; + src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); + /* + * If zpool_map_handle is atomic, we cannot reliably utilize its mapped buffer + * to do crypto_acomp_decompress() which might sleep. In such cases, we must + * resort to copying the buffer to a temporary one. + * Meanwhile, zpool_map_handle() might return a non-linearly mapped buffer, + * such as a kmap address of high memory or even ever a vmap address. + * However, sg_init_one is only equipped to handle linearly mapped low memory. + * In such cases, we also must copy the buffer to a temporary and lowmem one. + */ + if ((acomp_ctx->is_sleepable && !zpool_can_sleep_mapped(zpool)) || + !virt_addr_valid(src)) { + memcpy(acomp_ctx->buffer, src, entry->length); + src = acomp_ctx->buffer; + zpool_unmap_handle(zpool, entry->handle); + } - rcu_read_unlock(); + sg_init_one(&input, src, entry->length); + sg_init_table(&output, 1); + sg_set_page(&output, page, PAGE_SIZE, 0); + acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE); + BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)); + BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); + mutex_unlock(&acomp_ctx->mutex); - return last; + if (src != acomp_ctx->buffer) + zpool_unmap_handle(zpool, entry->handle); } -/* type and compressor must be null-terminated */ -static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) +/********************************* +* writeback code +**********************************/ +/* + * Attempts to free an entry by adding a folio to the swap cache, + * decompressing the entry data into the folio, and issuing a + * bio write to write the folio back to the swap device. + * + * This can be thought of as a "resumed writeback" of the folio + * to the swap device. We are basically resuming the same swap + * writeback path that was intercepted with the zswap_store() + * in the first place. After the folio has been decompressed into + * the swap cache, the compressed version stored by zswap can be + * freed. + */ +static int zswap_writeback_entry(struct zswap_entry *entry, + swp_entry_t swpentry) { - struct zswap_pool *pool; + struct zswap_tree *tree; + struct folio *folio; + struct mempolicy *mpol; + bool folio_was_allocated; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + }; - assert_spin_locked(&zswap_pools_lock); + /* try to allocate swap cache folio */ + mpol = get_task_policy(current); + folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, + NO_INTERLEAVE_INDEX, &folio_was_allocated, true); + if (!folio) + return -ENOMEM; - list_for_each_entry_rcu(pool, &zswap_pools, list) { - if (strcmp(pool->tfm_name, compressor)) - continue; - /* all zpools share the same type */ - if (strcmp(zpool_get_type(pool->zpools[0]), type)) - continue; - /* if we can't get it, it's about to be destroyed */ - if (!zswap_pool_get(pool)) - continue; - return pool; + /* + * Found an existing folio, we raced with swapin or concurrent + * shrinker. We generally writeback cold folios from zswap, and + * swapin means the folio just became hot, so skip this folio. + * For unlikely concurrent shrinker case, it will be unlinked + * and freed when invalidated by the concurrent shrinker anyway. + */ + if (!folio_was_allocated) { + folio_put(folio); + return -EEXIST; } - return NULL; -} + /* + * folio is locked, and the swapcache is now secured against + * concurrent swapping to and from the slot, and concurrent + * swapoff so we can safely dereference the zswap tree here. + * Verify that the swap entry hasn't been invalidated and recycled + * behind our backs, to avoid overwriting a new swap folio with + * old compressed data. Only when this is successful can the entry + * be dereferenced. + */ + tree = swap_zswap_tree(swpentry); + spin_lock(&tree->lock); + if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) { + spin_unlock(&tree->lock); + delete_from_swap_cache(folio); + folio_unlock(folio); + folio_put(folio); + return -ENOMEM; + } -/* - * If the entry is still valid in the tree, drop the initial ref and remove it - * from the tree. This function must be called with an additional ref held, - * otherwise it may race with another invalidation freeing the entry. - */ -static void zswap_invalidate_entry(struct zswap_tree *tree, - struct zswap_entry *entry) -{ - if (zswap_rb_erase(&tree->rbroot, entry)) - zswap_entry_put(tree, entry); + /* Safe to deref entry after the entry is verified above. */ + zswap_rb_erase(&tree->rbroot, entry); + spin_unlock(&tree->lock); + + zswap_decompress(entry, &folio->page); + + count_vm_event(ZSWPWB); + if (entry->objcg) + count_objcg_event(entry->objcg, ZSWPWB); + + zswap_entry_free(entry); + + /* folio is up to date */ + folio_mark_uptodate(folio); + + /* move it to the tail of the inactive list after end_writeback */ + folio_set_reclaim(folio); + + /* start writeback */ + __swap_writepage(folio, &wbc); + folio_put(folio); + + return 0; } +/********************************* +* shrinker functions +**********************************/ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, spinlock_t *lock, void *arg) { struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); bool *encountered_page_in_swapcache = (bool *)arg; - struct zswap_tree *tree; - pgoff_t swpoffset; + swp_entry_t swpentry; enum lru_status ret = LRU_REMOVED_RETRY; int writeback_result; + /* + * As soon as we drop the LRU lock, the entry can be freed by + * a concurrent invalidation. This means the following: + * + * 1. We extract the swp_entry_t to the stack, allowing + * zswap_writeback_entry() to pin the swap entry and + * then validate the zwap entry against that swap entry's + * tree using pointer value comparison. Only when that + * is successful can the entry be dereferenced. + * + * 2. Usually, objects are taken off the LRU for reclaim. In + * this case this isn't possible, because if reclaim fails + * for whatever reason, we have no means of knowing if the + * entry is alive to put it back on the LRU. + * + * So rotate it before dropping the lock. If the entry is + * written back or invalidated, the free path will unlink + * it. For failures, rotation is the right thing as well. + * + * Temporary failures, where the same entry should be tried + * again immediately, almost never happen for this shrinker. + * We don't do any trylocking; -ENOMEM comes closest, + * but that's extremely rare and doesn't happen spuriously + * either. Don't bother distinguishing this case. + */ + list_move_tail(item, &l->list); + /* * Once the lru lock is dropped, the entry might get freed. The - * swpoffset is copied to the stack, and entry isn't deref'd again + * swpentry is copied to the stack, and entry isn't deref'd again * until the entry is verified to still be alive in the tree. */ - swpoffset = swp_offset(entry->swpentry); - tree = zswap_trees[swp_type(entry->swpentry)]; - list_lru_isolate(l, item); + swpentry = entry->swpentry; + /* * It's safe to drop the lock here because we return either * LRU_REMOVED_RETRY or LRU_RETRY. */ spin_unlock(lock); - /* Check for invalidate() race */ - spin_lock(&tree->lock); - if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) - goto unlock; - - /* Hold a reference to prevent a free during writeback */ - zswap_entry_get(entry); - spin_unlock(&tree->lock); - - writeback_result = zswap_writeback_entry(entry, tree); + writeback_result = zswap_writeback_entry(entry, swpentry); - spin_lock(&tree->lock); if (writeback_result) { zswap_reject_reclaim_fail++; - zswap_lru_putback(&entry->pool->list_lru, entry); ret = LRU_RETRY; /* @@ -909,570 +1260,209 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o * into the warmer region. We should terminate shrinking (if we're in the dynamic * shrinker context). */ - if (writeback_result == -EEXIST && encountered_page_in_swapcache) + if (writeback_result == -EEXIST && encountered_page_in_swapcache) { + ret = LRU_STOP; *encountered_page_in_swapcache = true; - - goto put_unlock; + } + } else { + zswap_written_back_pages++; } - zswap_written_back_pages++; - if (entry->objcg) - count_objcg_event(entry->objcg, ZSWPWB); - - count_vm_event(ZSWPWB); - /* - * Writeback started successfully, the page now belongs to the - * swapcache. Drop the entry from zswap - unless invalidate already - * took it out while we had the tree->lock released for IO. - */ - zswap_invalidate_entry(tree, entry); - -put_unlock: - /* Drop local reference */ - zswap_entry_put(tree, entry); -unlock: - spin_unlock(&tree->lock); spin_lock(lock); return ret; } -static int shrink_memcg(struct mem_cgroup *memcg) +static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, + struct shrink_control *sc) { - struct zswap_pool *pool; - int nid, shrunk = 0; + struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); + unsigned long shrink_ret, nr_protected, lru_size; + bool encountered_page_in_swapcache = false; - if (!mem_cgroup_zswap_writeback_enabled(memcg)) - return -EINVAL; + if (!zswap_shrinker_enabled || + !mem_cgroup_zswap_writeback_enabled(sc->memcg)) { + sc->nr_scanned = 0; + return SHRINK_STOP; + } + + nr_protected = + atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); + lru_size = list_lru_shrink_count(&zswap_list_lru, sc); /* - * Skip zombies because their LRUs are reparented and we would be - * reclaiming from the parent instead of the dead memcg. + * Abort if we are shrinking into the protected region. + * + * This short-circuiting is necessary because if we have too many multiple + * concurrent reclaimers getting the freeable zswap object counts at the + * same time (before any of them made reasonable progress), the total + * number of reclaimed objects might be more than the number of unprotected + * objects (i.e the reclaimers will reclaim into the protected area of the + * zswap LRU). */ - if (memcg && !mem_cgroup_online(memcg)) - return -ENOENT; + if (nr_protected >= lru_size - sc->nr_to_scan) { + sc->nr_scanned = 0; + return SHRINK_STOP; + } - pool = zswap_pool_current_get(); - if (!pool) - return -EINVAL; + shrink_ret = list_lru_shrink_walk(&zswap_list_lru, sc, &shrink_memcg_cb, + &encountered_page_in_swapcache); - for_each_node_state(nid, N_NORMAL_MEMORY) { - unsigned long nr_to_walk = 1; + if (encountered_page_in_swapcache) + return SHRINK_STOP; - shrunk += list_lru_walk_one(&pool->list_lru, nid, memcg, - &shrink_memcg_cb, NULL, &nr_to_walk); - } - zswap_pool_put(pool); - return shrunk ? 0 : -EAGAIN; + return shrink_ret ? shrink_ret : SHRINK_STOP; } -static void shrink_worker(struct work_struct *w) +static unsigned long zswap_shrinker_count(struct shrinker *shrinker, + struct shrink_control *sc) { - struct zswap_pool *pool = container_of(w, typeof(*pool), - shrink_work); - struct mem_cgroup *memcg; - int ret, failures = 0; + struct mem_cgroup *memcg = sc->memcg; + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid)); + unsigned long nr_backing, nr_stored, nr_freeable, nr_protected; - /* global reclaim will select cgroup in a round-robin fashion. */ - do { - spin_lock(&zswap_pools_lock); - pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); - memcg = pool->next_shrink; + if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg)) + return 0; - /* - * We need to retry if we have gone through a full round trip, or if we - * got an offline memcg (or else we risk undoing the effect of the - * zswap memcg offlining cleanup callback). This is not catastrophic - * per se, but it will keep the now offlined memcg hostage for a while. - * - * Note that if we got an online memcg, we will keep the extra - * reference in case the original reference obtained by mem_cgroup_iter - * is dropped by the zswap memcg offlining callback, ensuring that the - * memcg is not killed when we are reclaiming. - */ - if (!memcg) { - spin_unlock(&zswap_pools_lock); - if (++failures == MAX_RECLAIM_RETRIES) - break; + /* + * The shrinker resumes swap writeback, which will enter block + * and may enter fs. XXX: Harmonize with vmscan.c __GFP_FS + * rules (may_enter_fs()), which apply on a per-folio basis. + */ + if (!gfp_has_io_fs(sc->gfp_mask)) + return 0; - goto resched; - } - - if (!mem_cgroup_tryget_online(memcg)) { - /* drop the reference from mem_cgroup_iter() */ - mem_cgroup_iter_break(NULL, memcg); - pool->next_shrink = NULL; - spin_unlock(&zswap_pools_lock); - - if (++failures == MAX_RECLAIM_RETRIES) - break; - - goto resched; - } - spin_unlock(&zswap_pools_lock); - - ret = shrink_memcg(memcg); - /* drop the extra reference */ - mem_cgroup_put(memcg); - - if (ret == -EINVAL) - break; - if (ret && ++failures == MAX_RECLAIM_RETRIES) - break; - -resched: - cond_resched(); - } while (!zswap_can_accept()); - zswap_pool_put(pool); -} - -static struct zswap_pool *zswap_pool_create(char *type, char *compressor) -{ - int i; - struct zswap_pool *pool; - char name[38]; /* 'zswap' + 32 char (max) num + \0 */ - gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; - int ret; - - if (!zswap_has_pool) { - /* if either are unset, pool initialization failed, and we - * need both params to be set correctly before trying to - * create a pool. - */ - if (!strcmp(type, ZSWAP_PARAM_UNSET)) - return NULL; - if (!strcmp(compressor, ZSWAP_PARAM_UNSET)) - return NULL; - } - - pool = kzalloc(sizeof(*pool), GFP_KERNEL); - if (!pool) - return NULL; - - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) { - /* unique name for each pool specifically required by zsmalloc */ - snprintf(name, 38, "zswap%x", - atomic_inc_return(&zswap_pools_count)); - - pool->zpools[i] = zpool_create_pool(type, name, gfp); - if (!pool->zpools[i]) { - pr_err("%s zpool not available\n", type); - goto error; - } - } - pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0])); - - strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); - - pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx); - if (!pool->acomp_ctx) { - pr_err("percpu alloc failed\n"); - goto error; - } - - ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, - &pool->node); - if (ret) - goto error; - - zswap_alloc_shrinker(pool); - if (!pool->shrinker) - goto error; - - pr_debug("using %s compressor\n", pool->tfm_name); - - /* being the current pool takes 1 ref; this func expects the - * caller to always add the new pool as the current pool + /* + * For memcg, use the cgroup-wide ZSWAP stats since we don't + * have them per-node and thus per-lruvec. Careful if memcg is + * runtime-disabled: we can get sc->memcg == NULL, which is ok + * for the lruvec, but not for memcg_page_state(). + * + * Without memcg, use the zswap pool-wide metrics. */ - kref_init(&pool->kref); - INIT_LIST_HEAD(&pool->list); - if (list_lru_init_memcg(&pool->list_lru, pool->shrinker)) - goto lru_fail; - shrinker_register(pool->shrinker); - INIT_WORK(&pool->shrink_work, shrink_worker); - atomic_set(&pool->nr_stored, 0); - - zswap_pool_debug("created", pool); - - return pool; - -lru_fail: - list_lru_destroy(&pool->list_lru); - shrinker_free(pool->shrinker); -error: - if (pool->acomp_ctx) - free_percpu(pool->acomp_ctx); - while (i--) - zpool_destroy_pool(pool->zpools[i]); - kfree(pool); - return NULL; -} - -static struct zswap_pool *__zswap_pool_create_fallback(void) -{ - bool has_comp, has_zpool; - - has_comp = crypto_has_acomp(zswap_compressor, 0, 0); - if (!has_comp && strcmp(zswap_compressor, - CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) { - pr_err("compressor %s not available, using default %s\n", - zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT); - param_free_charp(&zswap_compressor); - zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; - has_comp = crypto_has_acomp(zswap_compressor, 0, 0); - } - if (!has_comp) { - pr_err("default compressor %s not available\n", - zswap_compressor); - param_free_charp(&zswap_compressor); - zswap_compressor = ZSWAP_PARAM_UNSET; - } - - has_zpool = zpool_has_pool(zswap_zpool_type); - if (!has_zpool && strcmp(zswap_zpool_type, - CONFIG_ZSWAP_ZPOOL_DEFAULT)) { - pr_err("zpool %s not available, using default %s\n", - zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT); - param_free_charp(&zswap_zpool_type); - zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; - has_zpool = zpool_has_pool(zswap_zpool_type); - } - if (!has_zpool) { - pr_err("default zpool %s not available\n", - zswap_zpool_type); - param_free_charp(&zswap_zpool_type); - zswap_zpool_type = ZSWAP_PARAM_UNSET; - } - - if (!has_comp || !has_zpool) - return NULL; - - return zswap_pool_create(zswap_zpool_type, zswap_compressor); -} - -static void zswap_pool_destroy(struct zswap_pool *pool) -{ - int i; - - zswap_pool_debug("destroying", pool); - - shrinker_free(pool->shrinker); - cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); - free_percpu(pool->acomp_ctx); - list_lru_destroy(&pool->list_lru); - - spin_lock(&zswap_pools_lock); - mem_cgroup_iter_break(NULL, pool->next_shrink); - pool->next_shrink = NULL; - spin_unlock(&zswap_pools_lock); - - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) - zpool_destroy_pool(pool->zpools[i]); - kfree(pool); -} - -static int __must_check zswap_pool_get(struct zswap_pool *pool) -{ - if (!pool) - return 0; - - return kref_get_unless_zero(&pool->kref); -} - -static void __zswap_pool_release(struct work_struct *work) -{ - struct zswap_pool *pool = container_of(work, typeof(*pool), - release_work); - - synchronize_rcu(); - - /* nobody should have been able to get a kref... */ - WARN_ON(kref_get_unless_zero(&pool->kref)); - - /* pool is now off zswap_pools list and has no references. */ - zswap_pool_destroy(pool); -} - -static void __zswap_pool_empty(struct kref *kref) -{ - struct zswap_pool *pool; - - pool = container_of(kref, typeof(*pool), kref); - - spin_lock(&zswap_pools_lock); - - WARN_ON(pool == zswap_pool_current()); - - list_del_rcu(&pool->list); - - INIT_WORK(&pool->release_work, __zswap_pool_release); - schedule_work(&pool->release_work); - - spin_unlock(&zswap_pools_lock); -} - -static void zswap_pool_put(struct zswap_pool *pool) -{ - kref_put(&pool->kref, __zswap_pool_empty); -} - -/********************************* -* param callbacks -**********************************/ - -static bool zswap_pool_changed(const char *s, const struct kernel_param *kp) -{ - /* no change required */ - if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool) - return false; - return true; -} - -/* val must be a null-terminated string */ -static int __zswap_param_set(const char *val, const struct kernel_param *kp, - char *type, char *compressor) -{ - struct zswap_pool *pool, *put_pool = NULL; - char *s = strstrip((char *)val); - int ret = 0; - bool new_pool = false; - - mutex_lock(&zswap_init_lock); - switch (zswap_init_state) { - case ZSWAP_UNINIT: - /* if this is load-time (pre-init) param setting, - * don't create a pool; that's done during init. - */ - ret = param_set_charp(s, kp); - break; - case ZSWAP_INIT_SUCCEED: - new_pool = zswap_pool_changed(s, kp); - break; - case ZSWAP_INIT_FAILED: - pr_err("can't set param, initialization failed\n"); - ret = -ENODEV; - } - mutex_unlock(&zswap_init_lock); - - /* no need to create a new pool, return directly */ - if (!new_pool) - return ret; - - if (!type) { - if (!zpool_has_pool(s)) { - pr_err("zpool %s not available\n", s); - return -ENOENT; - } - type = s; - } else if (!compressor) { - if (!crypto_has_acomp(s, 0, 0)) { - pr_err("compressor %s not available\n", s); - return -ENOENT; - } - compressor = s; + if (!mem_cgroup_disabled()) { + mem_cgroup_flush_stats(memcg); + nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; + nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); } else { - WARN_ON(1); - return -EINVAL; - } - - spin_lock(&zswap_pools_lock); - - pool = zswap_pool_find_get(type, compressor); - if (pool) { - zswap_pool_debug("using existing", pool); - WARN_ON(pool == zswap_pool_current()); - list_del_rcu(&pool->list); - } - - spin_unlock(&zswap_pools_lock); - - if (!pool) - pool = zswap_pool_create(type, compressor); - - if (pool) - ret = param_set_charp(s, kp); - else - ret = -EINVAL; - - spin_lock(&zswap_pools_lock); - - if (!ret) { - put_pool = zswap_pool_current(); - list_add_rcu(&pool->list, &zswap_pools); - zswap_has_pool = true; - } else if (pool) { - /* add the possibly pre-existing pool to the end of the pools - * list; if it's new (and empty) then it'll be removed and - * destroyed by the put after we drop the lock - */ - list_add_tail_rcu(&pool->list, &zswap_pools); - put_pool = pool; - } - - spin_unlock(&zswap_pools_lock); - - if (!zswap_has_pool && !pool) { - /* if initial pool creation failed, and this pool creation also - * failed, maybe both compressor and zpool params were bad. - * Allow changing this param, so pool creation will succeed - * when the other param is changed. We already verified this - * param is ok in the zpool_has_pool() or crypto_has_acomp() - * checks above. - */ - ret = param_set_charp(s, kp); + nr_backing = zswap_pool_total_size >> PAGE_SHIFT; + nr_stored = atomic_read(&zswap_nr_stored); } - /* drop the ref from either the old current pool, - * or the new pool we failed to add - */ - if (put_pool) - zswap_pool_put(put_pool); - - return ret; -} - -static int zswap_compressor_param_set(const char *val, - const struct kernel_param *kp) -{ - return __zswap_param_set(val, kp, zswap_zpool_type, NULL); -} - -static int zswap_zpool_param_set(const char *val, - const struct kernel_param *kp) -{ - return __zswap_param_set(val, kp, NULL, zswap_compressor); -} - -static int zswap_enabled_param_set(const char *val, - const struct kernel_param *kp) -{ - int ret = -ENODEV; - - /* if this is load-time (pre-init) param setting, only set param. */ - if (system_state != SYSTEM_RUNNING) - return param_set_bool(val, kp); + if (!nr_stored) + return 0; - mutex_lock(&zswap_init_lock); - switch (zswap_init_state) { - case ZSWAP_UNINIT: - if (zswap_setup()) - break; - fallthrough; - case ZSWAP_INIT_SUCCEED: - if (!zswap_has_pool) - pr_err("can't enable, no pool configured\n"); - else - ret = param_set_bool(val, kp); - break; - case ZSWAP_INIT_FAILED: - pr_err("can't enable, initialization failed\n"); - } - mutex_unlock(&zswap_init_lock); + nr_protected = + atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); + nr_freeable = list_lru_shrink_count(&zswap_list_lru, sc); + /* + * Subtract the lru size by an estimate of the number of pages + * that should be protected. + */ + nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0; - return ret; + /* + * Scale the number of freeable pages by the memory saving factor. + * This ensures that the better zswap compresses memory, the fewer + * pages we will evict to swap (as it will otherwise incur IO for + * relatively small memory saving). + */ + return mult_frac(nr_freeable, nr_backing, nr_stored); } -static void __zswap_load(struct zswap_entry *entry, struct page *page) +static struct shrinker *zswap_alloc_shrinker(void) { - struct zpool *zpool = zswap_find_zpool(entry); - struct scatterlist input, output; - struct crypto_acomp_ctx *acomp_ctx; - u8 *src; - - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - mutex_lock(&acomp_ctx->mutex); - - src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); - if (!zpool_can_sleep_mapped(zpool)) { - memcpy(acomp_ctx->buffer, src, entry->length); - src = acomp_ctx->buffer; - zpool_unmap_handle(zpool, entry->handle); - } + struct shrinker *shrinker; - sg_init_one(&input, src, entry->length); - sg_init_table(&output, 1); - sg_set_page(&output, page, PAGE_SIZE, 0); - acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE); - BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)); - BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); - mutex_unlock(&acomp_ctx->mutex); + shrinker = + shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap"); + if (!shrinker) + return NULL; - if (zpool_can_sleep_mapped(zpool)) - zpool_unmap_handle(zpool, entry->handle); + shrinker->scan_objects = zswap_shrinker_scan; + shrinker->count_objects = zswap_shrinker_count; + shrinker->batch = 0; + shrinker->seeks = DEFAULT_SEEKS; + return shrinker; } -/********************************* -* writeback code -**********************************/ -/* - * Attempts to free an entry by adding a folio to the swap cache, - * decompressing the entry data into the folio, and issuing a - * bio write to write the folio back to the swap device. - * - * This can be thought of as a "resumed writeback" of the folio - * to the swap device. We are basically resuming the same swap - * writeback path that was intercepted with the zswap_store() - * in the first place. After the folio has been decompressed into - * the swap cache, the compressed version stored by zswap can be - * freed. - */ -static int zswap_writeback_entry(struct zswap_entry *entry, - struct zswap_tree *tree) +static int shrink_memcg(struct mem_cgroup *memcg) { - swp_entry_t swpentry = entry->swpentry; - struct folio *folio; - struct mempolicy *mpol; - bool folio_was_allocated; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - }; + int nid, shrunk = 0; - /* try to allocate swap cache folio */ - mpol = get_task_policy(current); - folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, - NO_INTERLEAVE_INDEX, &folio_was_allocated, true); - if (!folio) - return -ENOMEM; + if (!mem_cgroup_zswap_writeback_enabled(memcg)) + return -EINVAL; /* - * Found an existing folio, we raced with load/swapin. We generally - * writeback cold folios from zswap, and swapin means the folio just - * became hot. Skip this folio and let the caller find another one. + * Skip zombies because their LRUs are reparented and we would be + * reclaiming from the parent instead of the dead memcg. */ - if (!folio_was_allocated) { - folio_put(folio); - return -EEXIST; - } + if (memcg && !mem_cgroup_online(memcg)) + return -ENOENT; - /* - * folio is locked, and the swapcache is now secured against - * concurrent swapping to and from the slot. Verify that the - * swap entry hasn't been invalidated and recycled behind our - * backs (our zswap_entry reference doesn't prevent that), to - * avoid overwriting a new swap folio with old compressed data. - */ - spin_lock(&tree->lock); - if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) { - spin_unlock(&tree->lock); - delete_from_swap_cache(folio); - folio_unlock(folio); - folio_put(folio); - return -ENOMEM; + for_each_node_state(nid, N_NORMAL_MEMORY) { + unsigned long nr_to_walk = 1; + + shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg, + &shrink_memcg_cb, NULL, &nr_to_walk); } - spin_unlock(&tree->lock); + return shrunk ? 0 : -EAGAIN; +} + +static void shrink_worker(struct work_struct *w) +{ + struct mem_cgroup *memcg; + int ret, failures = 0; - __zswap_load(entry, &folio->page); + /* global reclaim will select cgroup in a round-robin fashion. */ + do { + spin_lock(&zswap_shrink_lock); + zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); + memcg = zswap_next_shrink; - /* folio is up to date */ - folio_mark_uptodate(folio); + /* + * We need to retry if we have gone through a full round trip, or if we + * got an offline memcg (or else we risk undoing the effect of the + * zswap memcg offlining cleanup callback). This is not catastrophic + * per se, but it will keep the now offlined memcg hostage for a while. + * + * Note that if we got an online memcg, we will keep the extra + * reference in case the original reference obtained by mem_cgroup_iter + * is dropped by the zswap memcg offlining callback, ensuring that the + * memcg is not killed when we are reclaiming. + */ + if (!memcg) { + spin_unlock(&zswap_shrink_lock); + if (++failures == MAX_RECLAIM_RETRIES) + break; - /* move it to the tail of the inactive list after end_writeback */ - folio_set_reclaim(folio); + goto resched; + } - /* start writeback */ - __swap_writepage(folio, &wbc); - folio_put(folio); + if (!mem_cgroup_tryget_online(memcg)) { + /* drop the reference from mem_cgroup_iter() */ + mem_cgroup_iter_break(NULL, memcg); + zswap_next_shrink = NULL; + spin_unlock(&zswap_shrink_lock); - return 0; + if (++failures == MAX_RECLAIM_RETRIES) + break; + + goto resched; + } + spin_unlock(&zswap_shrink_lock); + + ret = shrink_memcg(memcg); + /* drop the extra reference */ + mem_cgroup_put(memcg); + + if (ret == -EINVAL) + break; + if (ret && ++failures == MAX_RECLAIM_RETRIES) + break; + +resched: + cond_resched(); + } while (!zswap_can_accept()); } static int zswap_is_page_same_filled(void *ptr, unsigned long *value) @@ -1508,23 +1498,11 @@ static void zswap_fill_page(void *ptr, unsigned long value) bool zswap_store(struct folio *folio) { swp_entry_t swp = folio->swap; - int type = swp_type(swp); pgoff_t offset = swp_offset(swp); - struct page *page = &folio->page; - struct zswap_tree *tree = zswap_trees[type]; + struct zswap_tree *tree = swap_zswap_tree(swp); struct zswap_entry *entry, *dupentry; - struct scatterlist input, output; - struct crypto_acomp_ctx *acomp_ctx; struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; - struct zswap_pool *pool; - struct zpool *zpool; - unsigned int dlen = PAGE_SIZE; - unsigned long handle, value; - char *buf; - u8 *src, *dst; - gfp_t gfp; - int ret; VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); @@ -1533,24 +1511,8 @@ bool zswap_store(struct folio *folio) if (folio_test_large(folio)) return false; - if (!tree) - return false; - - /* - * If this is a duplicate, it must be removed before attempting to store - * it, otherwise, if the store fails the old page won't be removed from - * the tree, and it might be written back overriding the new data. - */ - spin_lock(&tree->lock); - dupentry = zswap_rb_search(&tree->rbroot, offset); - if (dupentry) { - zswap_duplicate_entry++; - zswap_invalidate_entry(tree, dupentry); - } - spin_unlock(&tree->lock); - if (!zswap_enabled) - return false; + goto check_old; objcg = get_obj_cgroup_from_folio(folio); if (objcg && !obj_cgroup_may_zswap(objcg)) { @@ -1577,17 +1539,19 @@ bool zswap_store(struct folio *folio) } /* allocate entry */ - entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); + entry = zswap_entry_cache_alloc(GFP_KERNEL, folio_nid(folio)); if (!entry) { zswap_reject_kmemcache_fail++; goto reject; } if (zswap_same_filled_pages_enabled) { - src = kmap_local_page(page); + unsigned long value; + u8 *src; + + src = kmap_local_folio(folio, 0); if (zswap_is_page_same_filled(src, &value)) { kunmap_local(src); - entry->swpentry = swp_entry(type, offset); entry->length = 0; entry->value = value; atomic_inc(&zswap_same_filled_pages); @@ -1606,74 +1570,18 @@ bool zswap_store(struct folio *folio) if (objcg) { memcg = get_mem_cgroup_from_objcg(objcg); - if (memcg_list_lru_alloc(memcg, &entry->pool->list_lru, GFP_KERNEL)) { + if (memcg_list_lru_alloc(memcg, &zswap_list_lru, GFP_KERNEL)) { mem_cgroup_put(memcg); goto put_pool; } mem_cgroup_put(memcg); } - /* compress */ - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - - mutex_lock(&acomp_ctx->mutex); - - dst = acomp_ctx->buffer; - sg_init_table(&input, 1); - sg_set_page(&input, &folio->page, PAGE_SIZE, 0); - - /* - * We need PAGE_SIZE * 2 here since there maybe over-compression case, - * and hardware-accelerators may won't check the dst buffer size, so - * giving the dst buffer with enough length to avoid buffer overflow. - */ - sg_init_one(&output, dst, PAGE_SIZE * 2); - acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); - /* - * it maybe looks a little bit silly that we send an asynchronous request, - * then wait for its completion synchronously. This makes the process look - * synchronous in fact. - * Theoretically, acomp supports users send multiple acomp requests in one - * acomp instance, then get those requests done simultaneously. but in this - * case, zswap actually does store and load page by page, there is no - * existing method to send the second page before the first page is done - * in one thread doing zwap. - * but in different threads running on different cpu, we have different - * acomp instance, so multiple threads can do (de)compression in parallel. - */ - ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); - dlen = acomp_ctx->req->dlen; - - if (ret) { - zswap_reject_compress_fail++; - goto put_dstmem; - } - - /* store */ - zpool = zswap_find_zpool(entry); - gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; - if (zpool_malloc_support_movable(zpool)) - gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; - ret = zpool_malloc(zpool, dlen, gfp, &handle); - if (ret == -ENOSPC) { - zswap_reject_compress_poor++; - goto put_dstmem; - } - if (ret) { - zswap_reject_alloc_fail++; - goto put_dstmem; - } - buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); - memcpy(buf, dst, dlen); - zpool_unmap_handle(zpool, handle); - mutex_unlock(&acomp_ctx->mutex); - - /* populate entry */ - entry->swpentry = swp_entry(type, offset); - entry->handle = handle; - entry->length = dlen; + if (!zswap_compress(folio, entry)) + goto put_pool; insert_entry: + entry->swpentry = swp; entry->objcg = objcg; if (objcg) { obj_cgroup_charge_zswap(objcg, entry->length); @@ -1684,20 +1592,17 @@ insert_entry: /* map */ spin_lock(&tree->lock); /* - * A duplicate entry should have been removed at the beginning of this - * function. Since the swap entry should be pinned, if a duplicate is - * found again here it means that something went wrong in the swap - * cache. + * The folio may have been dirtied again, invalidate the + * possibly stale entry before inserting the new entry. */ - while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { - WARN_ON(1); - zswap_duplicate_entry++; + if (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { zswap_invalidate_entry(tree, dupentry); + WARN_ON(zswap_rb_insert(&tree->rbroot, entry, &dupentry)); } if (entry->length) { INIT_LIST_HEAD(&entry->lru); - zswap_lru_add(&entry->pool->list_lru, entry); - atomic_inc(&entry->pool->nr_stored); + zswap_lru_add(&zswap_list_lru, entry); + atomic_inc(&zswap_nr_stored); } spin_unlock(&tree->lock); @@ -1708,8 +1613,6 @@ insert_entry: return true; -put_dstmem: - mutex_unlock(&acomp_ctx->mutex); put_pool: zswap_pool_put(entry->pool); freepage: @@ -1717,38 +1620,60 @@ freepage: reject: if (objcg) obj_cgroup_put(objcg); +check_old: + /* + * If the zswap store fails or zswap is disabled, we must invalidate the + * possibly stale entry which was previously stored at this offset. + * Otherwise, writeback could overwrite the new data in the swapfile. + */ + spin_lock(&tree->lock); + entry = zswap_rb_search(&tree->rbroot, offset); + if (entry) + zswap_invalidate_entry(tree, entry); + spin_unlock(&tree->lock); return false; shrink: - pool = zswap_pool_last_get(); - if (pool && !queue_work(shrink_wq, &pool->shrink_work)) - zswap_pool_put(pool); + queue_work(shrink_wq, &zswap_shrink_work); goto reject; } bool zswap_load(struct folio *folio) { swp_entry_t swp = folio->swap; - int type = swp_type(swp); pgoff_t offset = swp_offset(swp); struct page *page = &folio->page; - struct zswap_tree *tree = zswap_trees[type]; + bool swapcache = folio_test_swapcache(folio); + struct zswap_tree *tree = swap_zswap_tree(swp); struct zswap_entry *entry; u8 *dst; VM_WARN_ON_ONCE(!folio_test_locked(folio)); - /* find */ spin_lock(&tree->lock); - entry = zswap_entry_find_get(&tree->rbroot, offset); + entry = zswap_rb_search(&tree->rbroot, offset); if (!entry) { spin_unlock(&tree->lock); return false; } + /* + * When reading into the swapcache, invalidate our entry. The + * swapcache can be the authoritative owner of the page and + * its mappings, and the pressure that results from having two + * in-memory copies outweighs any benefits of caching the + * compression work. + * + * (Most swapins go through the swapcache. The notable + * exception is the singleton fault on SWP_SYNCHRONOUS_IO + * files, which reads into a private page and may free it if + * the fault fails. We remain the primary owner of the entry.) + */ + if (swapcache) + zswap_rb_erase(&tree->rbroot, entry); spin_unlock(&tree->lock); if (entry->length) - __zswap_load(entry, page); + zswap_decompress(entry, page); else { dst = kmap_local_page(page); zswap_fill_page(dst, entry->value); @@ -1759,67 +1684,64 @@ bool zswap_load(struct folio *folio) if (entry->objcg) count_objcg_event(entry->objcg, ZSWPIN); - spin_lock(&tree->lock); - if (zswap_exclusive_loads_enabled) { - zswap_invalidate_entry(tree, entry); + if (swapcache) { + zswap_entry_free(entry); folio_mark_dirty(folio); - } else if (entry->length) { - zswap_lru_del(&entry->pool->list_lru, entry); - zswap_lru_add(&entry->pool->list_lru, entry); } - zswap_entry_put(tree, entry); - spin_unlock(&tree->lock); return true; } -void zswap_invalidate(int type, pgoff_t offset) +void zswap_invalidate(swp_entry_t swp) { - struct zswap_tree *tree = zswap_trees[type]; + pgoff_t offset = swp_offset(swp); + struct zswap_tree *tree = swap_zswap_tree(swp); struct zswap_entry *entry; - /* find */ spin_lock(&tree->lock); entry = zswap_rb_search(&tree->rbroot, offset); - if (!entry) { - /* entry was written back */ - spin_unlock(&tree->lock); - return; - } - zswap_invalidate_entry(tree, entry); + if (entry) + zswap_invalidate_entry(tree, entry); spin_unlock(&tree->lock); } -void zswap_swapon(int type) +int zswap_swapon(int type, unsigned long nr_pages) { - struct zswap_tree *tree; + struct zswap_tree *trees, *tree; + unsigned int nr, i; - tree = kzalloc(sizeof(*tree), GFP_KERNEL); - if (!tree) { + nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); + trees = kvcalloc(nr, sizeof(*tree), GFP_KERNEL); + if (!trees) { pr_err("alloc failed, zswap disabled for swap type %d\n", type); - return; + return -ENOMEM; + } + + for (i = 0; i < nr; i++) { + tree = trees + i; + tree->rbroot = RB_ROOT; + spin_lock_init(&tree->lock); } - tree->rbroot = RB_ROOT; - spin_lock_init(&tree->lock); - zswap_trees[type] = tree; + nr_zswap_trees[type] = nr; + zswap_trees[type] = trees; + return 0; } void zswap_swapoff(int type) { - struct zswap_tree *tree = zswap_trees[type]; - struct zswap_entry *entry, *n; + struct zswap_tree *trees = zswap_trees[type]; + unsigned int i; - if (!tree) + if (!trees) return; - /* walk the tree and free everything */ - spin_lock(&tree->lock); - rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) - zswap_free_entry(entry); - tree->rbroot = RB_ROOT; - spin_unlock(&tree->lock); - kfree(tree); + /* try_to_unuse() invalidated all the entries already */ + for (i = 0; i < nr_zswap_trees[type]; i++) + WARN_ON_ONCE(!RB_EMPTY_ROOT(&trees[i].rbroot)); + + kvfree(trees); + nr_zswap_trees[type] = 0; zswap_trees[type] = NULL; } @@ -1852,8 +1774,6 @@ static int zswap_debugfs_init(void) zswap_debugfs_root, &zswap_reject_compress_poor); debugfs_create_u64("written_back_pages", 0444, zswap_debugfs_root, &zswap_written_back_pages); - debugfs_create_u64("duplicate_entry", 0444, - zswap_debugfs_root, &zswap_duplicate_entry); debugfs_create_u64("pool_total_size", 0444, zswap_debugfs_root, &zswap_pool_total_size); debugfs_create_atomic_t("stored_pages", 0444, @@ -1891,6 +1811,20 @@ static int zswap_setup(void) if (ret) goto hp_fail; + shrink_wq = alloc_workqueue("zswap-shrink", + WQ_UNBOUND|WQ_MEM_RECLAIM, 1); + if (!shrink_wq) + goto shrink_wq_fail; + + zswap_shrinker = zswap_alloc_shrinker(); + if (!zswap_shrinker) + goto shrinker_fail; + if (list_lru_init_memcg(&zswap_list_lru, zswap_shrinker)) + goto lru_fail; + shrinker_register(zswap_shrinker); + + INIT_WORK(&zswap_shrink_work, shrink_worker); + pool = __zswap_pool_create_fallback(); if (pool) { pr_info("loaded using pool %s/%s\n", pool->tfm_name, @@ -1902,18 +1836,17 @@ static int zswap_setup(void) zswap_enabled = false; } - shrink_wq = create_workqueue("zswap-shrink"); - if (!shrink_wq) - goto fallback_fail; - if (zswap_debugfs_init()) pr_warn("debugfs initialization failed\n"); zswap_init_state = ZSWAP_INIT_SUCCEED; return 0; -fallback_fail: - if (pool) - zswap_pool_destroy(pool); +lru_fail: + shrinker_free(zswap_shrinker); +shrinker_fail: + destroy_workqueue(shrink_wq); +shrink_wq_fail: + cpuhp_remove_multi_state(CPUHP_MM_ZSWP_POOL_PREPARE); hp_fail: kmem_cache_destroy(zswap_entry_cache); cache_fail: -- cgit v1.2.3