diff options
Diffstat (limited to 'mm/zswap.c')
-rw-r--r-- | mm/zswap.c | 1831 |
1 files changed, 882 insertions, 949 deletions
diff --git a/mm/zswap.c b/mm/zswap.c index 69766f2c5a..6f8850c44b 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -71,8 +71,6 @@ static u64 zswap_reject_compress_poor; static u64 zswap_reject_alloc_fail; /* Store failed because the entry metadata could not be allocated (rare) */ static u64 zswap_reject_kmemcache_fail; -/* Duplicate store was encountered (rare) */ -static u64 zswap_duplicate_entry; /* Shrinker work queue */ static struct workqueue_struct *shrink_wq; @@ -141,10 +139,6 @@ static bool zswap_non_same_filled_pages_enabled = true; module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled, bool, 0644); -static bool zswap_exclusive_loads_enabled = IS_ENABLED( - CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON); -module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644); - /* Number of zpools in zswap_pool (empirically determined for scalability) */ #define ZSWAP_NR_ZPOOLS 32 @@ -168,6 +162,7 @@ struct crypto_acomp_ctx { struct crypto_wait wait; u8 *buffer; struct mutex mutex; + bool is_sleepable; }; /* @@ -179,18 +174,24 @@ struct crypto_acomp_ctx { struct zswap_pool { struct zpool *zpools[ZSWAP_NR_ZPOOLS]; struct crypto_acomp_ctx __percpu *acomp_ctx; - struct kref kref; + struct percpu_ref ref; struct list_head list; struct work_struct release_work; - struct work_struct shrink_work; struct hlist_node node; char tfm_name[CRYPTO_MAX_ALG_NAME]; - struct list_lru list_lru; - struct mem_cgroup *next_shrink; - struct shrinker *shrinker; - atomic_t nr_stored; }; +/* Global LRU lists shared by all zswap pools. */ +static struct list_lru zswap_list_lru; +/* counter of pages stored in all zswap pools. */ +static atomic_t zswap_nr_stored = ATOMIC_INIT(0); + +/* The lock protects zswap_next_shrink updates. */ +static DEFINE_SPINLOCK(zswap_shrink_lock); +static struct mem_cgroup *zswap_next_shrink; +static struct work_struct zswap_shrink_work; +static struct shrinker *zswap_shrinker; + /* * struct zswap_entry * @@ -199,12 +200,6 @@ struct zswap_pool { * * rbnode - links the entry into red-black tree for the appropriate swap type * swpentry - associated swap entry, the offset indexes into the red-black tree - * refcount - the number of outstanding reference to the entry. This is needed - * to protect against premature freeing of the entry by code - * concurrent calls to load, invalidate, and writeback. The lock - * for the zswap_tree structure that contains the entry must - * be held while changing the refcount. Since the lock must - * be held, there is no reason to also make refcount atomic. * length - the length in bytes of the compressed page data. Needed during * decompression. For a same value filled page length is 0, and both * pool and lru are invalid and must be ignored. @@ -217,7 +212,6 @@ struct zswap_pool { struct zswap_entry { struct rb_node rbnode; swp_entry_t swpentry; - int refcount; unsigned int length; struct zswap_pool *pool; union { @@ -228,17 +222,13 @@ struct zswap_entry { struct list_head lru; }; -/* - * The tree lock in the zswap_tree struct protects a few things: - * - the rbtree - * - the refcount field of each entry in the tree - */ struct zswap_tree { struct rb_root rbroot; spinlock_t lock; }; static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; +static unsigned int nr_zswap_trees[MAX_SWAPFILES]; /* RCU-protected iteration */ static LIST_HEAD(zswap_pools); @@ -265,15 +255,16 @@ static bool zswap_has_pool; * helpers and fwd declarations **********************************/ +static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp) +{ + return &zswap_trees[swp_type(swp)][swp_offset(swp) + >> SWAP_ADDRESS_SPACE_SHIFT]; +} + #define zswap_pool_debug(msg, p) \ pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ zpool_get_type((p)->zpools[0])) -static int zswap_writeback_entry(struct zswap_entry *entry, - struct zswap_tree *tree); -static int zswap_pool_get(struct zswap_pool *pool); -static void zswap_pool_put(struct zswap_pool *pool); - static bool zswap_is_full(void) { return totalram_pages() * zswap_max_pool_percent / 100 < @@ -313,717 +304,10 @@ static void zswap_update_total_size(void) zswap_pool_total_size = total; } -/* should be called under RCU */ -#ifdef CONFIG_MEMCG -static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) -{ - return entry->objcg ? obj_cgroup_memcg(entry->objcg) : NULL; -} -#else -static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) -{ - return NULL; -} -#endif - -static inline int entry_to_nid(struct zswap_entry *entry) -{ - return page_to_nid(virt_to_page(entry)); -} - -void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) -{ - struct zswap_pool *pool; - - /* lock out zswap pools list modification */ - spin_lock(&zswap_pools_lock); - list_for_each_entry(pool, &zswap_pools, list) { - if (pool->next_shrink == memcg) - pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); - } - spin_unlock(&zswap_pools_lock); -} - -/********************************* -* zswap entry functions -**********************************/ -static struct kmem_cache *zswap_entry_cache; - -static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) -{ - struct zswap_entry *entry; - entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); - if (!entry) - return NULL; - entry->refcount = 1; - RB_CLEAR_NODE(&entry->rbnode); - return entry; -} - -static void zswap_entry_cache_free(struct zswap_entry *entry) -{ - kmem_cache_free(zswap_entry_cache, entry); -} - -/********************************* -* zswap lruvec functions -**********************************/ -void zswap_lruvec_state_init(struct lruvec *lruvec) -{ - atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); -} - -void zswap_folio_swapin(struct folio *folio) -{ - struct lruvec *lruvec; - - VM_WARN_ON_ONCE(!folio_test_locked(folio)); - lruvec = folio_lruvec(folio); - atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); -} - -/********************************* -* lru functions -**********************************/ -static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) -{ - atomic_long_t *nr_zswap_protected; - unsigned long lru_size, old, new; - int nid = entry_to_nid(entry); - struct mem_cgroup *memcg; - struct lruvec *lruvec; - - /* - * Note that it is safe to use rcu_read_lock() here, even in the face of - * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection - * used in list_lru lookup, only two scenarios are possible: - * - * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The - * new entry will be reparented to memcg's parent's list_lru. - * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The - * new entry will be added directly to memcg's parent's list_lru. - * - * Similar reasoning holds for list_lru_del() and list_lru_putback(). - */ - rcu_read_lock(); - memcg = mem_cgroup_from_entry(entry); - /* will always succeed */ - list_lru_add(list_lru, &entry->lru, nid, memcg); - - /* Update the protection area */ - lru_size = list_lru_count_one(list_lru, nid, memcg); - lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); - nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected; - old = atomic_long_inc_return(nr_zswap_protected); - /* - * Decay to avoid overflow and adapt to changing workloads. - * This is based on LRU reclaim cost decaying heuristics. - */ - do { - new = old > lru_size / 4 ? old / 2 : old; - } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new)); - rcu_read_unlock(); -} - -static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) -{ - int nid = entry_to_nid(entry); - struct mem_cgroup *memcg; - - rcu_read_lock(); - memcg = mem_cgroup_from_entry(entry); - /* will always succeed */ - list_lru_del(list_lru, &entry->lru, nid, memcg); - rcu_read_unlock(); -} - -static void zswap_lru_putback(struct list_lru *list_lru, - struct zswap_entry *entry) -{ - int nid = entry_to_nid(entry); - spinlock_t *lock = &list_lru->node[nid].lock; - struct mem_cgroup *memcg; - struct lruvec *lruvec; - - rcu_read_lock(); - memcg = mem_cgroup_from_entry(entry); - spin_lock(lock); - /* we cannot use list_lru_add here, because it increments node's lru count */ - list_lru_putback(list_lru, &entry->lru, nid, memcg); - spin_unlock(lock); - - lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(entry_to_nid(entry))); - /* increment the protection area to account for the LRU rotation. */ - atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); - rcu_read_unlock(); -} - -/********************************* -* rbtree functions -**********************************/ -static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) -{ - struct rb_node *node = root->rb_node; - struct zswap_entry *entry; - pgoff_t entry_offset; - - while (node) { - entry = rb_entry(node, struct zswap_entry, rbnode); - entry_offset = swp_offset(entry->swpentry); - if (entry_offset > offset) - node = node->rb_left; - else if (entry_offset < offset) - node = node->rb_right; - else - return entry; - } - return NULL; -} - -/* - * In the case that a entry with the same offset is found, a pointer to - * the existing entry is stored in dupentry and the function returns -EEXIST - */ -static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, - struct zswap_entry **dupentry) -{ - struct rb_node **link = &root->rb_node, *parent = NULL; - struct zswap_entry *myentry; - pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry); - - while (*link) { - parent = *link; - myentry = rb_entry(parent, struct zswap_entry, rbnode); - myentry_offset = swp_offset(myentry->swpentry); - if (myentry_offset > entry_offset) - link = &(*link)->rb_left; - else if (myentry_offset < entry_offset) - link = &(*link)->rb_right; - else { - *dupentry = myentry; - return -EEXIST; - } - } - rb_link_node(&entry->rbnode, parent, link); - rb_insert_color(&entry->rbnode, root); - return 0; -} - -static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) -{ - if (!RB_EMPTY_NODE(&entry->rbnode)) { - rb_erase(&entry->rbnode, root); - RB_CLEAR_NODE(&entry->rbnode); - return true; - } - return false; -} - -static struct zpool *zswap_find_zpool(struct zswap_entry *entry) -{ - int i = 0; - - if (ZSWAP_NR_ZPOOLS > 1) - i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS)); - - return entry->pool->zpools[i]; -} - -/* - * Carries out the common pattern of freeing and entry's zpool allocation, - * freeing the entry itself, and decrementing the number of stored pages. - */ -static void zswap_free_entry(struct zswap_entry *entry) -{ - if (!entry->length) - atomic_dec(&zswap_same_filled_pages); - else { - zswap_lru_del(&entry->pool->list_lru, entry); - zpool_free(zswap_find_zpool(entry), entry->handle); - atomic_dec(&entry->pool->nr_stored); - zswap_pool_put(entry->pool); - } - if (entry->objcg) { - obj_cgroup_uncharge_zswap(entry->objcg, entry->length); - obj_cgroup_put(entry->objcg); - } - zswap_entry_cache_free(entry); - atomic_dec(&zswap_stored_pages); - zswap_update_total_size(); -} - -/* caller must hold the tree lock */ -static void zswap_entry_get(struct zswap_entry *entry) -{ - entry->refcount++; -} - -/* caller must hold the tree lock -* remove from the tree and free it, if nobody reference the entry -*/ -static void zswap_entry_put(struct zswap_tree *tree, - struct zswap_entry *entry) -{ - int refcount = --entry->refcount; - - WARN_ON_ONCE(refcount < 0); - if (refcount == 0) { - WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode)); - zswap_free_entry(entry); - } -} - -/* caller must hold the tree lock */ -static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, - pgoff_t offset) -{ - struct zswap_entry *entry; - - entry = zswap_rb_search(root, offset); - if (entry) - zswap_entry_get(entry); - - return entry; -} - -/********************************* -* shrinker functions -**********************************/ -static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, - spinlock_t *lock, void *arg); - -static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, - struct shrink_control *sc) -{ - struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); - unsigned long shrink_ret, nr_protected, lru_size; - struct zswap_pool *pool = shrinker->private_data; - bool encountered_page_in_swapcache = false; - - if (!zswap_shrinker_enabled || - !mem_cgroup_zswap_writeback_enabled(sc->memcg)) { - sc->nr_scanned = 0; - return SHRINK_STOP; - } - - nr_protected = - atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); - lru_size = list_lru_shrink_count(&pool->list_lru, sc); - - /* - * Abort if we are shrinking into the protected region. - * - * This short-circuiting is necessary because if we have too many multiple - * concurrent reclaimers getting the freeable zswap object counts at the - * same time (before any of them made reasonable progress), the total - * number of reclaimed objects might be more than the number of unprotected - * objects (i.e the reclaimers will reclaim into the protected area of the - * zswap LRU). - */ - if (nr_protected >= lru_size - sc->nr_to_scan) { - sc->nr_scanned = 0; - return SHRINK_STOP; - } - - shrink_ret = list_lru_shrink_walk(&pool->list_lru, sc, &shrink_memcg_cb, - &encountered_page_in_swapcache); - - if (encountered_page_in_swapcache) - return SHRINK_STOP; - - return shrink_ret ? shrink_ret : SHRINK_STOP; -} - -static unsigned long zswap_shrinker_count(struct shrinker *shrinker, - struct shrink_control *sc) -{ - struct zswap_pool *pool = shrinker->private_data; - struct mem_cgroup *memcg = sc->memcg; - struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid)); - unsigned long nr_backing, nr_stored, nr_freeable, nr_protected; - - if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg)) - return 0; - - /* - * The shrinker resumes swap writeback, which will enter block - * and may enter fs. XXX: Harmonize with vmscan.c __GFP_FS - * rules (may_enter_fs()), which apply on a per-folio basis. - */ - if (!gfp_has_io_fs(sc->gfp_mask)) - return 0; - - /* - * For memcg, use the cgroup-wide ZSWAP stats since we don't - * have them per-node and thus per-lruvec. Careful if memcg is - * runtime-disabled: we can get sc->memcg == NULL, which is ok - * for the lruvec, but not for memcg_page_state(). - * - * Without memcg, use the zswap pool-wide metrics. - */ - if (!mem_cgroup_disabled()) { - mem_cgroup_flush_stats(memcg); - nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; - nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); - } else { - nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT; - nr_stored = atomic_read(&pool->nr_stored); - } - - if (!nr_stored) - return 0; - - nr_protected = - atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); - nr_freeable = list_lru_shrink_count(&pool->list_lru, sc); - /* - * Subtract the lru size by an estimate of the number of pages - * that should be protected. - */ - nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0; - - /* - * Scale the number of freeable pages by the memory saving factor. - * This ensures that the better zswap compresses memory, the fewer - * pages we will evict to swap (as it will otherwise incur IO for - * relatively small memory saving). - */ - return mult_frac(nr_freeable, nr_backing, nr_stored); -} - -static void zswap_alloc_shrinker(struct zswap_pool *pool) -{ - pool->shrinker = - shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap"); - if (!pool->shrinker) - return; - - pool->shrinker->private_data = pool; - pool->shrinker->scan_objects = zswap_shrinker_scan; - pool->shrinker->count_objects = zswap_shrinker_count; - pool->shrinker->batch = 0; - pool->shrinker->seeks = DEFAULT_SEEKS; -} - -/********************************* -* per-cpu code -**********************************/ -static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) -{ - struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); - struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); - struct crypto_acomp *acomp; - struct acomp_req *req; - int ret; - - mutex_init(&acomp_ctx->mutex); - - acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); - if (!acomp_ctx->buffer) - return -ENOMEM; - - acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); - if (IS_ERR(acomp)) { - pr_err("could not alloc crypto acomp %s : %ld\n", - pool->tfm_name, PTR_ERR(acomp)); - ret = PTR_ERR(acomp); - goto acomp_fail; - } - acomp_ctx->acomp = acomp; - - req = acomp_request_alloc(acomp_ctx->acomp); - if (!req) { - pr_err("could not alloc crypto acomp_request %s\n", - pool->tfm_name); - ret = -ENOMEM; - goto req_fail; - } - acomp_ctx->req = req; - - crypto_init_wait(&acomp_ctx->wait); - /* - * if the backend of acomp is async zip, crypto_req_done() will wakeup - * crypto_wait_req(); if the backend of acomp is scomp, the callback - * won't be called, crypto_wait_req() will return without blocking. - */ - acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, - crypto_req_done, &acomp_ctx->wait); - - return 0; - -req_fail: - crypto_free_acomp(acomp_ctx->acomp); -acomp_fail: - kfree(acomp_ctx->buffer); - return ret; -} - -static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) -{ - struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); - struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); - - if (!IS_ERR_OR_NULL(acomp_ctx)) { - if (!IS_ERR_OR_NULL(acomp_ctx->req)) - acomp_request_free(acomp_ctx->req); - if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) - crypto_free_acomp(acomp_ctx->acomp); - kfree(acomp_ctx->buffer); - } - - return 0; -} - /********************************* * pool functions **********************************/ - -static struct zswap_pool *__zswap_pool_current(void) -{ - struct zswap_pool *pool; - - pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list); - WARN_ONCE(!pool && zswap_has_pool, - "%s: no page storage pool!\n", __func__); - - return pool; -} - -static struct zswap_pool *zswap_pool_current(void) -{ - assert_spin_locked(&zswap_pools_lock); - - return __zswap_pool_current(); -} - -static struct zswap_pool *zswap_pool_current_get(void) -{ - struct zswap_pool *pool; - - rcu_read_lock(); - - pool = __zswap_pool_current(); - if (!zswap_pool_get(pool)) - pool = NULL; - - rcu_read_unlock(); - - return pool; -} - -static struct zswap_pool *zswap_pool_last_get(void) -{ - struct zswap_pool *pool, *last = NULL; - - rcu_read_lock(); - - list_for_each_entry_rcu(pool, &zswap_pools, list) - last = pool; - WARN_ONCE(!last && zswap_has_pool, - "%s: no page storage pool!\n", __func__); - if (!zswap_pool_get(last)) - last = NULL; - - rcu_read_unlock(); - - return last; -} - -/* type and compressor must be null-terminated */ -static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) -{ - struct zswap_pool *pool; - - assert_spin_locked(&zswap_pools_lock); - - list_for_each_entry_rcu(pool, &zswap_pools, list) { - if (strcmp(pool->tfm_name, compressor)) - continue; - /* all zpools share the same type */ - if (strcmp(zpool_get_type(pool->zpools[0]), type)) - continue; - /* if we can't get it, it's about to be destroyed */ - if (!zswap_pool_get(pool)) - continue; - return pool; - } - - return NULL; -} - -/* - * If the entry is still valid in the tree, drop the initial ref and remove it - * from the tree. This function must be called with an additional ref held, - * otherwise it may race with another invalidation freeing the entry. - */ -static void zswap_invalidate_entry(struct zswap_tree *tree, - struct zswap_entry *entry) -{ - if (zswap_rb_erase(&tree->rbroot, entry)) - zswap_entry_put(tree, entry); -} - -static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, - spinlock_t *lock, void *arg) -{ - struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); - bool *encountered_page_in_swapcache = (bool *)arg; - struct zswap_tree *tree; - pgoff_t swpoffset; - enum lru_status ret = LRU_REMOVED_RETRY; - int writeback_result; - - /* - * Once the lru lock is dropped, the entry might get freed. The - * swpoffset is copied to the stack, and entry isn't deref'd again - * until the entry is verified to still be alive in the tree. - */ - swpoffset = swp_offset(entry->swpentry); - tree = zswap_trees[swp_type(entry->swpentry)]; - list_lru_isolate(l, item); - /* - * It's safe to drop the lock here because we return either - * LRU_REMOVED_RETRY or LRU_RETRY. - */ - spin_unlock(lock); - - /* Check for invalidate() race */ - spin_lock(&tree->lock); - if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) - goto unlock; - - /* Hold a reference to prevent a free during writeback */ - zswap_entry_get(entry); - spin_unlock(&tree->lock); - - writeback_result = zswap_writeback_entry(entry, tree); - - spin_lock(&tree->lock); - if (writeback_result) { - zswap_reject_reclaim_fail++; - zswap_lru_putback(&entry->pool->list_lru, entry); - ret = LRU_RETRY; - - /* - * Encountering a page already in swap cache is a sign that we are shrinking - * into the warmer region. We should terminate shrinking (if we're in the dynamic - * shrinker context). - */ - if (writeback_result == -EEXIST && encountered_page_in_swapcache) - *encountered_page_in_swapcache = true; - - goto put_unlock; - } - zswap_written_back_pages++; - - if (entry->objcg) - count_objcg_event(entry->objcg, ZSWPWB); - - count_vm_event(ZSWPWB); - /* - * Writeback started successfully, the page now belongs to the - * swapcache. Drop the entry from zswap - unless invalidate already - * took it out while we had the tree->lock released for IO. - */ - zswap_invalidate_entry(tree, entry); - -put_unlock: - /* Drop local reference */ - zswap_entry_put(tree, entry); -unlock: - spin_unlock(&tree->lock); - spin_lock(lock); - return ret; -} - -static int shrink_memcg(struct mem_cgroup *memcg) -{ - struct zswap_pool *pool; - int nid, shrunk = 0; - - if (!mem_cgroup_zswap_writeback_enabled(memcg)) - return -EINVAL; - - /* - * Skip zombies because their LRUs are reparented and we would be - * reclaiming from the parent instead of the dead memcg. - */ - if (memcg && !mem_cgroup_online(memcg)) - return -ENOENT; - - pool = zswap_pool_current_get(); - if (!pool) - return -EINVAL; - - for_each_node_state(nid, N_NORMAL_MEMORY) { - unsigned long nr_to_walk = 1; - - shrunk += list_lru_walk_one(&pool->list_lru, nid, memcg, - &shrink_memcg_cb, NULL, &nr_to_walk); - } - zswap_pool_put(pool); - return shrunk ? 0 : -EAGAIN; -} - -static void shrink_worker(struct work_struct *w) -{ - struct zswap_pool *pool = container_of(w, typeof(*pool), - shrink_work); - struct mem_cgroup *memcg; - int ret, failures = 0; - - /* global reclaim will select cgroup in a round-robin fashion. */ - do { - spin_lock(&zswap_pools_lock); - pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); - memcg = pool->next_shrink; - - /* - * We need to retry if we have gone through a full round trip, or if we - * got an offline memcg (or else we risk undoing the effect of the - * zswap memcg offlining cleanup callback). This is not catastrophic - * per se, but it will keep the now offlined memcg hostage for a while. - * - * Note that if we got an online memcg, we will keep the extra - * reference in case the original reference obtained by mem_cgroup_iter - * is dropped by the zswap memcg offlining callback, ensuring that the - * memcg is not killed when we are reclaiming. - */ - if (!memcg) { - spin_unlock(&zswap_pools_lock); - if (++failures == MAX_RECLAIM_RETRIES) - break; - - goto resched; - } - - if (!mem_cgroup_tryget_online(memcg)) { - /* drop the reference from mem_cgroup_iter() */ - mem_cgroup_iter_break(NULL, memcg); - pool->next_shrink = NULL; - spin_unlock(&zswap_pools_lock); - - if (++failures == MAX_RECLAIM_RETRIES) - break; - - goto resched; - } - spin_unlock(&zswap_pools_lock); - - ret = shrink_memcg(memcg); - /* drop the extra reference */ - mem_cgroup_put(memcg); - - if (ret == -EINVAL) - break; - if (ret && ++failures == MAX_RECLAIM_RETRIES) - break; - -resched: - cond_resched(); - } while (!zswap_can_accept()); - zswap_pool_put(pool); -} +static void __zswap_pool_empty(struct percpu_ref *ref); static struct zswap_pool *zswap_pool_create(char *type, char *compressor) { @@ -1074,30 +358,21 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) if (ret) goto error; - zswap_alloc_shrinker(pool); - if (!pool->shrinker) - goto error; - - pr_debug("using %s compressor\n", pool->tfm_name); - /* being the current pool takes 1 ref; this func expects the * caller to always add the new pool as the current pool */ - kref_init(&pool->kref); + ret = percpu_ref_init(&pool->ref, __zswap_pool_empty, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL); + if (ret) + goto ref_fail; INIT_LIST_HEAD(&pool->list); - if (list_lru_init_memcg(&pool->list_lru, pool->shrinker)) - goto lru_fail; - shrinker_register(pool->shrinker); - INIT_WORK(&pool->shrink_work, shrink_worker); - atomic_set(&pool->nr_stored, 0); zswap_pool_debug("created", pool); return pool; -lru_fail: - list_lru_destroy(&pool->list_lru); - shrinker_free(pool->shrinker); +ref_fail: + cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); error: if (pool->acomp_ctx) free_percpu(pool->acomp_ctx); @@ -1155,29 +430,14 @@ static void zswap_pool_destroy(struct zswap_pool *pool) zswap_pool_debug("destroying", pool); - shrinker_free(pool->shrinker); cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); free_percpu(pool->acomp_ctx); - list_lru_destroy(&pool->list_lru); - - spin_lock(&zswap_pools_lock); - mem_cgroup_iter_break(NULL, pool->next_shrink); - pool->next_shrink = NULL; - spin_unlock(&zswap_pools_lock); for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) zpool_destroy_pool(pool->zpools[i]); kfree(pool); } -static int __must_check zswap_pool_get(struct zswap_pool *pool) -{ - if (!pool) - return 0; - - return kref_get_unless_zero(&pool->kref); -} - static void __zswap_pool_release(struct work_struct *work) { struct zswap_pool *pool = container_of(work, typeof(*pool), @@ -1185,20 +445,23 @@ static void __zswap_pool_release(struct work_struct *work) synchronize_rcu(); - /* nobody should have been able to get a kref... */ - WARN_ON(kref_get_unless_zero(&pool->kref)); + /* nobody should have been able to get a ref... */ + WARN_ON(!percpu_ref_is_zero(&pool->ref)); + percpu_ref_exit(&pool->ref); /* pool is now off zswap_pools list and has no references. */ zswap_pool_destroy(pool); } -static void __zswap_pool_empty(struct kref *kref) +static struct zswap_pool *zswap_pool_current(void); + +static void __zswap_pool_empty(struct percpu_ref *ref) { struct zswap_pool *pool; - pool = container_of(kref, typeof(*pool), kref); + pool = container_of(ref, typeof(*pool), ref); - spin_lock(&zswap_pools_lock); + spin_lock_bh(&zswap_pools_lock); WARN_ON(pool == zswap_pool_current()); @@ -1207,12 +470,75 @@ static void __zswap_pool_empty(struct kref *kref) INIT_WORK(&pool->release_work, __zswap_pool_release); schedule_work(&pool->release_work); - spin_unlock(&zswap_pools_lock); + spin_unlock_bh(&zswap_pools_lock); +} + +static int __must_check zswap_pool_get(struct zswap_pool *pool) +{ + if (!pool) + return 0; + + return percpu_ref_tryget(&pool->ref); } static void zswap_pool_put(struct zswap_pool *pool) { - kref_put(&pool->kref, __zswap_pool_empty); + percpu_ref_put(&pool->ref); +} + +static struct zswap_pool *__zswap_pool_current(void) +{ + struct zswap_pool *pool; + + pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list); + WARN_ONCE(!pool && zswap_has_pool, + "%s: no page storage pool!\n", __func__); + + return pool; +} + +static struct zswap_pool *zswap_pool_current(void) +{ + assert_spin_locked(&zswap_pools_lock); + + return __zswap_pool_current(); +} + +static struct zswap_pool *zswap_pool_current_get(void) +{ + struct zswap_pool *pool; + + rcu_read_lock(); + + pool = __zswap_pool_current(); + if (!zswap_pool_get(pool)) + pool = NULL; + + rcu_read_unlock(); + + return pool; +} + +/* type and compressor must be null-terminated */ +static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) +{ + struct zswap_pool *pool; + + assert_spin_locked(&zswap_pools_lock); + + list_for_each_entry_rcu(pool, &zswap_pools, list) { + if (strcmp(pool->tfm_name, compressor)) + continue; + /* all zpools share the same type */ + if (strcmp(zpool_get_type(pool->zpools[0]), type)) + continue; + /* if we can't get it, it's about to be destroyed */ + if (!zswap_pool_get(pool)) + continue; + return pool; + } + + return NULL; } /********************************* @@ -1274,7 +600,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, return -EINVAL; } - spin_lock(&zswap_pools_lock); + spin_lock_bh(&zswap_pools_lock); pool = zswap_pool_find_get(type, compressor); if (pool) { @@ -1283,17 +609,28 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, list_del_rcu(&pool->list); } - spin_unlock(&zswap_pools_lock); + spin_unlock_bh(&zswap_pools_lock); if (!pool) pool = zswap_pool_create(type, compressor); + else { + /* + * Restore the initial ref dropped by percpu_ref_kill() + * when the pool was decommissioned and switch it again + * to percpu mode. + */ + percpu_ref_resurrect(&pool->ref); + + /* Drop the ref from zswap_pool_find_get(). */ + zswap_pool_put(pool); + } if (pool) ret = param_set_charp(s, kp); else ret = -EINVAL; - spin_lock(&zswap_pools_lock); + spin_lock_bh(&zswap_pools_lock); if (!ret) { put_pool = zswap_pool_current(); @@ -1308,7 +645,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, put_pool = pool; } - spin_unlock(&zswap_pools_lock); + spin_unlock_bh(&zswap_pools_lock); if (!zswap_has_pool && !pool) { /* if initial pool creation failed, and this pool creation also @@ -1325,7 +662,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, * or the new pool we failed to add */ if (put_pool) - zswap_pool_put(put_pool); + percpu_ref_kill(&put_pool->ref); return ret; } @@ -1371,7 +708,368 @@ static int zswap_enabled_param_set(const char *val, return ret; } -static void __zswap_load(struct zswap_entry *entry, struct page *page) +/********************************* +* lru functions +**********************************/ + +/* should be called under RCU */ +#ifdef CONFIG_MEMCG +static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) +{ + return entry->objcg ? obj_cgroup_memcg(entry->objcg) : NULL; +} +#else +static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) +{ + return NULL; +} +#endif + +static inline int entry_to_nid(struct zswap_entry *entry) +{ + return page_to_nid(virt_to_page(entry)); +} + +static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) +{ + atomic_long_t *nr_zswap_protected; + unsigned long lru_size, old, new; + int nid = entry_to_nid(entry); + struct mem_cgroup *memcg; + struct lruvec *lruvec; + + /* + * Note that it is safe to use rcu_read_lock() here, even in the face of + * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection + * used in list_lru lookup, only two scenarios are possible: + * + * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The + * new entry will be reparented to memcg's parent's list_lru. + * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The + * new entry will be added directly to memcg's parent's list_lru. + * + * Similar reasoning holds for list_lru_del(). + */ + rcu_read_lock(); + memcg = mem_cgroup_from_entry(entry); + /* will always succeed */ + list_lru_add(list_lru, &entry->lru, nid, memcg); + + /* Update the protection area */ + lru_size = list_lru_count_one(list_lru, nid, memcg); + lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); + nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected; + old = atomic_long_inc_return(nr_zswap_protected); + /* + * Decay to avoid overflow and adapt to changing workloads. + * This is based on LRU reclaim cost decaying heuristics. + */ + do { + new = old > lru_size / 4 ? old / 2 : old; + } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new)); + rcu_read_unlock(); +} + +static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) +{ + int nid = entry_to_nid(entry); + struct mem_cgroup *memcg; + + rcu_read_lock(); + memcg = mem_cgroup_from_entry(entry); + /* will always succeed */ + list_lru_del(list_lru, &entry->lru, nid, memcg); + rcu_read_unlock(); +} + +void zswap_lruvec_state_init(struct lruvec *lruvec) +{ + atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); +} + +void zswap_folio_swapin(struct folio *folio) +{ + struct lruvec *lruvec; + + if (folio) { + lruvec = folio_lruvec(folio); + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); + } +} + +void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) +{ + /* lock out zswap shrinker walking memcg tree */ + spin_lock(&zswap_shrink_lock); + if (zswap_next_shrink == memcg) + zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); + spin_unlock(&zswap_shrink_lock); +} + +/********************************* +* rbtree functions +**********************************/ +static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) +{ + struct rb_node *node = root->rb_node; + struct zswap_entry *entry; + pgoff_t entry_offset; + + while (node) { + entry = rb_entry(node, struct zswap_entry, rbnode); + entry_offset = swp_offset(entry->swpentry); + if (entry_offset > offset) + node = node->rb_left; + else if (entry_offset < offset) + node = node->rb_right; + else + return entry; + } + return NULL; +} + +/* + * In the case that a entry with the same offset is found, a pointer to + * the existing entry is stored in dupentry and the function returns -EEXIST + */ +static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, + struct zswap_entry **dupentry) +{ + struct rb_node **link = &root->rb_node, *parent = NULL; + struct zswap_entry *myentry; + pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry); + + while (*link) { + parent = *link; + myentry = rb_entry(parent, struct zswap_entry, rbnode); + myentry_offset = swp_offset(myentry->swpentry); + if (myentry_offset > entry_offset) + link = &(*link)->rb_left; + else if (myentry_offset < entry_offset) + link = &(*link)->rb_right; + else { + *dupentry = myentry; + return -EEXIST; + } + } + rb_link_node(&entry->rbnode, parent, link); + rb_insert_color(&entry->rbnode, root); + return 0; +} + +static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) +{ + rb_erase(&entry->rbnode, root); + RB_CLEAR_NODE(&entry->rbnode); +} + +/********************************* +* zswap entry functions +**********************************/ +static struct kmem_cache *zswap_entry_cache; + +static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) +{ + struct zswap_entry *entry; + entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); + if (!entry) + return NULL; + RB_CLEAR_NODE(&entry->rbnode); + return entry; +} + +static void zswap_entry_cache_free(struct zswap_entry *entry) +{ + kmem_cache_free(zswap_entry_cache, entry); +} + +static struct zpool *zswap_find_zpool(struct zswap_entry *entry) +{ + int i = 0; + + if (ZSWAP_NR_ZPOOLS > 1) + i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS)); + + return entry->pool->zpools[i]; +} + +/* + * Carries out the common pattern of freeing and entry's zpool allocation, + * freeing the entry itself, and decrementing the number of stored pages. + */ +static void zswap_entry_free(struct zswap_entry *entry) +{ + if (!entry->length) + atomic_dec(&zswap_same_filled_pages); + else { + zswap_lru_del(&zswap_list_lru, entry); + zpool_free(zswap_find_zpool(entry), entry->handle); + atomic_dec(&zswap_nr_stored); + zswap_pool_put(entry->pool); + } + if (entry->objcg) { + obj_cgroup_uncharge_zswap(entry->objcg, entry->length); + obj_cgroup_put(entry->objcg); + } + zswap_entry_cache_free(entry); + atomic_dec(&zswap_stored_pages); + zswap_update_total_size(); +} + +/* + * The caller hold the tree lock and search the entry from the tree, + * so it must be on the tree, remove it from the tree and free it. + */ +static void zswap_invalidate_entry(struct zswap_tree *tree, + struct zswap_entry *entry) +{ + zswap_rb_erase(&tree->rbroot, entry); + zswap_entry_free(entry); +} + +/********************************* +* compressed storage functions +**********************************/ +static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) +{ + struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); + struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); + struct crypto_acomp *acomp; + struct acomp_req *req; + int ret; + + mutex_init(&acomp_ctx->mutex); + + acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); + if (!acomp_ctx->buffer) + return -ENOMEM; + + acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); + if (IS_ERR(acomp)) { + pr_err("could not alloc crypto acomp %s : %ld\n", + pool->tfm_name, PTR_ERR(acomp)); + ret = PTR_ERR(acomp); + goto acomp_fail; + } + acomp_ctx->acomp = acomp; + acomp_ctx->is_sleepable = acomp_is_async(acomp); + + req = acomp_request_alloc(acomp_ctx->acomp); + if (!req) { + pr_err("could not alloc crypto acomp_request %s\n", + pool->tfm_name); + ret = -ENOMEM; + goto req_fail; + } + acomp_ctx->req = req; + + crypto_init_wait(&acomp_ctx->wait); + /* + * if the backend of acomp is async zip, crypto_req_done() will wakeup + * crypto_wait_req(); if the backend of acomp is scomp, the callback + * won't be called, crypto_wait_req() will return without blocking. + */ + acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &acomp_ctx->wait); + + return 0; + +req_fail: + crypto_free_acomp(acomp_ctx->acomp); +acomp_fail: + kfree(acomp_ctx->buffer); + return ret; +} + +static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) +{ + struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); + struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); + + if (!IS_ERR_OR_NULL(acomp_ctx)) { + if (!IS_ERR_OR_NULL(acomp_ctx->req)) + acomp_request_free(acomp_ctx->req); + if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) + crypto_free_acomp(acomp_ctx->acomp); + kfree(acomp_ctx->buffer); + } + + return 0; +} + +static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) +{ + struct crypto_acomp_ctx *acomp_ctx; + struct scatterlist input, output; + int comp_ret = 0, alloc_ret = 0; + unsigned int dlen = PAGE_SIZE; + unsigned long handle; + struct zpool *zpool; + char *buf; + gfp_t gfp; + u8 *dst; + + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + + mutex_lock(&acomp_ctx->mutex); + + dst = acomp_ctx->buffer; + sg_init_table(&input, 1); + sg_set_page(&input, &folio->page, PAGE_SIZE, 0); + + /* + * We need PAGE_SIZE * 2 here since there maybe over-compression case, + * and hardware-accelerators may won't check the dst buffer size, so + * giving the dst buffer with enough length to avoid buffer overflow. + */ + sg_init_one(&output, dst, PAGE_SIZE * 2); + acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); + + /* + * it maybe looks a little bit silly that we send an asynchronous request, + * then wait for its completion synchronously. This makes the process look + * synchronous in fact. + * Theoretically, acomp supports users send multiple acomp requests in one + * acomp instance, then get those requests done simultaneously. but in this + * case, zswap actually does store and load page by page, there is no + * existing method to send the second page before the first page is done + * in one thread doing zwap. + * but in different threads running on different cpu, we have different + * acomp instance, so multiple threads can do (de)compression in parallel. + */ + comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); + dlen = acomp_ctx->req->dlen; + if (comp_ret) + goto unlock; + + zpool = zswap_find_zpool(entry); + gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; + if (zpool_malloc_support_movable(zpool)) + gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; + alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle); + if (alloc_ret) + goto unlock; + + buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); + memcpy(buf, dst, dlen); + zpool_unmap_handle(zpool, handle); + + entry->handle = handle; + entry->length = dlen; + +unlock: + if (comp_ret == -ENOSPC || alloc_ret == -ENOSPC) + zswap_reject_compress_poor++; + else if (comp_ret) + zswap_reject_compress_fail++; + else if (alloc_ret) + zswap_reject_alloc_fail++; + + mutex_unlock(&acomp_ctx->mutex); + return comp_ret == 0 && alloc_ret == 0; +} + +static void zswap_decompress(struct zswap_entry *entry, struct page *page) { struct zpool *zpool = zswap_find_zpool(entry); struct scatterlist input, output; @@ -1382,7 +1080,17 @@ static void __zswap_load(struct zswap_entry *entry, struct page *page) mutex_lock(&acomp_ctx->mutex); src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); - if (!zpool_can_sleep_mapped(zpool)) { + /* + * If zpool_map_handle is atomic, we cannot reliably utilize its mapped buffer + * to do crypto_acomp_decompress() which might sleep. In such cases, we must + * resort to copying the buffer to a temporary one. + * Meanwhile, zpool_map_handle() might return a non-linearly mapped buffer, + * such as a kmap address of high memory or even ever a vmap address. + * However, sg_init_one is only equipped to handle linearly mapped low memory. + * In such cases, we also must copy the buffer to a temporary and lowmem one. + */ + if ((acomp_ctx->is_sleepable && !zpool_can_sleep_mapped(zpool)) || + !virt_addr_valid(src)) { memcpy(acomp_ctx->buffer, src, entry->length); src = acomp_ctx->buffer; zpool_unmap_handle(zpool, entry->handle); @@ -1396,7 +1104,7 @@ static void __zswap_load(struct zswap_entry *entry, struct page *page) BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); mutex_unlock(&acomp_ctx->mutex); - if (zpool_can_sleep_mapped(zpool)) + if (src != acomp_ctx->buffer) zpool_unmap_handle(zpool, entry->handle); } @@ -1416,9 +1124,9 @@ static void __zswap_load(struct zswap_entry *entry, struct page *page) * freed. */ static int zswap_writeback_entry(struct zswap_entry *entry, - struct zswap_tree *tree) + swp_entry_t swpentry) { - swp_entry_t swpentry = entry->swpentry; + struct zswap_tree *tree; struct folio *folio; struct mempolicy *mpol; bool folio_was_allocated; @@ -1434,9 +1142,11 @@ static int zswap_writeback_entry(struct zswap_entry *entry, return -ENOMEM; /* - * Found an existing folio, we raced with load/swapin. We generally - * writeback cold folios from zswap, and swapin means the folio just - * became hot. Skip this folio and let the caller find another one. + * Found an existing folio, we raced with swapin or concurrent + * shrinker. We generally writeback cold folios from zswap, and + * swapin means the folio just became hot, so skip this folio. + * For unlikely concurrent shrinker case, it will be unlinked + * and freed when invalidated by the concurrent shrinker anyway. */ if (!folio_was_allocated) { folio_put(folio); @@ -1445,22 +1155,34 @@ static int zswap_writeback_entry(struct zswap_entry *entry, /* * folio is locked, and the swapcache is now secured against - * concurrent swapping to and from the slot. Verify that the - * swap entry hasn't been invalidated and recycled behind our - * backs (our zswap_entry reference doesn't prevent that), to - * avoid overwriting a new swap folio with old compressed data. + * concurrent swapping to and from the slot, and concurrent + * swapoff so we can safely dereference the zswap tree here. + * Verify that the swap entry hasn't been invalidated and recycled + * behind our backs, to avoid overwriting a new swap folio with + * old compressed data. Only when this is successful can the entry + * be dereferenced. */ + tree = swap_zswap_tree(swpentry); spin_lock(&tree->lock); - if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) { + if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) { spin_unlock(&tree->lock); delete_from_swap_cache(folio); folio_unlock(folio); folio_put(folio); return -ENOMEM; } + + /* Safe to deref entry after the entry is verified above. */ + zswap_rb_erase(&tree->rbroot, entry); spin_unlock(&tree->lock); - __zswap_load(entry, &folio->page); + zswap_decompress(entry, &folio->page); + + count_vm_event(ZSWPWB); + if (entry->objcg) + count_objcg_event(entry->objcg, ZSWPWB); + + zswap_entry_free(entry); /* folio is up to date */ folio_mark_uptodate(folio); @@ -1475,6 +1197,274 @@ static int zswap_writeback_entry(struct zswap_entry *entry, return 0; } +/********************************* +* shrinker functions +**********************************/ +static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, + spinlock_t *lock, void *arg) +{ + struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); + bool *encountered_page_in_swapcache = (bool *)arg; + swp_entry_t swpentry; + enum lru_status ret = LRU_REMOVED_RETRY; + int writeback_result; + + /* + * As soon as we drop the LRU lock, the entry can be freed by + * a concurrent invalidation. This means the following: + * + * 1. We extract the swp_entry_t to the stack, allowing + * zswap_writeback_entry() to pin the swap entry and + * then validate the zwap entry against that swap entry's + * tree using pointer value comparison. Only when that + * is successful can the entry be dereferenced. + * + * 2. Usually, objects are taken off the LRU for reclaim. In + * this case this isn't possible, because if reclaim fails + * for whatever reason, we have no means of knowing if the + * entry is alive to put it back on the LRU. + * + * So rotate it before dropping the lock. If the entry is + * written back or invalidated, the free path will unlink + * it. For failures, rotation is the right thing as well. + * + * Temporary failures, where the same entry should be tried + * again immediately, almost never happen for this shrinker. + * We don't do any trylocking; -ENOMEM comes closest, + * but that's extremely rare and doesn't happen spuriously + * either. Don't bother distinguishing this case. + */ + list_move_tail(item, &l->list); + + /* + * Once the lru lock is dropped, the entry might get freed. The + * swpentry is copied to the stack, and entry isn't deref'd again + * until the entry is verified to still be alive in the tree. + */ + swpentry = entry->swpentry; + + /* + * It's safe to drop the lock here because we return either + * LRU_REMOVED_RETRY or LRU_RETRY. + */ + spin_unlock(lock); + + writeback_result = zswap_writeback_entry(entry, swpentry); + + if (writeback_result) { + zswap_reject_reclaim_fail++; + ret = LRU_RETRY; + + /* + * Encountering a page already in swap cache is a sign that we are shrinking + * into the warmer region. We should terminate shrinking (if we're in the dynamic + * shrinker context). + */ + if (writeback_result == -EEXIST && encountered_page_in_swapcache) { + ret = LRU_STOP; + *encountered_page_in_swapcache = true; + } + } else { + zswap_written_back_pages++; + } + + spin_lock(lock); + return ret; +} + +static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, + struct shrink_control *sc) +{ + struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); + unsigned long shrink_ret, nr_protected, lru_size; + bool encountered_page_in_swapcache = false; + + if (!zswap_shrinker_enabled || + !mem_cgroup_zswap_writeback_enabled(sc->memcg)) { + sc->nr_scanned = 0; + return SHRINK_STOP; + } + + nr_protected = + atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); + lru_size = list_lru_shrink_count(&zswap_list_lru, sc); + + /* + * Abort if we are shrinking into the protected region. + * + * This short-circuiting is necessary because if we have too many multiple + * concurrent reclaimers getting the freeable zswap object counts at the + * same time (before any of them made reasonable progress), the total + * number of reclaimed objects might be more than the number of unprotected + * objects (i.e the reclaimers will reclaim into the protected area of the + * zswap LRU). + */ + if (nr_protected >= lru_size - sc->nr_to_scan) { + sc->nr_scanned = 0; + return SHRINK_STOP; + } + + shrink_ret = list_lru_shrink_walk(&zswap_list_lru, sc, &shrink_memcg_cb, + &encountered_page_in_swapcache); + + if (encountered_page_in_swapcache) + return SHRINK_STOP; + + return shrink_ret ? shrink_ret : SHRINK_STOP; +} + +static unsigned long zswap_shrinker_count(struct shrinker *shrinker, + struct shrink_control *sc) +{ + struct mem_cgroup *memcg = sc->memcg; + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid)); + unsigned long nr_backing, nr_stored, nr_freeable, nr_protected; + + if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg)) + return 0; + + /* + * The shrinker resumes swap writeback, which will enter block + * and may enter fs. XXX: Harmonize with vmscan.c __GFP_FS + * rules (may_enter_fs()), which apply on a per-folio basis. + */ + if (!gfp_has_io_fs(sc->gfp_mask)) + return 0; + + /* + * For memcg, use the cgroup-wide ZSWAP stats since we don't + * have them per-node and thus per-lruvec. Careful if memcg is + * runtime-disabled: we can get sc->memcg == NULL, which is ok + * for the lruvec, but not for memcg_page_state(). + * + * Without memcg, use the zswap pool-wide metrics. + */ + if (!mem_cgroup_disabled()) { + mem_cgroup_flush_stats(memcg); + nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; + nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); + } else { + nr_backing = zswap_pool_total_size >> PAGE_SHIFT; + nr_stored = atomic_read(&zswap_nr_stored); + } + + if (!nr_stored) + return 0; + + nr_protected = + atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); + nr_freeable = list_lru_shrink_count(&zswap_list_lru, sc); + /* + * Subtract the lru size by an estimate of the number of pages + * that should be protected. + */ + nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0; + + /* + * Scale the number of freeable pages by the memory saving factor. + * This ensures that the better zswap compresses memory, the fewer + * pages we will evict to swap (as it will otherwise incur IO for + * relatively small memory saving). + */ + return mult_frac(nr_freeable, nr_backing, nr_stored); +} + +static struct shrinker *zswap_alloc_shrinker(void) +{ + struct shrinker *shrinker; + + shrinker = + shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap"); + if (!shrinker) + return NULL; + + shrinker->scan_objects = zswap_shrinker_scan; + shrinker->count_objects = zswap_shrinker_count; + shrinker->batch = 0; + shrinker->seeks = DEFAULT_SEEKS; + return shrinker; +} + +static int shrink_memcg(struct mem_cgroup *memcg) +{ + int nid, shrunk = 0; + + if (!mem_cgroup_zswap_writeback_enabled(memcg)) + return -EINVAL; + + /* + * Skip zombies because their LRUs are reparented and we would be + * reclaiming from the parent instead of the dead memcg. + */ + if (memcg && !mem_cgroup_online(memcg)) + return -ENOENT; + + for_each_node_state(nid, N_NORMAL_MEMORY) { + unsigned long nr_to_walk = 1; + + shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg, + &shrink_memcg_cb, NULL, &nr_to_walk); + } + return shrunk ? 0 : -EAGAIN; +} + +static void shrink_worker(struct work_struct *w) +{ + struct mem_cgroup *memcg; + int ret, failures = 0; + + /* global reclaim will select cgroup in a round-robin fashion. */ + do { + spin_lock(&zswap_shrink_lock); + zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); + memcg = zswap_next_shrink; + + /* + * We need to retry if we have gone through a full round trip, or if we + * got an offline memcg (or else we risk undoing the effect of the + * zswap memcg offlining cleanup callback). This is not catastrophic + * per se, but it will keep the now offlined memcg hostage for a while. + * + * Note that if we got an online memcg, we will keep the extra + * reference in case the original reference obtained by mem_cgroup_iter + * is dropped by the zswap memcg offlining callback, ensuring that the + * memcg is not killed when we are reclaiming. + */ + if (!memcg) { + spin_unlock(&zswap_shrink_lock); + if (++failures == MAX_RECLAIM_RETRIES) + break; + + goto resched; + } + + if (!mem_cgroup_tryget_online(memcg)) { + /* drop the reference from mem_cgroup_iter() */ + mem_cgroup_iter_break(NULL, memcg); + zswap_next_shrink = NULL; + spin_unlock(&zswap_shrink_lock); + + if (++failures == MAX_RECLAIM_RETRIES) + break; + + goto resched; + } + spin_unlock(&zswap_shrink_lock); + + ret = shrink_memcg(memcg); + /* drop the extra reference */ + mem_cgroup_put(memcg); + + if (ret == -EINVAL) + break; + if (ret && ++failures == MAX_RECLAIM_RETRIES) + break; + +resched: + cond_resched(); + } while (!zswap_can_accept()); +} + static int zswap_is_page_same_filled(void *ptr, unsigned long *value) { unsigned long *page; @@ -1508,23 +1498,11 @@ static void zswap_fill_page(void *ptr, unsigned long value) bool zswap_store(struct folio *folio) { swp_entry_t swp = folio->swap; - int type = swp_type(swp); pgoff_t offset = swp_offset(swp); - struct page *page = &folio->page; - struct zswap_tree *tree = zswap_trees[type]; + struct zswap_tree *tree = swap_zswap_tree(swp); struct zswap_entry *entry, *dupentry; - struct scatterlist input, output; - struct crypto_acomp_ctx *acomp_ctx; struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; - struct zswap_pool *pool; - struct zpool *zpool; - unsigned int dlen = PAGE_SIZE; - unsigned long handle, value; - char *buf; - u8 *src, *dst; - gfp_t gfp; - int ret; VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); @@ -1533,24 +1511,8 @@ bool zswap_store(struct folio *folio) if (folio_test_large(folio)) return false; - if (!tree) - return false; - - /* - * If this is a duplicate, it must be removed before attempting to store - * it, otherwise, if the store fails the old page won't be removed from - * the tree, and it might be written back overriding the new data. - */ - spin_lock(&tree->lock); - dupentry = zswap_rb_search(&tree->rbroot, offset); - if (dupentry) { - zswap_duplicate_entry++; - zswap_invalidate_entry(tree, dupentry); - } - spin_unlock(&tree->lock); - if (!zswap_enabled) - return false; + goto check_old; objcg = get_obj_cgroup_from_folio(folio); if (objcg && !obj_cgroup_may_zswap(objcg)) { @@ -1577,17 +1539,19 @@ bool zswap_store(struct folio *folio) } /* allocate entry */ - entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); + entry = zswap_entry_cache_alloc(GFP_KERNEL, folio_nid(folio)); if (!entry) { zswap_reject_kmemcache_fail++; goto reject; } if (zswap_same_filled_pages_enabled) { - src = kmap_local_page(page); + unsigned long value; + u8 *src; + + src = kmap_local_folio(folio, 0); if (zswap_is_page_same_filled(src, &value)) { kunmap_local(src); - entry->swpentry = swp_entry(type, offset); entry->length = 0; entry->value = value; atomic_inc(&zswap_same_filled_pages); @@ -1606,74 +1570,18 @@ bool zswap_store(struct folio *folio) if (objcg) { memcg = get_mem_cgroup_from_objcg(objcg); - if (memcg_list_lru_alloc(memcg, &entry->pool->list_lru, GFP_KERNEL)) { + if (memcg_list_lru_alloc(memcg, &zswap_list_lru, GFP_KERNEL)) { mem_cgroup_put(memcg); goto put_pool; } mem_cgroup_put(memcg); } - /* compress */ - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - - mutex_lock(&acomp_ctx->mutex); - - dst = acomp_ctx->buffer; - sg_init_table(&input, 1); - sg_set_page(&input, &folio->page, PAGE_SIZE, 0); - - /* - * We need PAGE_SIZE * 2 here since there maybe over-compression case, - * and hardware-accelerators may won't check the dst buffer size, so - * giving the dst buffer with enough length to avoid buffer overflow. - */ - sg_init_one(&output, dst, PAGE_SIZE * 2); - acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); - /* - * it maybe looks a little bit silly that we send an asynchronous request, - * then wait for its completion synchronously. This makes the process look - * synchronous in fact. - * Theoretically, acomp supports users send multiple acomp requests in one - * acomp instance, then get those requests done simultaneously. but in this - * case, zswap actually does store and load page by page, there is no - * existing method to send the second page before the first page is done - * in one thread doing zwap. - * but in different threads running on different cpu, we have different - * acomp instance, so multiple threads can do (de)compression in parallel. - */ - ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); - dlen = acomp_ctx->req->dlen; - - if (ret) { - zswap_reject_compress_fail++; - goto put_dstmem; - } - - /* store */ - zpool = zswap_find_zpool(entry); - gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; - if (zpool_malloc_support_movable(zpool)) - gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; - ret = zpool_malloc(zpool, dlen, gfp, &handle); - if (ret == -ENOSPC) { - zswap_reject_compress_poor++; - goto put_dstmem; - } - if (ret) { - zswap_reject_alloc_fail++; - goto put_dstmem; - } - buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); - memcpy(buf, dst, dlen); - zpool_unmap_handle(zpool, handle); - mutex_unlock(&acomp_ctx->mutex); - - /* populate entry */ - entry->swpentry = swp_entry(type, offset); - entry->handle = handle; - entry->length = dlen; + if (!zswap_compress(folio, entry)) + goto put_pool; insert_entry: + entry->swpentry = swp; entry->objcg = objcg; if (objcg) { obj_cgroup_charge_zswap(objcg, entry->length); @@ -1684,20 +1592,17 @@ insert_entry: /* map */ spin_lock(&tree->lock); /* - * A duplicate entry should have been removed at the beginning of this - * function. Since the swap entry should be pinned, if a duplicate is - * found again here it means that something went wrong in the swap - * cache. + * The folio may have been dirtied again, invalidate the + * possibly stale entry before inserting the new entry. */ - while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { - WARN_ON(1); - zswap_duplicate_entry++; + if (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { zswap_invalidate_entry(tree, dupentry); + WARN_ON(zswap_rb_insert(&tree->rbroot, entry, &dupentry)); } if (entry->length) { INIT_LIST_HEAD(&entry->lru); - zswap_lru_add(&entry->pool->list_lru, entry); - atomic_inc(&entry->pool->nr_stored); + zswap_lru_add(&zswap_list_lru, entry); + atomic_inc(&zswap_nr_stored); } spin_unlock(&tree->lock); @@ -1708,8 +1613,6 @@ insert_entry: return true; -put_dstmem: - mutex_unlock(&acomp_ctx->mutex); put_pool: zswap_pool_put(entry->pool); freepage: @@ -1717,38 +1620,60 @@ freepage: reject: if (objcg) obj_cgroup_put(objcg); +check_old: + /* + * If the zswap store fails or zswap is disabled, we must invalidate the + * possibly stale entry which was previously stored at this offset. + * Otherwise, writeback could overwrite the new data in the swapfile. + */ + spin_lock(&tree->lock); + entry = zswap_rb_search(&tree->rbroot, offset); + if (entry) + zswap_invalidate_entry(tree, entry); + spin_unlock(&tree->lock); return false; shrink: - pool = zswap_pool_last_get(); - if (pool && !queue_work(shrink_wq, &pool->shrink_work)) - zswap_pool_put(pool); + queue_work(shrink_wq, &zswap_shrink_work); goto reject; } bool zswap_load(struct folio *folio) { swp_entry_t swp = folio->swap; - int type = swp_type(swp); pgoff_t offset = swp_offset(swp); struct page *page = &folio->page; - struct zswap_tree *tree = zswap_trees[type]; + bool swapcache = folio_test_swapcache(folio); + struct zswap_tree *tree = swap_zswap_tree(swp); struct zswap_entry *entry; u8 *dst; VM_WARN_ON_ONCE(!folio_test_locked(folio)); - /* find */ spin_lock(&tree->lock); - entry = zswap_entry_find_get(&tree->rbroot, offset); + entry = zswap_rb_search(&tree->rbroot, offset); if (!entry) { spin_unlock(&tree->lock); return false; } + /* + * When reading into the swapcache, invalidate our entry. The + * swapcache can be the authoritative owner of the page and + * its mappings, and the pressure that results from having two + * in-memory copies outweighs any benefits of caching the + * compression work. + * + * (Most swapins go through the swapcache. The notable + * exception is the singleton fault on SWP_SYNCHRONOUS_IO + * files, which reads into a private page and may free it if + * the fault fails. We remain the primary owner of the entry.) + */ + if (swapcache) + zswap_rb_erase(&tree->rbroot, entry); spin_unlock(&tree->lock); if (entry->length) - __zswap_load(entry, page); + zswap_decompress(entry, page); else { dst = kmap_local_page(page); zswap_fill_page(dst, entry->value); @@ -1759,67 +1684,64 @@ bool zswap_load(struct folio *folio) if (entry->objcg) count_objcg_event(entry->objcg, ZSWPIN); - spin_lock(&tree->lock); - if (zswap_exclusive_loads_enabled) { - zswap_invalidate_entry(tree, entry); + if (swapcache) { + zswap_entry_free(entry); folio_mark_dirty(folio); - } else if (entry->length) { - zswap_lru_del(&entry->pool->list_lru, entry); - zswap_lru_add(&entry->pool->list_lru, entry); } - zswap_entry_put(tree, entry); - spin_unlock(&tree->lock); return true; } -void zswap_invalidate(int type, pgoff_t offset) +void zswap_invalidate(swp_entry_t swp) { - struct zswap_tree *tree = zswap_trees[type]; + pgoff_t offset = swp_offset(swp); + struct zswap_tree *tree = swap_zswap_tree(swp); struct zswap_entry *entry; - /* find */ spin_lock(&tree->lock); entry = zswap_rb_search(&tree->rbroot, offset); - if (!entry) { - /* entry was written back */ - spin_unlock(&tree->lock); - return; - } - zswap_invalidate_entry(tree, entry); + if (entry) + zswap_invalidate_entry(tree, entry); spin_unlock(&tree->lock); } -void zswap_swapon(int type) +int zswap_swapon(int type, unsigned long nr_pages) { - struct zswap_tree *tree; + struct zswap_tree *trees, *tree; + unsigned int nr, i; - tree = kzalloc(sizeof(*tree), GFP_KERNEL); - if (!tree) { + nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); + trees = kvcalloc(nr, sizeof(*tree), GFP_KERNEL); + if (!trees) { pr_err("alloc failed, zswap disabled for swap type %d\n", type); - return; + return -ENOMEM; } - tree->rbroot = RB_ROOT; - spin_lock_init(&tree->lock); - zswap_trees[type] = tree; + for (i = 0; i < nr; i++) { + tree = trees + i; + tree->rbroot = RB_ROOT; + spin_lock_init(&tree->lock); + } + + nr_zswap_trees[type] = nr; + zswap_trees[type] = trees; + return 0; } void zswap_swapoff(int type) { - struct zswap_tree *tree = zswap_trees[type]; - struct zswap_entry *entry, *n; + struct zswap_tree *trees = zswap_trees[type]; + unsigned int i; - if (!tree) + if (!trees) return; - /* walk the tree and free everything */ - spin_lock(&tree->lock); - rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) - zswap_free_entry(entry); - tree->rbroot = RB_ROOT; - spin_unlock(&tree->lock); - kfree(tree); + /* try_to_unuse() invalidated all the entries already */ + for (i = 0; i < nr_zswap_trees[type]; i++) + WARN_ON_ONCE(!RB_EMPTY_ROOT(&trees[i].rbroot)); + + kvfree(trees); + nr_zswap_trees[type] = 0; zswap_trees[type] = NULL; } @@ -1852,8 +1774,6 @@ static int zswap_debugfs_init(void) zswap_debugfs_root, &zswap_reject_compress_poor); debugfs_create_u64("written_back_pages", 0444, zswap_debugfs_root, &zswap_written_back_pages); - debugfs_create_u64("duplicate_entry", 0444, - zswap_debugfs_root, &zswap_duplicate_entry); debugfs_create_u64("pool_total_size", 0444, zswap_debugfs_root, &zswap_pool_total_size); debugfs_create_atomic_t("stored_pages", 0444, @@ -1891,6 +1811,20 @@ static int zswap_setup(void) if (ret) goto hp_fail; + shrink_wq = alloc_workqueue("zswap-shrink", + WQ_UNBOUND|WQ_MEM_RECLAIM, 1); + if (!shrink_wq) + goto shrink_wq_fail; + + zswap_shrinker = zswap_alloc_shrinker(); + if (!zswap_shrinker) + goto shrinker_fail; + if (list_lru_init_memcg(&zswap_list_lru, zswap_shrinker)) + goto lru_fail; + shrinker_register(zswap_shrinker); + + INIT_WORK(&zswap_shrink_work, shrink_worker); + pool = __zswap_pool_create_fallback(); if (pool) { pr_info("loaded using pool %s/%s\n", pool->tfm_name, @@ -1902,18 +1836,17 @@ static int zswap_setup(void) zswap_enabled = false; } - shrink_wq = create_workqueue("zswap-shrink"); - if (!shrink_wq) - goto fallback_fail; - if (zswap_debugfs_init()) pr_warn("debugfs initialization failed\n"); zswap_init_state = ZSWAP_INIT_SUCCEED; return 0; -fallback_fail: - if (pool) - zswap_pool_destroy(pool); +lru_fail: + shrinker_free(zswap_shrinker); +shrinker_fail: + destroy_workqueue(shrink_wq); +shrink_wq_fail: + cpuhp_remove_multi_state(CPUHP_MM_ZSWP_POOL_PREPARE); hp_fail: kmem_cache_destroy(zswap_entry_cache); cache_fail: |