summaryrefslogtreecommitdiffstats
path: root/mm/zswap.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/zswap.c')
-rw-r--r--mm/zswap.c769
1 files changed, 522 insertions, 247 deletions
diff --git a/mm/zswap.c b/mm/zswap.c
index 870fd6f5a5..69766f2c5a 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -35,6 +35,7 @@
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/workqueue.h>
+#include <linux/list_lru.h>
#include "swap.h"
#include "internal.h"
@@ -147,6 +148,16 @@ module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);
/* Number of zpools in zswap_pool (empirically determined for scalability) */
#define ZSWAP_NR_ZPOOLS 32
+/* Enable/disable memory pressure-based shrinker. */
+static bool zswap_shrinker_enabled = IS_ENABLED(
+ CONFIG_ZSWAP_SHRINKER_DEFAULT_ON);
+module_param_named(shrinker_enabled, zswap_shrinker_enabled, bool, 0644);
+
+bool is_zswap_enabled(void)
+{
+ return zswap_enabled;
+}
+
/*********************************
* data structures
**********************************/
@@ -155,8 +166,8 @@ struct crypto_acomp_ctx {
struct crypto_acomp *acomp;
struct acomp_req *req;
struct crypto_wait wait;
- u8 *dstmem;
- struct mutex *mutex;
+ u8 *buffer;
+ struct mutex mutex;
};
/*
@@ -174,8 +185,10 @@ struct zswap_pool {
struct work_struct shrink_work;
struct hlist_node node;
char tfm_name[CRYPTO_MAX_ALG_NAME];
- struct list_head lru;
- spinlock_t lru_lock;
+ struct list_lru list_lru;
+ struct mem_cgroup *next_shrink;
+ struct shrinker *shrinker;
+ atomic_t nr_stored;
};
/*
@@ -274,32 +287,72 @@ static bool zswap_can_accept(void)
DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
}
+static u64 get_zswap_pool_size(struct zswap_pool *pool)
+{
+ u64 pool_size = 0;
+ int i;
+
+ for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
+ pool_size += zpool_get_total_size(pool->zpools[i]);
+
+ return pool_size;
+}
+
static void zswap_update_total_size(void)
{
struct zswap_pool *pool;
u64 total = 0;
- int i;
rcu_read_lock();
list_for_each_entry_rcu(pool, &zswap_pools, list)
- for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
- total += zpool_get_total_size(pool->zpools[i]);
+ total += get_zswap_pool_size(pool);
rcu_read_unlock();
zswap_pool_total_size = total;
}
+/* should be called under RCU */
+#ifdef CONFIG_MEMCG
+static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
+{
+ return entry->objcg ? obj_cgroup_memcg(entry->objcg) : NULL;
+}
+#else
+static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
+{
+ return NULL;
+}
+#endif
+
+static inline int entry_to_nid(struct zswap_entry *entry)
+{
+ return page_to_nid(virt_to_page(entry));
+}
+
+void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
+{
+ struct zswap_pool *pool;
+
+ /* lock out zswap pools list modification */
+ spin_lock(&zswap_pools_lock);
+ list_for_each_entry(pool, &zswap_pools, list) {
+ if (pool->next_shrink == memcg)
+ pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
+ }
+ spin_unlock(&zswap_pools_lock);
+}
+
/*********************************
* zswap entry functions
**********************************/
static struct kmem_cache *zswap_entry_cache;
-static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
+static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid)
{
struct zswap_entry *entry;
- entry = kmem_cache_alloc(zswap_entry_cache, gfp);
+ entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid);
if (!entry)
return NULL;
entry->refcount = 1;
@@ -313,6 +366,99 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
}
/*********************************
+* zswap lruvec functions
+**********************************/
+void zswap_lruvec_state_init(struct lruvec *lruvec)
+{
+ atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0);
+}
+
+void zswap_folio_swapin(struct folio *folio)
+{
+ struct lruvec *lruvec;
+
+ VM_WARN_ON_ONCE(!folio_test_locked(folio));
+ lruvec = folio_lruvec(folio);
+ atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
+}
+
+/*********************************
+* lru functions
+**********************************/
+static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
+{
+ atomic_long_t *nr_zswap_protected;
+ unsigned long lru_size, old, new;
+ int nid = entry_to_nid(entry);
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+
+ /*
+ * Note that it is safe to use rcu_read_lock() here, even in the face of
+ * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection
+ * used in list_lru lookup, only two scenarios are possible:
+ *
+ * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The
+ * new entry will be reparented to memcg's parent's list_lru.
+ * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The
+ * new entry will be added directly to memcg's parent's list_lru.
+ *
+ * Similar reasoning holds for list_lru_del() and list_lru_putback().
+ */
+ rcu_read_lock();
+ memcg = mem_cgroup_from_entry(entry);
+ /* will always succeed */
+ list_lru_add(list_lru, &entry->lru, nid, memcg);
+
+ /* Update the protection area */
+ lru_size = list_lru_count_one(list_lru, nid, memcg);
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+ nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected;
+ old = atomic_long_inc_return(nr_zswap_protected);
+ /*
+ * Decay to avoid overflow and adapt to changing workloads.
+ * This is based on LRU reclaim cost decaying heuristics.
+ */
+ do {
+ new = old > lru_size / 4 ? old / 2 : old;
+ } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new));
+ rcu_read_unlock();
+}
+
+static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry)
+{
+ int nid = entry_to_nid(entry);
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_entry(entry);
+ /* will always succeed */
+ list_lru_del(list_lru, &entry->lru, nid, memcg);
+ rcu_read_unlock();
+}
+
+static void zswap_lru_putback(struct list_lru *list_lru,
+ struct zswap_entry *entry)
+{
+ int nid = entry_to_nid(entry);
+ spinlock_t *lock = &list_lru->node[nid].lock;
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_entry(entry);
+ spin_lock(lock);
+ /* we cannot use list_lru_add here, because it increments node's lru count */
+ list_lru_putback(list_lru, &entry->lru, nid, memcg);
+ spin_unlock(lock);
+
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(entry_to_nid(entry)));
+ /* increment the protection area to account for the LRU rotation. */
+ atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
+ rcu_read_unlock();
+}
+
+/*********************************
* rbtree functions
**********************************/
static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
@@ -389,19 +535,18 @@ static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
*/
static void zswap_free_entry(struct zswap_entry *entry)
{
- if (entry->objcg) {
- obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
- obj_cgroup_put(entry->objcg);
- }
if (!entry->length)
atomic_dec(&zswap_same_filled_pages);
else {
- spin_lock(&entry->pool->lru_lock);
- list_del(&entry->lru);
- spin_unlock(&entry->pool->lru_lock);
+ zswap_lru_del(&entry->pool->list_lru, entry);
zpool_free(zswap_find_zpool(entry), entry->handle);
+ atomic_dec(&entry->pool->nr_stored);
zswap_pool_put(entry->pool);
}
+ if (entry->objcg) {
+ obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
+ obj_cgroup_put(entry->objcg);
+ }
zswap_entry_cache_free(entry);
atomic_dec(&zswap_stored_pages);
zswap_update_total_size();
@@ -442,65 +587,147 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
}
/*********************************
-* per-cpu code
+* shrinker functions
**********************************/
-static DEFINE_PER_CPU(u8 *, zswap_dstmem);
-/*
- * If users dynamically change the zpool type and compressor at runtime, i.e.
- * zswap is running, zswap can have more than one zpool on one cpu, but they
- * are sharing dtsmem. So we need this mutex to be per-cpu.
- */
-static DEFINE_PER_CPU(struct mutex *, zswap_mutex);
+static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
+ spinlock_t *lock, void *arg);
-static int zswap_dstmem_prepare(unsigned int cpu)
+static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
+ struct shrink_control *sc)
{
- struct mutex *mutex;
- u8 *dst;
+ struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
+ unsigned long shrink_ret, nr_protected, lru_size;
+ struct zswap_pool *pool = shrinker->private_data;
+ bool encountered_page_in_swapcache = false;
+
+ if (!zswap_shrinker_enabled ||
+ !mem_cgroup_zswap_writeback_enabled(sc->memcg)) {
+ sc->nr_scanned = 0;
+ return SHRINK_STOP;
+ }
- dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
- if (!dst)
- return -ENOMEM;
+ nr_protected =
+ atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
+ lru_size = list_lru_shrink_count(&pool->list_lru, sc);
- mutex = kmalloc_node(sizeof(*mutex), GFP_KERNEL, cpu_to_node(cpu));
- if (!mutex) {
- kfree(dst);
- return -ENOMEM;
+ /*
+ * Abort if we are shrinking into the protected region.
+ *
+ * This short-circuiting is necessary because if we have too many multiple
+ * concurrent reclaimers getting the freeable zswap object counts at the
+ * same time (before any of them made reasonable progress), the total
+ * number of reclaimed objects might be more than the number of unprotected
+ * objects (i.e the reclaimers will reclaim into the protected area of the
+ * zswap LRU).
+ */
+ if (nr_protected >= lru_size - sc->nr_to_scan) {
+ sc->nr_scanned = 0;
+ return SHRINK_STOP;
}
- mutex_init(mutex);
- per_cpu(zswap_dstmem, cpu) = dst;
- per_cpu(zswap_mutex, cpu) = mutex;
- return 0;
+ shrink_ret = list_lru_shrink_walk(&pool->list_lru, sc, &shrink_memcg_cb,
+ &encountered_page_in_swapcache);
+
+ if (encountered_page_in_swapcache)
+ return SHRINK_STOP;
+
+ return shrink_ret ? shrink_ret : SHRINK_STOP;
}
-static int zswap_dstmem_dead(unsigned int cpu)
+static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
+ struct shrink_control *sc)
{
- struct mutex *mutex;
- u8 *dst;
+ struct zswap_pool *pool = shrinker->private_data;
+ struct mem_cgroup *memcg = sc->memcg;
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid));
+ unsigned long nr_backing, nr_stored, nr_freeable, nr_protected;
+
+ if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg))
+ return 0;
+
+ /*
+ * The shrinker resumes swap writeback, which will enter block
+ * and may enter fs. XXX: Harmonize with vmscan.c __GFP_FS
+ * rules (may_enter_fs()), which apply on a per-folio basis.
+ */
+ if (!gfp_has_io_fs(sc->gfp_mask))
+ return 0;
- mutex = per_cpu(zswap_mutex, cpu);
- kfree(mutex);
- per_cpu(zswap_mutex, cpu) = NULL;
+ /*
+ * For memcg, use the cgroup-wide ZSWAP stats since we don't
+ * have them per-node and thus per-lruvec. Careful if memcg is
+ * runtime-disabled: we can get sc->memcg == NULL, which is ok
+ * for the lruvec, but not for memcg_page_state().
+ *
+ * Without memcg, use the zswap pool-wide metrics.
+ */
+ if (!mem_cgroup_disabled()) {
+ mem_cgroup_flush_stats(memcg);
+ nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT;
+ nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
+ } else {
+ nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT;
+ nr_stored = atomic_read(&pool->nr_stored);
+ }
- dst = per_cpu(zswap_dstmem, cpu);
- kfree(dst);
- per_cpu(zswap_dstmem, cpu) = NULL;
+ if (!nr_stored)
+ return 0;
- return 0;
+ nr_protected =
+ atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
+ nr_freeable = list_lru_shrink_count(&pool->list_lru, sc);
+ /*
+ * Subtract the lru size by an estimate of the number of pages
+ * that should be protected.
+ */
+ nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0;
+
+ /*
+ * Scale the number of freeable pages by the memory saving factor.
+ * This ensures that the better zswap compresses memory, the fewer
+ * pages we will evict to swap (as it will otherwise incur IO for
+ * relatively small memory saving).
+ */
+ return mult_frac(nr_freeable, nr_backing, nr_stored);
}
+static void zswap_alloc_shrinker(struct zswap_pool *pool)
+{
+ pool->shrinker =
+ shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap");
+ if (!pool->shrinker)
+ return;
+
+ pool->shrinker->private_data = pool;
+ pool->shrinker->scan_objects = zswap_shrinker_scan;
+ pool->shrinker->count_objects = zswap_shrinker_count;
+ pool->shrinker->batch = 0;
+ pool->shrinker->seeks = DEFAULT_SEEKS;
+}
+
+/*********************************
+* per-cpu code
+**********************************/
static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
{
struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
struct crypto_acomp *acomp;
struct acomp_req *req;
+ int ret;
+
+ mutex_init(&acomp_ctx->mutex);
+
+ acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
+ if (!acomp_ctx->buffer)
+ return -ENOMEM;
acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
if (IS_ERR(acomp)) {
pr_err("could not alloc crypto acomp %s : %ld\n",
pool->tfm_name, PTR_ERR(acomp));
- return PTR_ERR(acomp);
+ ret = PTR_ERR(acomp);
+ goto acomp_fail;
}
acomp_ctx->acomp = acomp;
@@ -508,8 +735,8 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
if (!req) {
pr_err("could not alloc crypto acomp_request %s\n",
pool->tfm_name);
- crypto_free_acomp(acomp_ctx->acomp);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto req_fail;
}
acomp_ctx->req = req;
@@ -522,10 +749,13 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
crypto_req_done, &acomp_ctx->wait);
- acomp_ctx->mutex = per_cpu(zswap_mutex, cpu);
- acomp_ctx->dstmem = per_cpu(zswap_dstmem, cpu);
-
return 0;
+
+req_fail:
+ crypto_free_acomp(acomp_ctx->acomp);
+acomp_fail:
+ kfree(acomp_ctx->buffer);
+ return ret;
}
static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
@@ -538,6 +768,7 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
acomp_request_free(acomp_ctx->req);
if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
crypto_free_acomp(acomp_ctx->acomp);
+ kfree(acomp_ctx->buffer);
}
return 0;
@@ -632,21 +863,16 @@ static void zswap_invalidate_entry(struct zswap_tree *tree,
zswap_entry_put(tree, entry);
}
-static int zswap_reclaim_entry(struct zswap_pool *pool)
+static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
+ spinlock_t *lock, void *arg)
{
- struct zswap_entry *entry;
+ struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
+ bool *encountered_page_in_swapcache = (bool *)arg;
struct zswap_tree *tree;
pgoff_t swpoffset;
- int ret;
+ enum lru_status ret = LRU_REMOVED_RETRY;
+ int writeback_result;
- /* Get an entry off the LRU */
- spin_lock(&pool->lru_lock);
- if (list_empty(&pool->lru)) {
- spin_unlock(&pool->lru_lock);
- return -EINVAL;
- }
- entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
- list_del_init(&entry->lru);
/*
* Once the lru lock is dropped, the entry might get freed. The
* swpoffset is copied to the stack, and entry isn't deref'd again
@@ -654,29 +880,46 @@ static int zswap_reclaim_entry(struct zswap_pool *pool)
*/
swpoffset = swp_offset(entry->swpentry);
tree = zswap_trees[swp_type(entry->swpentry)];
- spin_unlock(&pool->lru_lock);
+ list_lru_isolate(l, item);
+ /*
+ * It's safe to drop the lock here because we return either
+ * LRU_REMOVED_RETRY or LRU_RETRY.
+ */
+ spin_unlock(lock);
/* Check for invalidate() race */
spin_lock(&tree->lock);
- if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) {
- ret = -EAGAIN;
+ if (entry != zswap_rb_search(&tree->rbroot, swpoffset))
goto unlock;
- }
+
/* Hold a reference to prevent a free during writeback */
zswap_entry_get(entry);
spin_unlock(&tree->lock);
- ret = zswap_writeback_entry(entry, tree);
+ writeback_result = zswap_writeback_entry(entry, tree);
spin_lock(&tree->lock);
- if (ret) {
- /* Writeback failed, put entry back on LRU */
- spin_lock(&pool->lru_lock);
- list_move(&entry->lru, &pool->lru);
- spin_unlock(&pool->lru_lock);
+ if (writeback_result) {
+ zswap_reject_reclaim_fail++;
+ zswap_lru_putback(&entry->pool->list_lru, entry);
+ ret = LRU_RETRY;
+
+ /*
+ * Encountering a page already in swap cache is a sign that we are shrinking
+ * into the warmer region. We should terminate shrinking (if we're in the dynamic
+ * shrinker context).
+ */
+ if (writeback_result == -EEXIST && encountered_page_in_swapcache)
+ *encountered_page_in_swapcache = true;
+
goto put_unlock;
}
+ zswap_written_back_pages++;
+
+ if (entry->objcg)
+ count_objcg_event(entry->objcg, ZSWPWB);
+ count_vm_event(ZSWPWB);
/*
* Writeback started successfully, the page now belongs to the
* swapcache. Drop the entry from zswap - unless invalidate already
@@ -689,24 +932,94 @@ put_unlock:
zswap_entry_put(tree, entry);
unlock:
spin_unlock(&tree->lock);
- return ret ? -EAGAIN : 0;
+ spin_lock(lock);
+ return ret;
+}
+
+static int shrink_memcg(struct mem_cgroup *memcg)
+{
+ struct zswap_pool *pool;
+ int nid, shrunk = 0;
+
+ if (!mem_cgroup_zswap_writeback_enabled(memcg))
+ return -EINVAL;
+
+ /*
+ * Skip zombies because their LRUs are reparented and we would be
+ * reclaiming from the parent instead of the dead memcg.
+ */
+ if (memcg && !mem_cgroup_online(memcg))
+ return -ENOENT;
+
+ pool = zswap_pool_current_get();
+ if (!pool)
+ return -EINVAL;
+
+ for_each_node_state(nid, N_NORMAL_MEMORY) {
+ unsigned long nr_to_walk = 1;
+
+ shrunk += list_lru_walk_one(&pool->list_lru, nid, memcg,
+ &shrink_memcg_cb, NULL, &nr_to_walk);
+ }
+ zswap_pool_put(pool);
+ return shrunk ? 0 : -EAGAIN;
}
static void shrink_worker(struct work_struct *w)
{
struct zswap_pool *pool = container_of(w, typeof(*pool),
shrink_work);
+ struct mem_cgroup *memcg;
int ret, failures = 0;
+ /* global reclaim will select cgroup in a round-robin fashion. */
do {
- ret = zswap_reclaim_entry(pool);
- if (ret) {
- zswap_reject_reclaim_fail++;
- if (ret != -EAGAIN)
+ spin_lock(&zswap_pools_lock);
+ pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
+ memcg = pool->next_shrink;
+
+ /*
+ * We need to retry if we have gone through a full round trip, or if we
+ * got an offline memcg (or else we risk undoing the effect of the
+ * zswap memcg offlining cleanup callback). This is not catastrophic
+ * per se, but it will keep the now offlined memcg hostage for a while.
+ *
+ * Note that if we got an online memcg, we will keep the extra
+ * reference in case the original reference obtained by mem_cgroup_iter
+ * is dropped by the zswap memcg offlining callback, ensuring that the
+ * memcg is not killed when we are reclaiming.
+ */
+ if (!memcg) {
+ spin_unlock(&zswap_pools_lock);
+ if (++failures == MAX_RECLAIM_RETRIES)
break;
+
+ goto resched;
+ }
+
+ if (!mem_cgroup_tryget_online(memcg)) {
+ /* drop the reference from mem_cgroup_iter() */
+ mem_cgroup_iter_break(NULL, memcg);
+ pool->next_shrink = NULL;
+ spin_unlock(&zswap_pools_lock);
+
if (++failures == MAX_RECLAIM_RETRIES)
break;
+
+ goto resched;
}
+ spin_unlock(&zswap_pools_lock);
+
+ ret = shrink_memcg(memcg);
+ /* drop the extra reference */
+ mem_cgroup_put(memcg);
+
+ if (ret == -EINVAL)
+ break;
+ if (ret && ++failures == MAX_RECLAIM_RETRIES)
+ break;
+
+resched:
cond_resched();
} while (!zswap_can_accept());
zswap_pool_put(pool);
@@ -760,6 +1073,11 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
&pool->node);
if (ret)
goto error;
+
+ zswap_alloc_shrinker(pool);
+ if (!pool->shrinker)
+ goto error;
+
pr_debug("using %s compressor\n", pool->tfm_name);
/* being the current pool takes 1 ref; this func expects the
@@ -767,14 +1085,19 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
*/
kref_init(&pool->kref);
INIT_LIST_HEAD(&pool->list);
- INIT_LIST_HEAD(&pool->lru);
- spin_lock_init(&pool->lru_lock);
+ if (list_lru_init_memcg(&pool->list_lru, pool->shrinker))
+ goto lru_fail;
+ shrinker_register(pool->shrinker);
INIT_WORK(&pool->shrink_work, shrink_worker);
+ atomic_set(&pool->nr_stored, 0);
zswap_pool_debug("created", pool);
return pool;
+lru_fail:
+ list_lru_destroy(&pool->list_lru);
+ shrinker_free(pool->shrinker);
error:
if (pool->acomp_ctx)
free_percpu(pool->acomp_ctx);
@@ -832,8 +1155,16 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
zswap_pool_debug("destroying", pool);
+ shrinker_free(pool->shrinker);
cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
free_percpu(pool->acomp_ctx);
+ list_lru_destroy(&pool->list_lru);
+
+ spin_lock(&zswap_pools_lock);
+ mem_cgroup_iter_break(NULL, pool->next_shrink);
+ pool->next_shrink = NULL;
+ spin_unlock(&zswap_pools_lock);
+
for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
zpool_destroy_pool(pool->zpools[i]);
kfree(pool);
@@ -1040,18 +1371,47 @@ static int zswap_enabled_param_set(const char *val,
return ret;
}
+static void __zswap_load(struct zswap_entry *entry, struct page *page)
+{
+ struct zpool *zpool = zswap_find_zpool(entry);
+ struct scatterlist input, output;
+ struct crypto_acomp_ctx *acomp_ctx;
+ u8 *src;
+
+ acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
+ mutex_lock(&acomp_ctx->mutex);
+
+ src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
+ if (!zpool_can_sleep_mapped(zpool)) {
+ memcpy(acomp_ctx->buffer, src, entry->length);
+ src = acomp_ctx->buffer;
+ zpool_unmap_handle(zpool, entry->handle);
+ }
+
+ sg_init_one(&input, src, entry->length);
+ sg_init_table(&output, 1);
+ sg_set_page(&output, page, PAGE_SIZE, 0);
+ acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
+ BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
+ BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
+ mutex_unlock(&acomp_ctx->mutex);
+
+ if (zpool_can_sleep_mapped(zpool))
+ zpool_unmap_handle(zpool, entry->handle);
+}
+
/*********************************
* writeback code
**********************************/
/*
- * Attempts to free an entry by adding a page to the swap cache,
- * decompressing the entry data into the page, and issuing a
- * bio write to write the page back to the swap device.
+ * Attempts to free an entry by adding a folio to the swap cache,
+ * decompressing the entry data into the folio, and issuing a
+ * bio write to write the folio back to the swap device.
*
- * This can be thought of as a "resumed writeback" of the page
+ * This can be thought of as a "resumed writeback" of the folio
* to the swap device. We are basically resuming the same swap
* writeback path that was intercepted with the zswap_store()
- * in the first place. After the page has been decompressed into
+ * in the first place. After the folio has been decompressed into
* the swap cache, the compressed version stored by zswap can be
* freed.
*/
@@ -1059,110 +1419,60 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
struct zswap_tree *tree)
{
swp_entry_t swpentry = entry->swpentry;
- struct page *page;
+ struct folio *folio;
struct mempolicy *mpol;
- struct scatterlist input, output;
- struct crypto_acomp_ctx *acomp_ctx;
- struct zpool *pool = zswap_find_zpool(entry);
- bool page_was_allocated;
- u8 *src, *tmp = NULL;
- unsigned int dlen;
- int ret;
+ bool folio_was_allocated;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
};
- if (!zpool_can_sleep_mapped(pool)) {
- tmp = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (!tmp)
- return -ENOMEM;
- }
-
- /* try to allocate swap cache page */
+ /* try to allocate swap cache folio */
mpol = get_task_policy(current);
- page = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
- NO_INTERLEAVE_INDEX, &page_was_allocated);
- if (!page) {
- ret = -ENOMEM;
- goto fail;
- }
+ folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
+ NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
+ if (!folio)
+ return -ENOMEM;
- /* Found an existing page, we raced with load/swapin */
- if (!page_was_allocated) {
- put_page(page);
- ret = -EEXIST;
- goto fail;
+ /*
+ * Found an existing folio, we raced with load/swapin. We generally
+ * writeback cold folios from zswap, and swapin means the folio just
+ * became hot. Skip this folio and let the caller find another one.
+ */
+ if (!folio_was_allocated) {
+ folio_put(folio);
+ return -EEXIST;
}
/*
- * Page is locked, and the swapcache is now secured against
+ * folio is locked, and the swapcache is now secured against
* concurrent swapping to and from the slot. Verify that the
* swap entry hasn't been invalidated and recycled behind our
* backs (our zswap_entry reference doesn't prevent that), to
- * avoid overwriting a new swap page with old compressed data.
+ * avoid overwriting a new swap folio with old compressed data.
*/
spin_lock(&tree->lock);
if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
spin_unlock(&tree->lock);
- delete_from_swap_cache(page_folio(page));
- unlock_page(page);
- put_page(page);
- ret = -ENOMEM;
- goto fail;
+ delete_from_swap_cache(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ return -ENOMEM;
}
spin_unlock(&tree->lock);
- /* decompress */
- acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
- dlen = PAGE_SIZE;
+ __zswap_load(entry, &folio->page);
- src = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO);
- if (!zpool_can_sleep_mapped(pool)) {
- memcpy(tmp, src, entry->length);
- src = tmp;
- zpool_unmap_handle(pool, entry->handle);
- }
-
- mutex_lock(acomp_ctx->mutex);
- sg_init_one(&input, src, entry->length);
- sg_init_table(&output, 1);
- sg_set_page(&output, page, PAGE_SIZE, 0);
- acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
- ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
- dlen = acomp_ctx->req->dlen;
- mutex_unlock(acomp_ctx->mutex);
-
- if (!zpool_can_sleep_mapped(pool))
- kfree(tmp);
- else
- zpool_unmap_handle(pool, entry->handle);
-
- BUG_ON(ret);
- BUG_ON(dlen != PAGE_SIZE);
-
- /* page is up to date */
- SetPageUptodate(page);
+ /* folio is up to date */
+ folio_mark_uptodate(folio);
/* move it to the tail of the inactive list after end_writeback */
- SetPageReclaim(page);
+ folio_set_reclaim(folio);
/* start writeback */
- __swap_writepage(page, &wbc);
- put_page(page);
- zswap_written_back_pages++;
+ __swap_writepage(folio, &wbc);
+ folio_put(folio);
- return ret;
-
-fail:
- if (!zpool_can_sleep_mapped(pool))
- kfree(tmp);
-
- /*
- * If we get here because the page is already in swapcache, a
- * load may be happening concurrently. It is safe and okay to
- * not free the entry. It is also okay to return !0.
- */
- return ret;
+ return 0;
}
static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
@@ -1206,6 +1516,7 @@ bool zswap_store(struct folio *folio)
struct scatterlist input, output;
struct crypto_acomp_ctx *acomp_ctx;
struct obj_cgroup *objcg = NULL;
+ struct mem_cgroup *memcg = NULL;
struct zswap_pool *pool;
struct zpool *zpool;
unsigned int dlen = PAGE_SIZE;
@@ -1241,14 +1552,15 @@ bool zswap_store(struct folio *folio)
if (!zswap_enabled)
return false;
- /*
- * XXX: zswap reclaim does not work with cgroups yet. Without a
- * cgroup-aware entry LRU, we will push out entries system-wide based on
- * local cgroup limits.
- */
objcg = get_obj_cgroup_from_folio(folio);
- if (objcg && !obj_cgroup_may_zswap(objcg))
- goto reject;
+ if (objcg && !obj_cgroup_may_zswap(objcg)) {
+ memcg = get_mem_cgroup_from_objcg(objcg);
+ if (shrink_memcg(memcg)) {
+ mem_cgroup_put(memcg);
+ goto reject;
+ }
+ mem_cgroup_put(memcg);
+ }
/* reclaim space if needed */
if (zswap_is_full()) {
@@ -1265,23 +1577,23 @@ bool zswap_store(struct folio *folio)
}
/* allocate entry */
- entry = zswap_entry_cache_alloc(GFP_KERNEL);
+ entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
if (!entry) {
zswap_reject_kmemcache_fail++;
goto reject;
}
if (zswap_same_filled_pages_enabled) {
- src = kmap_atomic(page);
+ src = kmap_local_page(page);
if (zswap_is_page_same_filled(src, &value)) {
- kunmap_atomic(src);
+ kunmap_local(src);
entry->swpentry = swp_entry(type, offset);
entry->length = 0;
entry->value = value;
atomic_inc(&zswap_same_filled_pages);
goto insert_entry;
}
- kunmap_atomic(src);
+ kunmap_local(src);
}
if (!zswap_non_same_filled_pages_enabled)
@@ -1292,16 +1604,29 @@ bool zswap_store(struct folio *folio)
if (!entry->pool)
goto freepage;
+ if (objcg) {
+ memcg = get_mem_cgroup_from_objcg(objcg);
+ if (memcg_list_lru_alloc(memcg, &entry->pool->list_lru, GFP_KERNEL)) {
+ mem_cgroup_put(memcg);
+ goto put_pool;
+ }
+ mem_cgroup_put(memcg);
+ }
+
/* compress */
acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
- mutex_lock(acomp_ctx->mutex);
+ mutex_lock(&acomp_ctx->mutex);
- dst = acomp_ctx->dstmem;
+ dst = acomp_ctx->buffer;
sg_init_table(&input, 1);
- sg_set_page(&input, page, PAGE_SIZE, 0);
+ sg_set_page(&input, &folio->page, PAGE_SIZE, 0);
- /* zswap_dstmem is of size (PAGE_SIZE * 2). Reflect same in sg_list */
+ /*
+ * We need PAGE_SIZE * 2 here since there maybe over-compression case,
+ * and hardware-accelerators may won't check the dst buffer size, so
+ * giving the dst buffer with enough length to avoid buffer overflow.
+ */
sg_init_one(&output, dst, PAGE_SIZE * 2);
acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
/*
@@ -1341,7 +1666,7 @@ bool zswap_store(struct folio *folio)
buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
memcpy(buf, dst, dlen);
zpool_unmap_handle(zpool, handle);
- mutex_unlock(acomp_ctx->mutex);
+ mutex_unlock(&acomp_ctx->mutex);
/* populate entry */
entry->swpentry = swp_entry(type, offset);
@@ -1370,9 +1695,9 @@ insert_entry:
zswap_invalidate_entry(tree, dupentry);
}
if (entry->length) {
- spin_lock(&entry->pool->lru_lock);
- list_add(&entry->lru, &entry->pool->lru);
- spin_unlock(&entry->pool->lru_lock);
+ INIT_LIST_HEAD(&entry->lru);
+ zswap_lru_add(&entry->pool->list_lru, entry);
+ atomic_inc(&entry->pool->nr_stored);
}
spin_unlock(&tree->lock);
@@ -1384,7 +1709,8 @@ insert_entry:
return true;
put_dstmem:
- mutex_unlock(acomp_ctx->mutex);
+ mutex_unlock(&acomp_ctx->mutex);
+put_pool:
zswap_pool_put(entry->pool);
freepage:
zswap_entry_cache_free(entry);
@@ -1408,12 +1734,7 @@ bool zswap_load(struct folio *folio)
struct page *page = &folio->page;
struct zswap_tree *tree = zswap_trees[type];
struct zswap_entry *entry;
- struct scatterlist input, output;
- struct crypto_acomp_ctx *acomp_ctx;
- u8 *src, *dst, *tmp;
- struct zpool *zpool;
- unsigned int dlen;
- bool ret;
+ u8 *dst;
VM_WARN_ON_ONCE(!folio_test_locked(folio));
@@ -1426,67 +1747,30 @@ bool zswap_load(struct folio *folio)
}
spin_unlock(&tree->lock);
- if (!entry->length) {
- dst = kmap_atomic(page);
+ if (entry->length)
+ __zswap_load(entry, page);
+ else {
+ dst = kmap_local_page(page);
zswap_fill_page(dst, entry->value);
- kunmap_atomic(dst);
- ret = true;
- goto stats;
- }
-
- zpool = zswap_find_zpool(entry);
- if (!zpool_can_sleep_mapped(zpool)) {
- tmp = kmalloc(entry->length, GFP_KERNEL);
- if (!tmp) {
- ret = false;
- goto freeentry;
- }
- }
-
- /* decompress */
- dlen = PAGE_SIZE;
- src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
-
- if (!zpool_can_sleep_mapped(zpool)) {
- memcpy(tmp, src, entry->length);
- src = tmp;
- zpool_unmap_handle(zpool, entry->handle);
+ kunmap_local(dst);
}
- acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
- mutex_lock(acomp_ctx->mutex);
- sg_init_one(&input, src, entry->length);
- sg_init_table(&output, 1);
- sg_set_page(&output, page, PAGE_SIZE, 0);
- acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
- if (crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait))
- WARN_ON(1);
- mutex_unlock(acomp_ctx->mutex);
-
- if (zpool_can_sleep_mapped(zpool))
- zpool_unmap_handle(zpool, entry->handle);
- else
- kfree(tmp);
-
- ret = true;
-stats:
count_vm_event(ZSWPIN);
if (entry->objcg)
count_objcg_event(entry->objcg, ZSWPIN);
-freeentry:
+
spin_lock(&tree->lock);
- if (ret && zswap_exclusive_loads_enabled) {
+ if (zswap_exclusive_loads_enabled) {
zswap_invalidate_entry(tree, entry);
folio_mark_dirty(folio);
} else if (entry->length) {
- spin_lock(&entry->pool->lru_lock);
- list_move(&entry->lru, &entry->pool->lru);
- spin_unlock(&entry->pool->lru_lock);
+ zswap_lru_del(&entry->pool->list_lru, entry);
+ zswap_lru_add(&entry->pool->list_lru, entry);
}
zswap_entry_put(tree, entry);
spin_unlock(&tree->lock);
- return ret;
+ return true;
}
void zswap_invalidate(int type, pgoff_t offset)
@@ -1600,13 +1884,6 @@ static int zswap_setup(void)
goto cache_fail;
}
- ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare",
- zswap_dstmem_prepare, zswap_dstmem_dead);
- if (ret) {
- pr_err("dstmem alloc failed\n");
- goto dstmem_fail;
- }
-
ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
"mm/zswap_pool:prepare",
zswap_cpu_comp_prepare,
@@ -1638,8 +1915,6 @@ fallback_fail:
if (pool)
zswap_pool_destroy(pool);
hp_fail:
- cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE);
-dstmem_fail:
kmem_cache_destroy(zswap_entry_cache);
cache_fail:
/* if built-in, we aren't unloaded on failure; don't allow use */