Diffstat (limited to 'mm/page_owner.c')
 -rw-r--r--  mm/page_owner.c  377
 1 file changed, 311 insertions(+), 66 deletions(-)
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 5634e5d890..8eed0f3dc0 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -36,6 +36,15 @@ struct page_owner {
 	pid_t free_tgid;
 };
 
+struct stack {
+	struct stack_record *stack_record;
+	struct stack *next;
+};
+static struct stack dummy_stack;
+static struct stack failure_stack;
+static struct stack *stack_list;
+static DEFINE_SPINLOCK(stack_list_lock);
+
 static bool page_owner_enabled __initdata;
 DEFINE_STATIC_KEY_FALSE(page_owner_inited);
 
@@ -45,6 +54,22 @@ static depot_stack_handle_t early_handle;
 
 static void init_early_allocated_pages(void);
 
+static inline void set_current_in_page_owner(void)
+{
+	/*
+	 * Avoid recursion.
+	 *
+	 * We might need to allocate more memory from page_owner code, so make
+	 * sure to signal it in order to avoid recursion.
+	 */
+	current->in_page_owner = 1;
+}
+
+static inline void unset_current_in_page_owner(void)
+{
+	current->in_page_owner = 0;
+}
+
 static int __init early_page_owner_param(char *buf)
 {
 	int ret = kstrtobool(buf, &page_owner_enabled);
@@ -93,8 +118,17 @@ static __init void init_page_owner(void)
 	register_dummy_stack();
 	register_failure_stack();
 	register_early_stack();
-	static_branch_enable(&page_owner_inited);
 	init_early_allocated_pages();
+	/* Initialize dummy and failure stacks and link them to stack_list */
+	dummy_stack.stack_record = __stack_depot_get_stack_record(dummy_handle);
+	failure_stack.stack_record = __stack_depot_get_stack_record(failure_handle);
+	if (dummy_stack.stack_record)
+		refcount_set(&dummy_stack.stack_record->count, 1);
+	if (failure_stack.stack_record)
+		refcount_set(&failure_stack.stack_record->count, 1);
+	dummy_stack.next = &failure_stack;
+	stack_list = &dummy_stack;
+	static_branch_enable(&page_owner_inited);
 }
 
 struct page_ext_operations page_owner_ops = {
@@ -115,81 +149,177 @@ static noinline depot_stack_handle_t save_stack(gfp_t flags)
 	depot_stack_handle_t handle;
 	unsigned int nr_entries;
 
-	/*
-	 * Avoid recursion.
-	 *
-	 * Sometimes page metadata allocation tracking requires more
-	 * memory to be allocated:
-	 * - when new stack trace is saved to stack depot
-	 */
 	if (current->in_page_owner)
 		return dummy_handle;
-	current->in_page_owner = 1;
+	set_current_in_page_owner();
 
 	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
 	handle = stack_depot_save(entries, nr_entries, flags);
 	if (!handle)
 		handle = failure_handle;
+	unset_current_in_page_owner();
 
-	current->in_page_owner = 0;
 	return handle;
 }
 
-void __reset_page_owner(struct page *page, unsigned short order)
+static void add_stack_record_to_list(struct stack_record *stack_record,
+				     gfp_t gfp_mask)
 {
-	int i;
-	struct page_ext *page_ext;
-	depot_stack_handle_t handle;
-	struct page_owner *page_owner;
-	u64 free_ts_nsec = local_clock();
+	unsigned long flags;
+	struct stack *stack;
+
+	/* Filter gfp_mask the same way stackdepot does, for consistency */
+	gfp_mask &= ~GFP_ZONEMASK;
+	gfp_mask &= (GFP_ATOMIC | GFP_KERNEL | __GFP_NOLOCKDEP);
+	gfp_mask |= __GFP_NOWARN;
+
+	set_current_in_page_owner();
+	stack = kmalloc(sizeof(*stack), gfp_mask);
+	if (!stack) {
+		unset_current_in_page_owner();
+		return;
+	}
+	unset_current_in_page_owner();
 
-	page_ext = page_ext_get(page);
-	if (unlikely(!page_ext))
+	stack->stack_record = stack_record;
+	stack->next = NULL;
+
+	spin_lock_irqsave(&stack_list_lock, flags);
+	stack->next = stack_list;
+	/*
+	 * This pairs with smp_load_acquire() from function
+	 * stack_start(). This guarantees that stack_start()
+	 * will see an updated stack_list before starting to
+	 * traverse the list.
+	 */
+	smp_store_release(&stack_list, stack);
+	spin_unlock_irqrestore(&stack_list_lock, flags);
+}
+
+static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask,
+				   int nr_base_pages)
+{
+	struct stack_record *stack_record = __stack_depot_get_stack_record(handle);
+
+	if (!stack_record)
 		return;
 
-	handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
-	for (i = 0; i < (1 << order); i++) {
-		__clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
-		page_owner = get_page_owner(page_ext);
-		page_owner->free_handle = handle;
-		page_owner->free_ts_nsec = free_ts_nsec;
-		page_owner->free_pid = current->pid;
-		page_owner->free_tgid = current->tgid;
-		page_ext = page_ext_next(page_ext);
+	/*
+	 * New stack_record's that do not use STACK_DEPOT_FLAG_GET start
+	 * with REFCOUNT_SATURATED to catch spurious increments of their
+	 * refcount.
+	 * Since we do not use STACK_DEPOT_FLAG_GET API, let us
+	 * set a refcount of 1 ourselves.
+	 */
+	if (refcount_read(&stack_record->count) == REFCOUNT_SATURATED) {
+		int old = REFCOUNT_SATURATED;
+
+		if (atomic_try_cmpxchg_relaxed(&stack_record->count.refs, &old, 1))
+			/* Add the new stack_record to our list */
+			add_stack_record_to_list(stack_record, gfp_mask);
 	}
-	page_ext_put(page_ext);
+	refcount_add(nr_base_pages, &stack_record->count);
 }
 
-static inline void __set_page_owner_handle(struct page_ext *page_ext,
-					depot_stack_handle_t handle,
-					unsigned short order, gfp_t gfp_mask)
+static void dec_stack_record_count(depot_stack_handle_t handle,
+				   int nr_base_pages)
+{
+	struct stack_record *stack_record = __stack_depot_get_stack_record(handle);
+
+	if (!stack_record)
+		return;
+
+	if (refcount_sub_and_test(nr_base_pages, &stack_record->count))
+		pr_warn("%s: refcount went to 0 for %u handle\n", __func__,
+			handle);
+}
+
+static inline void __update_page_owner_handle(struct page_ext *page_ext,
+					      depot_stack_handle_t handle,
+					      unsigned short order,
+					      gfp_t gfp_mask,
+					      short last_migrate_reason, u64 ts_nsec,
+					      pid_t pid, pid_t tgid, char *comm)
 {
-	struct page_owner *page_owner;
 	int i;
-	u64 ts_nsec = local_clock();
+	struct page_owner *page_owner;
 
 	for (i = 0; i < (1 << order); i++) {
 		page_owner = get_page_owner(page_ext);
 		page_owner->handle = handle;
 		page_owner->order = order;
 		page_owner->gfp_mask = gfp_mask;
-		page_owner->last_migrate_reason = -1;
-		page_owner->pid = current->pid;
-		page_owner->tgid = current->tgid;
+		page_owner->last_migrate_reason = last_migrate_reason;
+		page_owner->pid = pid;
+		page_owner->tgid = tgid;
 		page_owner->ts_nsec = ts_nsec;
-		strscpy(page_owner->comm, current->comm,
+		strscpy(page_owner->comm, comm,
 			sizeof(page_owner->comm));
 		__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
 		__set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
+		page_ext = page_ext_next(page_ext);
+	}
+}
+
+static inline void __update_page_owner_free_handle(struct page_ext *page_ext,
+						    depot_stack_handle_t handle,
+						    unsigned short order,
+						    pid_t pid, pid_t tgid,
+						    u64 free_ts_nsec)
+{
+	int i;
+	struct page_owner *page_owner;
+
+	for (i = 0; i < (1 << order); i++) {
+		page_owner = get_page_owner(page_ext);
+		/* Only __reset_page_owner() wants to clear the bit */
+		if (handle) {
+			__clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
+			page_owner->free_handle = handle;
+		}
+		page_owner->free_ts_nsec = free_ts_nsec;
+		page_owner->free_pid = current->pid;
+		page_owner->free_tgid = current->tgid;
 		page_ext = page_ext_next(page_ext);
 	}
 }
 
+void __reset_page_owner(struct page *page, unsigned short order)
+{
+	struct page_ext *page_ext;
+	depot_stack_handle_t handle;
+	depot_stack_handle_t alloc_handle;
+	struct page_owner *page_owner;
+	u64 free_ts_nsec = local_clock();
+
+	page_ext = page_ext_get(page);
+	if (unlikely(!page_ext))
+		return;
+
+	page_owner = get_page_owner(page_ext);
+	alloc_handle = page_owner->handle;
+
+	handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
+	__update_page_owner_free_handle(page_ext, handle, order, current->pid,
+					current->tgid, free_ts_nsec);
+	page_ext_put(page_ext);
+
+	if (alloc_handle != early_handle)
+		/*
+		 * early_handle is being set as a handle for all those
+		 * early allocated pages. See init_pages_in_zone().
+		 * Since their refcount is not being incremented because
+		 * the machinery is not ready yet, we cannot decrement
+		 * their refcount either.
+		 */
+		dec_stack_record_count(alloc_handle, 1 << order);
+}
+
 noinline void __set_page_owner(struct page *page, unsigned short order,
 					gfp_t gfp_mask)
 {
 	struct page_ext *page_ext;
+	u64 ts_nsec = local_clock();
 	depot_stack_handle_t handle;
 
 	handle = save_stack(gfp_mask);
@@ -197,8 +327,11 @@ noinline void __set_page_owner(struct page *page, unsigned short order,
 	page_ext = page_ext_get(page);
 	if (unlikely(!page_ext))
 		return;
-	__set_page_owner_handle(page_ext, handle, order, gfp_mask);
+	__update_page_owner_handle(page_ext, handle, order, gfp_mask, -1,
+				   ts_nsec, current->pid, current->tgid,
+				   current->comm);
 	page_ext_put(page_ext);
+	inc_stack_record_count(handle, gfp_mask, 1 << order);
 }
 
 void __set_page_owner_migrate_reason(struct page *page, int reason)
@@ -214,7 +347,7 @@ void __set_page_owner_migrate_reason(struct page *page, int reason)
 	page_ext_put(page_ext);
 }
 
-void __split_page_owner(struct page *page, unsigned int nr)
+void __split_page_owner(struct page *page, int old_order, int new_order)
 {
 	int i;
 	struct page_ext *page_ext = page_ext_get(page);
@@ -223,9 +356,9 @@ void __split_page_owner(struct page *page, unsigned int nr)
 	if (unlikely(!page_ext))
 		return;
 
-	for (i = 0; i < nr; i++) {
+	for (i = 0; i < (1 << old_order); i++) {
 		page_owner = get_page_owner(page_ext);
-		page_owner->order = 0;
+		page_owner->order = new_order;
 		page_ext = page_ext_next(page_ext);
 	}
 	page_ext_put(page_ext);
@@ -233,9 +366,12 @@ void __split_page_owner(struct page *page, unsigned int nr)
 
 void __folio_copy_owner(struct folio *newfolio, struct folio *old)
 {
+	int i;
 	struct page_ext *old_ext;
 	struct page_ext *new_ext;
-	struct page_owner *old_page_owner, *new_page_owner;
+	struct page_owner *old_page_owner;
+	struct page_owner *new_page_owner;
+	depot_stack_handle_t migrate_handle;
 
 	old_ext = page_ext_get(&old->page);
 	if (unlikely(!old_ext))
@@ -249,30 +385,32 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old)
 
 	old_page_owner = get_page_owner(old_ext);
 	new_page_owner = get_page_owner(new_ext);
-	new_page_owner->order = old_page_owner->order;
-	new_page_owner->gfp_mask = old_page_owner->gfp_mask;
-	new_page_owner->last_migrate_reason =
-		old_page_owner->last_migrate_reason;
-	new_page_owner->handle = old_page_owner->handle;
-	new_page_owner->pid = old_page_owner->pid;
-	new_page_owner->tgid = old_page_owner->tgid;
-	new_page_owner->free_pid = old_page_owner->free_pid;
-	new_page_owner->free_tgid = old_page_owner->free_tgid;
-	new_page_owner->ts_nsec = old_page_owner->ts_nsec;
-	new_page_owner->free_ts_nsec = old_page_owner->ts_nsec;
-	strcpy(new_page_owner->comm, old_page_owner->comm);
-
+	migrate_handle = new_page_owner->handle;
+	__update_page_owner_handle(new_ext, old_page_owner->handle,
+				   old_page_owner->order, old_page_owner->gfp_mask,
+				   old_page_owner->last_migrate_reason,
+				   old_page_owner->ts_nsec, old_page_owner->pid,
+				   old_page_owner->tgid, old_page_owner->comm);
 	/*
-	 * We don't clear the bit on the old folio as it's going to be freed
-	 * after migration. Until then, the info can be useful in case of
-	 * a bug, and the overall stats will be off a bit only temporarily.
-	 * Also, migrate_misplaced_transhuge_page() can still fail the
-	 * migration and then we want the old folio to retain the info. But
-	 * in that case we also don't need to explicitly clear the info from
-	 * the new page, which will be freed.
+	 * Do not proactively clear PAGE_EXT_OWNER{_ALLOCATED} bits as the folio
+	 * will be freed after migration. Keep them until then as they may be
+	 * useful.
 	 */
-	__set_bit(PAGE_EXT_OWNER, &new_ext->flags);
-	__set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags);
+	__update_page_owner_free_handle(new_ext, 0, old_page_owner->order,
+					old_page_owner->free_pid,
+					old_page_owner->free_tgid,
+					old_page_owner->free_ts_nsec);
+	/*
+	 * We linked the original stack to the new folio, we need to do the same
+	 * for the new one and the old folio otherwise there will be an imbalance
+	 * when subtracting those pages from the stack.
+	 */
+	for (i = 0; i < (1 << new_page_owner->order); i++) {
+		old_page_owner->handle = migrate_handle;
+		old_ext = page_ext_next(old_ext);
+		old_page_owner = get_page_owner(old_ext);
+	}
+
 	page_ext_put(new_ext);
 	page_ext_put(old_ext);
 }
@@ -680,8 +818,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
 				goto ext_put_continue;
 
 			/* Found early allocated page */
-			__set_page_owner_handle(page_ext, early_handle,
-						0, 0);
+			__update_page_owner_handle(page_ext, early_handle, 0, 0,
+						   -1, local_clock(), current->pid,
+						   current->tgid, current->comm);
 			count++;
 ext_put_continue:
 			page_ext_put(page_ext);
@@ -719,8 +858,109 @@ static const struct file_operations proc_page_owner_operations = {
 	.llseek = lseek_page_owner,
 };
 
+static void *stack_start(struct seq_file *m, loff_t *ppos)
+{
+	struct stack *stack;
+
+	if (*ppos == -1UL)
+		return NULL;
+
+	if (!*ppos) {
+		/*
+		 * This pairs with smp_store_release() from function
+		 * add_stack_record_to_list(), so we get a consistent
+		 * value of stack_list.
+		 */
+		stack = smp_load_acquire(&stack_list);
+		m->private = stack;
+	} else {
+		stack = m->private;
+	}
+
+	return stack;
+}
+
+static void *stack_next(struct seq_file *m, void *v, loff_t *ppos)
+{
+	struct stack *stack = v;
+
+	stack = stack->next;
+	*ppos = stack ? *ppos + 1 : -1UL;
+	m->private = stack;
+
+	return stack;
+}
+
+static unsigned long page_owner_pages_threshold;
+
+static int stack_print(struct seq_file *m, void *v)
+{
+	int i, nr_base_pages;
+	struct stack *stack = v;
+	unsigned long *entries;
+	unsigned long nr_entries;
+	struct stack_record *stack_record = stack->stack_record;
+
+	if (!stack->stack_record)
+		return 0;
+
+	nr_entries = stack_record->size;
+	entries = stack_record->entries;
+	nr_base_pages = refcount_read(&stack_record->count) - 1;
+
+	if (nr_base_pages < 1 || nr_base_pages < page_owner_pages_threshold)
+		return 0;
+
+	for (i = 0; i < nr_entries; i++)
+		seq_printf(m, " %pS\n", (void *)entries[i]);
+	seq_printf(m, "nr_base_pages: %d\n\n", nr_base_pages);
+
+	return 0;
+}
+
+static void stack_stop(struct seq_file *m, void *v)
+{
+}
+
+static const struct seq_operations page_owner_stack_op = {
+	.start	= stack_start,
+	.next	= stack_next,
+	.stop	= stack_stop,
+	.show	= stack_print
+};
+
+static int page_owner_stack_open(struct inode *inode, struct file *file)
+{
+	return seq_open_private(file, &page_owner_stack_op, 0);
+}
+
+static const struct file_operations page_owner_stack_operations = {
+	.open		= page_owner_stack_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int page_owner_threshold_get(void *data, u64 *val)
+{
+	*val = READ_ONCE(page_owner_pages_threshold);
+	return 0;
+}
+
+static int page_owner_threshold_set(void *data, u64 val)
+{
+	WRITE_ONCE(page_owner_pages_threshold, val);
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(proc_page_owner_threshold, &page_owner_threshold_get,
+			&page_owner_threshold_set, "%llu");
+
 static int __init pageowner_init(void)
 {
+	struct dentry *dir;
+
 	if (!static_branch_unlikely(&page_owner_inited)) {
 		pr_info("page_owner is disabled\n");
 		return 0;
@@ -728,6 +968,11 @@ static int __init pageowner_init(void)
 
 	debugfs_create_file("page_owner", 0400, NULL, NULL,
 			    &proc_page_owner_operations);
+	dir = debugfs_create_dir("page_owner_stacks", NULL);
+	debugfs_create_file("show_stacks", 0400, dir, NULL,
+			    &page_owner_stack_operations);
+	debugfs_create_file("count_threshold", 0600, dir, NULL,
+			    &proc_page_owner_threshold);
 
 	return 0;
 }
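
A minimal userspace sketch of how the new debugfs interface might be consumed (illustrative only, not part of the patch; it assumes debugfs is mounted at the conventional /sys/kernel/debug and that the kernel was booted with page_owner=on). It writes an example threshold of 1024 base pages to count_threshold and then dumps show_stacks:

/*
 * Sketch: dump page_owner allocation stacks via the new debugfs files.
 * Assumptions: debugfs at /sys/kernel/debug, page_owner=on on the cmdline.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *thr = "1024";	/* example threshold, in base pages */
	char buf[4096];
	ssize_t n;
	int fd;

	/* Hide stacks that currently back fewer than 1024 base pages. */
	fd = open("/sys/kernel/debug/page_owner_stacks/count_threshold", O_WRONLY);
	if (fd >= 0) {
		if (write(fd, thr, strlen(thr)) < 0)
			perror("write count_threshold");
		close(fd);
	}

	/* Dump every remaining stack together with its nr_base_pages line. */
	fd = open("/sys/kernel/debug/page_owner_stacks/show_stacks", O_RDONLY);
	if (fd < 0) {
		perror("open show_stacks");
		return 1;
	}
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, (size_t)n, stdout);
	close(fd);

	return 0;
}

Each stack is printed by stack_print() as a block of %pS-formatted frames followed by an "nr_base_pages:" line, so the dump can be sorted or compared over time to spot allocation sites whose page count keeps growing.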