// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2019 Sultan Alsawaf <sultan@kerneltoast.com>.
 */

#define pr_fmt(fmt) "simple_lmk: " fmt

#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/mm.h>
#include <linux/moduleparam.h>
#include <linux/oom.h>
#include <linux/sort.h>
#include <linux/version.h>

/* The sched_param struct is located elsewhere in newer kernels */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0)
#include <uapi/linux/sched/types.h>
#endif

/* MIN_NICE isn't present and MAX_RT_PRIO is elsewhere in older kernels */
#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0)
#include <linux/sched/rt.h>
#define MIN_NICE -20
#endif

/* SEND_SIG_FORCED isn't present in newer kernels */
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 19, 0)
#define SIG_INFO_TYPE SEND_SIG_FORCED
#else
#define SIG_INFO_TYPE SEND_SIG_PRIV
#endif

/* The group argument to do_send_sig_info is different in newer kernels */
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0)
#define KILL_GROUP_TYPE true
#else
#define KILL_GROUP_TYPE PIDTYPE_TGID
#endif

/* The minimum number of pages to free per reclaim */
#define MIN_FREE_PAGES (CONFIG_ANDROID_SIMPLE_LMK_MINFREE * SZ_1M / PAGE_SIZE)
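/*
 * Worked example (assuming CONFIG_ANDROID_SIMPLE_LMK_MINFREE=64, i.e. 64 MiB,
 * and 4 KiB pages): MIN_FREE_PAGES = 64 * SZ_1M / 4096 = 16384 pages, so each
 * reclaim pass tries to free at least 64 MiB of resident memory.
 */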
/* Kill up to this many victims per reclaim */
#define MAX_VICTIMS 1024

struct victim_info {
	struct task_struct *tsk;
	struct mm_struct *mm;
	unsigned long size;
};

/* Pulled from the Android framework. Lower adj means higher priority. */
static const short adj_prio[] = {
	906, /* CACHED_APP_MAX_ADJ */
	905, /* Cached app */
	904, /* Cached app */
	903, /* Cached app */
	902, /* Cached app */
	901, /* Cached app */
	900, /* CACHED_APP_MIN_ADJ */
	800, /* SERVICE_B_ADJ */
	700, /* PREVIOUS_APP_ADJ */
	600, /* HOME_APP_ADJ */
	500, /* SERVICE_ADJ */
	400, /* HEAVY_WEIGHT_APP_ADJ */
	300, /* BACKUP_APP_ADJ */
	200, /* PERCEPTIBLE_APP_ADJ */
	100, /* VISIBLE_APP_ADJ */
	0    /* FOREGROUND_APP_ADJ */
};

static struct victim_info victims[MAX_VICTIMS];
static DECLARE_WAIT_QUEUE_HEAD(oom_waitq);
static DECLARE_COMPLETION(reclaim_done);
static int victims_to_kill;
static atomic_t needs_reclaim = ATOMIC_INIT(0);

static int victim_size_cmp(const void *lhs_ptr, const void *rhs_ptr)
{
	const struct victim_info *lhs = (typeof(lhs))lhs_ptr;
	const struct victim_info *rhs = (typeof(rhs))rhs_ptr;

	return rhs->size - lhs->size;
}

static bool vtsk_is_duplicate(struct victim_info *varr, int vlen,
			      struct task_struct *vtsk)
{
	int i;

	for (i = 0; i < vlen; i++) {
		if (same_thread_group(varr[i].tsk, vtsk))
			return true;
	}

	return false;
}

static unsigned long find_victims(struct victim_info *varr, int *vindex,
				  int vmaxlen, short target_adj)
{
	unsigned long pages_found = 0;
	int old_vindex = *vindex;
	struct task_struct *tsk;

	for_each_process(tsk) {
		struct task_struct *vtsk;

		/*
		 * Search for tasks with the targeted importance (adj). Since
		 * only tasks with a positive adj can be targeted, that
		 * naturally excludes tasks which shouldn't be killed, like
		 * init and kthreads. Although oom_score_adj can still be
		 * changed while this code runs, it doesn't really matter. We
		 * just need to make sure that if the adj changes, we won't
		 * deadlock trying to lock a task that we locked earlier.
		 */
		if (READ_ONCE(tsk->signal->oom_score_adj) != target_adj ||
		    vtsk_is_duplicate(varr, *vindex, tsk))
			continue;

		vtsk = find_lock_task_mm(tsk);
		if (!vtsk)
			continue;

		/* Store this potential victim away for later */
		varr[*vindex].tsk = vtsk;
		varr[*vindex].mm = vtsk->mm;
		varr[*vindex].size = get_mm_rss(vtsk->mm);

		/* Keep track of the number of pages that have been found */
		pages_found += varr[*vindex].size;

		/* Make sure there's space left in the victim array */
		if (++*vindex == vmaxlen)
			break;
	}

	/*
	 * Sort the victims in descending order of size to prioritize killing
	 * the larger ones first.
	 */
	if (pages_found)
		sort(&varr[old_vindex], *vindex - old_vindex, sizeof(*varr),
		     victim_size_cmp, NULL);

	return pages_found;
}

static int process_victims(struct victim_info *varr, int vlen,
			   unsigned long pages_needed)
{
	unsigned long pages_found = 0;
	int i, nr_to_kill = 0;

	/*
	 * Calculate the number of tasks that need to be killed and quickly
	 * release the references to those that'll live.
	 */
	for (i = 0; i < vlen; i++) {
		struct victim_info *victim = &varr[i];
		struct task_struct *vtsk = victim->tsk;

		/* The victim's mm lock is taken in find_victims; release it */
		if (pages_found >= pages_needed) {
			task_unlock(vtsk);
			continue;
		}

		pages_found += victim->size;
		nr_to_kill++;
	}

	return nr_to_kill;
}
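/*
 * A hypothetical walk-through of the two-pass selection (sizes made up for
 * illustration): suppose find_victims() picked a 10k-page victim at adj 906
 * and then a 50k-page victim at adj 900, with pages_needed = 40k. The first
 * process_victims() pass keeps both, because it only stops taking victims
 * once the running total reaches 40k. scan_and_kill() then sorts the kill
 * list by size, placing the 50k victim first, and the second pass spares the
 * 10k victim since the 50k one alone covers the target. The net effect is
 * that one large, lower-adj victim dies instead of several small, higher-adj
 * ones.
 */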
static void scan_and_kill(unsigned long pages_needed)
{
	int i, nr_to_kill = 0, nr_victims = 0;
	unsigned long pages_found = 0;

	/*
	 * Hold the tasklist lock so tasks don't disappear while scanning.
	 * This is preferred to holding an RCU read lock so that the list of
	 * tasks is guaranteed to be up to date.
	 */
	read_lock(&tasklist_lock);
	for (i = 0; i < ARRAY_SIZE(adj_prio); i++) {
		pages_found += find_victims(victims, &nr_victims, MAX_VICTIMS,
					    adj_prio[i]);
		if (pages_found >= pages_needed || nr_victims == MAX_VICTIMS)
			break;
	}
	read_unlock(&tasklist_lock);

	/* Pretty unlikely but it can happen */
	if (unlikely(!nr_victims))
		return;

	/* First round of victim processing to weed out unneeded victims */
	nr_to_kill = process_victims(victims, nr_victims, pages_needed);

	/*
	 * Try to kill as few of the chosen victims as possible by sorting
	 * them by size, so that larger victims with a lower adj can be killed
	 * in place of smaller victims with a higher adj.
	 */
	sort(victims, nr_to_kill, sizeof(*victims), victim_size_cmp, NULL);

	/* Second round of victim processing to finally select the victims */
	nr_to_kill = process_victims(victims, nr_to_kill, pages_needed);

	/* Kill the victims */
	WRITE_ONCE(victims_to_kill, nr_to_kill);
	for (i = 0; i < nr_to_kill; i++) {
		struct victim_info *victim = &victims[i];
		struct task_struct *vtsk = victim->tsk;

		pr_info("Killing %s with adj %d to free %lu KiB\n", vtsk->comm,
			vtsk->signal->oom_score_adj,
			victim->size << (PAGE_SHIFT - 10));

		/* Accelerate the victim's death by forcing the kill signal */
		do_send_sig_info(SIGKILL, SIG_INFO_TYPE, vtsk,
				 KILL_GROUP_TYPE);

		/* Grab a reference to the victim for later before unlocking */
		get_task_struct(vtsk);
		task_unlock(vtsk);
	}

	/* Try to speed up the death process now that we can schedule again */
	for (i = 0; i < nr_to_kill; i++) {
		struct task_struct *vtsk = victims[i].tsk;

		/* Increase the victim's priority to make it die faster */
		set_user_nice(vtsk, MIN_NICE);

		/* Allow the victim to run on any CPU */
		set_cpus_allowed_ptr(vtsk, cpu_all_mask);

		/* Finally, release the victim reference acquired earlier */
		put_task_struct(vtsk);
	}

	/* Wait until all the victims die */
	wait_for_completion(&reclaim_done);
}

static int simple_lmk_reclaim_thread(void *data)
{
	static const struct sched_param sched_max_rt_prio = {
		.sched_priority = MAX_RT_PRIO - 1
	};

	sched_setscheduler_nocheck(current, SCHED_FIFO, &sched_max_rt_prio);

	while (1) {
		wait_event(oom_waitq, atomic_read(&needs_reclaim));

		/*
		 * Kill a batch of processes and wait for their memory to be
		 * freed. After their memory is freed, sleep for 20 ms to give
		 * OOM'd allocations a chance to scavenge for the newly-freed
		 * pages. Rinse and repeat while there are still OOM'd
		 * allocations.
		 */
		do {
			scan_and_kill(MIN_FREE_PAGES);
			msleep(20);
		} while (atomic_read(&needs_reclaim));
	}

	return 0;
}

void simple_lmk_decide_reclaim(int kswapd_priority)
{
	if (kswapd_priority != CONFIG_ANDROID_SIMPLE_LMK_AGGRESSION)
		return;

	if (!atomic_cmpxchg(&needs_reclaim, 0, 1))
		wake_up(&oom_waitq);
}

void simple_lmk_stop_reclaim(void)
{
	atomic_set(&needs_reclaim, 0);
}

void simple_lmk_mm_freed(struct mm_struct *mm)
{
	static atomic_t nr_killed = ATOMIC_INIT(0);
	int i, nr_to_kill;

	nr_to_kill = READ_ONCE(victims_to_kill);
	for (i = 0; i < nr_to_kill; i++) {
		if (cmpxchg(&victims[i].mm, mm, NULL) == mm) {
			if (atomic_inc_return(&nr_killed) == nr_to_kill) {
				WRITE_ONCE(victims_to_kill, 0);
				nr_killed = (atomic_t)ATOMIC_INIT(0);
				complete(&reclaim_done);
			}
			break;
		}
	}
}

/* Initialize Simple LMK when lmkd in Android writes to the minfree parameter */
static int simple_lmk_init_set(const char *val, const struct kernel_param *kp)
{
	static atomic_t init_done = ATOMIC_INIT(0);
	struct task_struct *thread;

	if (atomic_cmpxchg(&init_done, 0, 1))
		return 0;

	thread = kthread_run_perf_critical(simple_lmk_reclaim_thread, NULL,
					   "simple_lmkd");
	BUG_ON(IS_ERR(thread));

	return 0;
}

static const struct kernel_param_ops simple_lmk_init_ops = {
	.set = simple_lmk_init_set
};

/* Needed to prevent Android from thinking there's no LMK and thus rebooting */
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "lowmemorykiller."
module_param_cb(minfree, &simple_lmk_init_ops, NULL, 0200);
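/*
 * Usage sketch (an assumption about the userspace side, not code in this
 * file): because the parameter is registered under the "lowmemorykiller."
 * prefix, Android's lmkd sees the interface it expects from the classic
 * lowmemorykiller driver and writes its minfree thresholds at boot, e.g.:
 *
 *   echo "18432,23040,27648,32256,55296,80640" \
 *       > /sys/module/lowmemorykiller/parameters/minfree
 *
 * The first such write invokes simple_lmk_init_set(), which spawns the
 * "simple_lmkd" reclaim thread exactly once; the written values themselves
 * are ignored.
 */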