Skip to content

Commit ad4e549

Browse files
committed
staging: android: lowmemorykiller: implement task's adj rbtree
Based on the current LMK implementation, LMK has to scan all processes to select the correct task to kill during low memory. The basic idea of the optimization is to queue all tasks by oom_score_adj priority, so that LMK just selects the proper task to kill from the queue (an rbtree). Performance improvement: with the current implementation, the average time to find a task to kill is 1004us; with the optimized implementation, the average time to find a task to kill is 43us. Squashed another 3 commits: "staging: android: lowmemorykiller: select a new task to kill": Under certain circumstances, a process may take time to handle a SIGKILL. When lowmemkiller is called again shortly after, it would pick the same process to kill over and over, so that we can't get free memory for a long time. The solution is to check fatal_signal_pending() on the selected task, and if it's already pending, select a new task to kill. "staging: android: lmk: skip if killed by lmk": A task can be selected to kill when it's already exiting. Sending a signal to the task in that state won't set TIF_SIGPENDING for it. After that, every task, including threads in the exiting task itself, running into lmk will compete for the mutex, then spend some time waiting for the exiting task to release memory or try to send a signal to the same task again. We want the exiting task to exit as early as possible, but it's slowed down greatly by the mutex and the waiting. Skip lmk if the current thread group leader is exiting and is killed by lmk. Also delegate TIF_MEMDIE to the current task. "staging: android: lmk: check TIF directly": When trying to fix that lmk can't kill an exiting task (commit 895304e (CR) staging: android: lmk: skip if killed by lmk), an additional check was added for the TIF_MEMDIE flag. However, the task lock has already been taken when examining the task. The same helper function, which tries to grab the task lock again, will deadlock. Change to check the thread flag of the thread group leader directly.
Change-Id: Ibbb99cbd36e27a546195b4d4bed59b3829e6b7b0
1 parent 02972a4 commit ad4e549

File tree

6 files changed

+134
-22
lines changed

6 files changed

+134
-22
lines changed

drivers/staging/android/lowmemorykiller.c

Lines changed: 123 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
/* drivers/misc/lowmemorykiller.c
22
*
33
* The lowmemorykiller driver lets user-space specify a set of memory thresholds
4-
* where processes with a range of oom_score_adj values will get killed. Specify
5-
* the minimum oom_score_adj values in
4+
* where processes with a range of oom_adj values will get killed. Specify
5+
* the minimum oom_adj values in
66
* /sys/module/lowmemorykiller/parameters/adj and the number of free pages in
77
* /sys/module/lowmemorykiller/parameters/minfree. Both files take a comma
88
* separated list of numbers in ascending order.
99
*
1010
* For example, write "0,8" to /sys/module/lowmemorykiller/parameters/adj and
1111
* "1024,4096" to /sys/module/lowmemorykiller/parameters/minfree to kill
12-
* processes with a oom_score_adj value of 8 or higher when the free memory
13-
* drops below 4096 pages and kill processes with a oom_score_adj value of 0 or
12+
* processes with a oom_adj value of 8 or higher when the free memory
13+
* drops below 4096 pages and kill processes with a oom_adj value of 0 or
1414
* higher when the free memory drops below 1024 pages.
1515
*
1616
* The driver considers memory used for caches to be free, but if a large
@@ -56,6 +56,7 @@ static int lowmem_adj[6] = {
5656
1,
5757
6,
5858
12,
59+
15,
5960
};
6061
static int lowmem_adj_size = 4;
6162
static int lowmem_minfree[6] = {
@@ -149,6 +150,10 @@ void tune_lmk_zone_param(struct zonelist *zonelist, int classzone_idx,
149150
}
150151
}
151152

153+
static struct task_struct *pick_next_from_adj_tree(struct task_struct *task);
154+
static struct task_struct *pick_first_task(void);
155+
static struct task_struct *pick_last_task(void);
156+
152157
void tune_lmk_param(int *other_free, int *other_file, gfp_t gfp_mask)
153158
{
154159
struct zone *preferred_zone;
@@ -202,13 +207,19 @@ static int lowmem_shrink(struct shrinker *s, int nr_to_scan, gfp_t gfp_mask)
202207
int rem = 0;
203208
int tasksize;
204209
int i;
205-
int min_score_adj = OOM_SCORE_ADJ_MAX + 1;
210+
int min_adj = OOM_SCORE_ADJ_MAX + 1;
206211
int selected_tasksize = 0;
207-
int selected_oom_score_adj;
212+
int selected_oom_adj;
208213
int array_size = ARRAY_SIZE(lowmem_adj);
209214
int other_free;
210215
int other_file;
211-
216+
217+
tsk = current->group_leader;
218+
if ((tsk->flags & PF_EXITING) && test_task_flag(tsk, TIF_MEMDIE)) {
219+
set_tsk_thread_flag(current, TIF_MEMDIE);
220+
return 0;
221+
}
222+
212223
if (nr_to_scan > 0) {
213224
if (mutex_lock_interruptible(&scan_mutex) < 0)
214225
return 0;
@@ -227,19 +238,19 @@ static int lowmem_shrink(struct shrinker *s, int nr_to_scan, gfp_t gfp_mask)
227238
for (i = 0; i < array_size; i++) {
228239
if (other_free < lowmem_minfree[i] &&
229240
other_file < lowmem_minfree[i]) {
230-
min_score_adj = lowmem_adj[i];
241+
min_adj = lowmem_adj[i];
231242
break;
232243
}
233244
}
234245
if (nr_to_scan > 0)
235246
lowmem_print(3, "lowmem_shrink %d, %x, ofree %d %d, ma %d\n",
236247
nr_to_scan, gfp_mask, other_free,
237-
other_file, min_score_adj);
248+
other_file, min_adj);
238249
rem = global_page_state(NR_ACTIVE_ANON) +
239250
global_page_state(NR_ACTIVE_FILE) +
240251
global_page_state(NR_INACTIVE_ANON) +
241252
global_page_state(NR_INACTIVE_FILE);
242-
if (nr_to_scan <= 0 || min_score_adj == OOM_SCORE_ADJ_MAX + 1) {
253+
if (nr_to_scan <= 0 || min_adj == OOM_SCORE_ADJ_MAX + 1) {
243254
lowmem_print(5, "lowmem_shrink %d, %x, return %d\n",
244255
nr_to_scan, gfp_mask, rem);
245256

@@ -248,12 +259,14 @@ static int lowmem_shrink(struct shrinker *s, int nr_to_scan, gfp_t gfp_mask)
248259

249260
return rem;
250261
}
251-
selected_oom_score_adj = min_score_adj;
262+
selected_oom_adj = min_adj;
252263

253264
rcu_read_lock();
254-
for_each_process(tsk) {
265+
for (tsk = pick_first_task();
266+
tsk != pick_last_task();
267+
tsk = pick_next_from_adj_tree(tsk)) {
255268
struct task_struct *p;
256-
int oom_score_adj;
269+
int oom_adj;
257270

258271
if (tsk->flags & PF_KTHREAD)
259272
continue;
@@ -266,7 +279,10 @@ static int lowmem_shrink(struct shrinker *s, int nr_to_scan, gfp_t gfp_mask)
266279
if (test_task_flag(tsk, TIF_MEMDIE)) {
267280
rcu_read_unlock();
268281
/* give the system time to free up the memory */
269-
msleep_interruptible(20);
282+
if (!same_thread_group(current, tsk))
283+
msleep_interruptible(20);
284+
else
285+
set_tsk_thread_flag(current, TIF_MEMDIE);
270286
mutex_unlock(&scan_mutex);
271287
return 0;
272288
}
@@ -276,32 +292,42 @@ static int lowmem_shrink(struct shrinker *s, int nr_to_scan, gfp_t gfp_mask)
276292
if (!p)
277293
continue;
278294

279-
oom_score_adj = p->signal->oom_adj;
280-
if (oom_score_adj < min_score_adj) {
295+
oom_adj = p->signal->oom_adj;
296+
if (oom_adj < min_adj) {
297+
task_unlock(p);
298+
break;
299+
}
300+
301+
302+
if (fatal_signal_pending(p) ||
303+
((p->flags & PF_EXITING) &&
304+
test_tsk_thread_flag(p, TIF_MEMDIE))) {
305+
lowmem_print(2, "skip slow dying process %d\n", p->pid);
281306
task_unlock(p);
282307
continue;
283308
}
309+
284310
tasksize = get_mm_rss(p->mm);
285311
task_unlock(p);
286312
if (tasksize <= 0)
287313
continue;
288314
if (selected) {
289-
if (oom_score_adj < selected_oom_score_adj)
290-
continue;
291-
if (oom_score_adj == selected_oom_score_adj &&
315+
if (oom_adj < selected_oom_adj)
316+
break;
317+
if (oom_adj == selected_oom_adj &&
292318
tasksize <= selected_tasksize)
293319
continue;
294320
}
295321
selected = p;
296322
selected_tasksize = tasksize;
297-
selected_oom_score_adj = oom_score_adj;
323+
selected_oom_adj = oom_adj;
298324
lowmem_print(2, "select %d (%s), adj %d, size %d, to kill\n",
299-
p->pid, p->comm, oom_score_adj, tasksize);
325+
p->pid, p->comm, oom_adj, tasksize);
300326
}
301327
if (selected) {
302328
lowmem_print(1, "send sigkill to %d (%s), adj %d, size %d\n",
303329
selected->pid, selected->comm,
304-
selected_oom_score_adj, selected_tasksize);
330+
selected_oom_adj, selected_tasksize);
305331
lowmem_deathpending_timeout = jiffies + HZ;
306332
send_sig(SIGKILL, selected, 0);
307333
set_tsk_thread_flag(selected, TIF_MEMDIE);
@@ -334,6 +360,81 @@ static void __exit lowmem_exit(void)
334360
unregister_shrinker(&lowmem_shrinker);
335361
}
336362

363+
DEFINE_SPINLOCK(lmk_lock);
364+
struct rb_root tasksadj = RB_ROOT;
365+
void add_2_adj_tree(struct task_struct *task)
366+
{ struct rb_node **link = &tasksadj.rb_node;
367+
struct rb_node *parent = NULL;
368+
struct task_struct *task_entry;
369+
s64 key = task->signal->oom_adj;
370+
/*
371+
* Find the right place in the rbtree:
372+
*/
373+
spin_lock(&lmk_lock);
374+
while (*link) {
375+
parent = *link;
376+
task_entry = rb_entry(parent, struct task_struct, adj_node);
377+
if (key < task_entry->signal->oom_adj)
378+
link = &parent->rb_right;
379+
else
380+
link = &parent->rb_left;
381+
}
382+
383+
rb_link_node(&task->adj_node, parent, link);
384+
rb_insert_color(&task->adj_node, &tasksadj);
385+
spin_unlock(&lmk_lock);
386+
}
387+
388+
void delete_from_adj_tree(struct task_struct *task)
389+
{
390+
spin_lock(&lmk_lock);
391+
rb_erase(&task->adj_node, &tasksadj);
392+
spin_unlock(&lmk_lock);
393+
}
394+
395+
396+
static struct task_struct *pick_next_from_adj_tree(struct task_struct *task)
397+
{
398+
struct rb_node *next;
399+
400+
spin_lock(&lmk_lock);
401+
next = rb_next(&task->adj_node);
402+
spin_unlock(&lmk_lock);
403+
404+
if (!next)
405+
return NULL;
406+
407+
return rb_entry(next, struct task_struct, adj_node);
408+
}
409+
410+
static struct task_struct *pick_first_task(void)
411+
{
412+
struct rb_node *left;
413+
414+
spin_lock(&lmk_lock);
415+
left = rb_first(&tasksadj);
416+
spin_unlock(&lmk_lock);
417+
418+
if (!left)
419+
return NULL;
420+
421+
return rb_entry(left, struct task_struct, adj_node);
422+
}
423+
424+
static struct task_struct *pick_last_task(void)
425+
{
426+
struct rb_node *right;
427+
428+
spin_lock(&lmk_lock);
429+
right = rb_last(&tasksadj);
430+
spin_unlock(&lmk_lock);
431+
432+
if (!right)
433+
return NULL;
434+
435+
return rb_entry(right, struct task_struct, adj_node);
436+
}
437+
337438
module_param_named(cost, lowmem_shrinker.seeks, int, S_IRUGO | S_IWUSR);
338439
module_param_array_named(adj, lowmem_adj, int, &lowmem_adj_size,
339440
S_IRUGO | S_IWUSR);

fs/exec.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -849,6 +849,8 @@ static int de_thread(struct task_struct *tsk)
849849
transfer_pid(leader, tsk, PIDTYPE_SID);
850850

851851
list_replace_rcu(&leader->tasks, &tsk->tasks);
852+
delete_from_adj_tree(leader);
853+
add_2_adj_tree(tsk);
852854
list_replace_init(&leader->sibling, &tsk->sibling);
853855

854856
tsk->group_leader = tsk;

fs/proc/base.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1050,6 +1050,9 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
10501050

10511051
unlock_task_sighand(task, &flags);
10521052
put_task_struct(task);
1053+
1054+
delete_from_adj_tree(task);
1055+
add_2_adj_tree(task);
10531056

10541057
return count;
10551058
}

include/linux/sched.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1224,6 +1224,7 @@ struct task_struct {
12241224
#endif
12251225

12261226
struct list_head tasks;
1227+
struct rb_node adj_node;
12271228
struct plist_node pushable_tasks;
12281229

12291230
struct mm_struct *mm, *active_mm;
@@ -1552,6 +1553,9 @@ static inline struct pid *task_tgid(struct task_struct *task)
15521553
return task->group_leader->pids[PIDTYPE_PID].pid;
15531554
}
15541555

1556+
extern void add_2_adj_tree(struct task_struct *task);
1557+
extern void delete_from_adj_tree(struct task_struct *task);
1558+
15551559
/*
15561560
* Without tasklist or rcu lock it is not safe to dereference
15571561
* the result of task_pgrp/task_session even if task == current,

kernel/exit.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
6767
detach_pid(p, PIDTYPE_SID);
6868

6969
list_del_rcu(&p->tasks);
70+
delete_from_adj_tree(p);
7071
list_del_init(&p->sibling);
7172
__get_cpu_var(process_counts)--;
7273
}

kernel/fork.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1298,6 +1298,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
12981298
attach_pid(p, PIDTYPE_SID, task_session(current));
12991299
list_add_tail(&p->sibling, &p->real_parent->children);
13001300
list_add_tail_rcu(&p->tasks, &init_task.tasks);
1301+
add_2_adj_tree(p);
13011302
__get_cpu_var(process_counts)++;
13021303
}
13031304
attach_pid(p, PIDTYPE_PID, pid);

0 commit comments

Comments
 (0)