Date: Thu, 05 Oct 2006 13:37:16 -0400 From: Larry Woodman <lwoodman@redhat.com> Subject: Re: RHEL5 and OOM kills Dave, this patch includes the upstream OOM kill changes so that RHEL5 is in sync with the latest 2.6.19 kernel, as well as the out_of_memory() change so that it must be called more than 10 times within a 5 second window before it actually kills a process. I think this gives us the best of everything, we have all the upstream code plus one small change that gets us to pass the RHTS test suite. --- linux-2.6.18.noarch/mm/oom_kill.c.larry +++ linux-2.6.18.noarch/mm/oom_kill.c @@ -58,6 +58,12 @@ unsigned long badness(struct task_struct } /* + * swapoff can easily use up all memory, so kill those first. + */ + if (p->flags & PF_SWAPOFF) + return ULONG_MAX; + + /* * The memory size of the process is the basis for the badness. */ points = mm->total_vm; @@ -127,6 +133,14 @@ unsigned long badness(struct task_struct points /= 4; /* + * If p's nodes don't overlap ours, it may still help to kill p + * because p may have allocated or otherwise mapped memory on + * this node before. However it will be less likely. + */ + if (!cpuset_excl_nodes_overlap(p)) + points /= 8; + + /* * Adjust the score by oomkilladj. */ if (p->oomkilladj) { @@ -191,25 +205,38 @@ static struct task_struct *select_bad_pr unsigned long points; int releasing; + /* skip kernel threads */ + if (!p->mm) + continue; + /* skip the init task with pid == 1 */ if (p->pid == 1) continue; - if (p->oomkilladj == OOM_DISABLE) - continue; - /* If p's nodes don't overlap ours, it won't help to kill p. */ - if (!cpuset_excl_nodes_overlap(p)) - continue; - /* * This is in the process of releasing memory so wait for it * to finish before killing some other task by mistake. + * + * However, if p is the current task, we allow the 'kill' to + * go ahead if it is exiting: this will simply set TIF_MEMDIE, + * which will allow it to gain access to memory reserves in + * the process of exiting and releasing its resources. + * Otherwise we could get an OOM deadlock. */ releasing = test_tsk_thread_flag(p, TIF_MEMDIE) || p->flags & PF_EXITING; - if (releasing && !(p->flags & PF_DEAD)) + if (releasing) { + /* PF_DEAD tasks have already released their mm */ + if (p->flags & PF_DEAD) + continue; + if (p->flags & PF_EXITING && p == current) { + chosen = p; + *ppoints = ULONG_MAX; + break; + } return ERR_PTR(-1UL); - if (p->flags & PF_SWAPOFF) - return p; + } + if (p->oomkilladj == OOM_DISABLE) + continue; points = badness(p, uptime.tv_sec); if (points > *ppoints || !chosen) { @@ -241,7 +268,8 @@ static void __oom_kill_task(struct task_ return; } task_unlock(p); - printk(KERN_ERR "%s: Killed process %d (%s).\n", + if (message) + printk(KERN_ERR "%s: Killed process %d (%s).\n", message, p->pid, p->comm); /* @@ -293,8 +321,15 @@ static int oom_kill_process(struct task_ struct task_struct *c; struct list_head *tsk; - printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li and " - "children.\n", p->pid, p->comm, points); + /* + * If the task is already exiting, don't alarm the sysadmin or kill + * its children or threads, just set TIF_MEMDIE so it can die quickly + */ + if (p->flags & PF_EXITING) { + __oom_kill_task(p, NULL); + return 0; + } + /* Try to kill a child first */ list_for_each(tsk, &p->children) { c = list_entry(tsk, struct task_struct, sibling); @@ -306,6 +341,69 @@ static int oom_kill_process(struct task_ return oom_kill_task(p, message); } +int should_oom_kill(void) +{ + static spinlock_t oom_lock = SPIN_LOCK_UNLOCKED; + static unsigned long first, last, count, lastkill; + unsigned long now, since; + int ret = 0; + + spin_lock(&oom_lock); + now = jiffies; + since = now - last; + last = now; + + /* + * If it's been a long time since last failure, + * we're not oom. + */ + if (since > 5*HZ) + goto reset; + + /* + * If we haven't tried for at least one second, + * we're not really oom. + */ + since = now - first; + if (since < HZ) + goto out_unlock; + + /* + * If we have gotten only a few failures, + * we're not really oom. + */ + if (++count < 10) + goto out_unlock; + + /* + * If we just killed a process, wait a while + * to give that task a chance to exit. This + * avoids killing multiple processes needlessly. + */ + since = now - lastkill; + if (since < HZ*5) + goto out_unlock; + + /* + * Ok, really out of memory. Kill something. + */ + lastkill = now; + ret = 1; + +reset: +/* + * We dropped the lock above, so check to be sure the variable + * first only ever increases to prevent false OOM's. + */ + if (time_after(now, first)) + first = now; + count = 0; + +out_unlock: + spin_unlock(&oom_lock); + return ret; +} + /** * out_of_memory - kill the "best" process when we run out of memory * @@ -320,12 +418,16 @@ void out_of_memory(struct zonelist *zone unsigned long points = 0; if (printk_ratelimit()) { - printk("oom-killer: gfp_mask=0x%x, order=%d\n", - gfp_mask, order); + printk(KERN_WARNING "%s invoked oom-killer: " + "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", + current->comm, gfp_mask, order, current->oomkilladj); dump_stack(); show_mem(); } + if (!should_oom_kill()) + return; + cpuset_lock(); read_lock(&tasklist_lock); --- linux-2.6.18.noarch/mm/vmscan.c.larry +++ linux-2.6.18.noarch/mm/vmscan.c @@ -62,6 +62,8 @@ struct scan_control { int swap_cluster_max; int swappiness; + + int all_unreclaimable; }; /* @@ -695,6 +697,11 @@ done: return nr_reclaimed; } +static inline int zone_is_near_oom(struct zone *zone) +{ + return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3; +} + /* * This moves pages from the active list to the inactive list. * @@ -730,6 +737,9 @@ static void shrink_active_list(unsigned long distress; long swap_tendency; + if (zone_is_near_oom(zone)) + goto force_reclaim_mapped; + /* * `distress' is a measure of how much trouble we're having * reclaiming pages. 0 -> no problems. 100 -> great trouble. @@ -765,6 +775,7 @@ static void shrink_active_list(unsigned * memory onto the inactive list. */ if (swap_tendency >= 100) +force_reclaim_mapped: reclaim_mapped = 1; } @@ -925,6 +936,7 @@ static unsigned long shrink_zones(int pr unsigned long nr_reclaimed = 0; int i; + sc->all_unreclaimable = 1; for (i = 0; zones[i] != NULL; i++) { struct zone *zone = zones[i]; @@ -941,6 +953,8 @@ static unsigned long shrink_zones(int pr if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ + sc->all_unreclaimable = 0; + nr_reclaimed += shrink_zone(priority, zone, sc); } return nr_reclaimed; @@ -1021,6 +1035,10 @@ unsigned long try_to_free_pages(struct z if (sc.nr_scanned && priority < DEF_PRIORITY - 2) blk_congestion_wait(WRITE, HZ/10); } + /* top priority shrink_caches still had more to do? don't OOM, then */ + if (!sc.all_unreclaimable || nr_reclaimed) + ret = 1; + out: for (i = 0; zones[i] != 0; i++) { struct zone *zone = zones[i]; @@ -1153,7 +1171,7 @@ scan: if (zone->all_unreclaimable) continue; if (nr_slab == 0 && zone->pages_scanned >= - (zone->nr_active + zone->nr_inactive) * 4) + (zone->nr_active + zone->nr_inactive) * 6) zone->all_unreclaimable = 1; /* * If we've done a decent amount of scanning and