kernel-2.6.18-238.el5.src.rpm

From: Rik van Riel <riel@redhat.com>
Date: Thu, 16 Apr 2009 11:04:30 -0400
Subject: [mm] vmscan: bail out of direct reclaim after max pages
Message-id: 20090416110430.00123c17@bree.surriel.com
O-Subject: [RHEL 5.4 PATCH] vmscan: bail out of direct reclaim after swap_cluster_max pages (bz 495442)
Bugzilla: 495442
RH-Acked-by: Larry Woodman <lwoodman@redhat.com>
RH-Acked-by: Josef Bacik <josef@redhat.com>

    vmscan: bail out of direct reclaim after swap_cluster_max pages

    When the VM is under pressure, it can happen that several direct reclaim
    processes are in the pageout code simultaneously.  It also happens that
    the reclaiming processes run into mostly referenced, mapped and dirty
    pages in the first round.

    This results in multiple direct reclaim processes having a lower
    pageout priority, which corresponds to a higher target of pages to
    scan.

    This in turn can result in each direct reclaim process freeing
    many pages.  Together, they can end up freeing way too many pages.

    This kicks useful data out of memory (in some cases more than half
    of all memory is swapped out).  It also impacts performance by
    keeping tasks stuck in the pageout code for too long.
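
To make the priority/scan-target relationship concrete, here is a small
standalone sketch (ordinary userspace C, not kernel code; it only mirrors
the "zone->nr_active >> priority" arithmetic visible in the shrink_zone()
hunk below, with DEF_PRIORITY being 12 in this tree):

    #include <stdio.h>

    #define DEF_PRIORITY 12    /* matches mm/vmscan.c in this tree */

    int main(void)
    {
            unsigned long nr_active = 500000;  /* pages on a zone's active list */
            int priority;

            /* Each failed reclaim pass lowers the priority by one, which
             * roughly doubles the per-pass scan increment,
             * (nr_active >> priority) + 1. */
            for (priority = DEF_PRIORITY; priority >= 0; priority--)
                    printf("priority %2d -> scan increment %lu pages\n",
                           priority, (nr_active >> priority) + 1);

            return 0;
    }

With several direct reclaimers all stuck at a reduced priority, each of them
chases one of those enlarged targets, which is how the combined over-reclaim
described above happens.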

Combination of the following upstream changesets:

a79311c14eae4bb946a97af25f3e1b17d625985d
    Signed-off-by: Rik van Riel <riel@redhat.com>
    Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

01dbe5c9b1004dab045cb7f38428258ca9cddc02
    Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
    Acked-by: Rik van Riel <riel@redhat.com>
    Cc: Mel Gorman <mel@csn.ul.ie>
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Impact on order 0 direct reclaim: direct reclaim tasks will break
out of the direct reclaim loop in shrink_zone() after they have freed
sc->swap_cluster_max pages.  Their page allocation will succeed and
they will be out of page reclaim with minimal latency.

Impact on order >= 1 direct reclaim: the RHEL 5 VM is not aware of
order >= 1 in the direct reclaim path and does not have lumpy reclaim.
If, after returning from direct reclaim, the allocation still fails,
the code in __alloc_pages will continue to reclaim pages as long as
page reclaim is successful.
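
Roughly, that retry behaviour looks like the following (a simplified
paraphrase, not the literal __alloc_pages() code in this tree; the OOM and
__GFP_NORETRY handling is elided):

    rebalance:
            did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);

            if (did_some_progress) {
                    page = get_page_from_freelist(gfp_mask, order,
                                                  zonelist, alloc_flags);
                    if (page)
                            goto got_pg;
            }
            /* ... OOM killer and __GFP_NORETRY/do_retry handling elided ... */
            goto rebalance;    /* still no page; try reclaiming again */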

Impact on kswapd: none.  The code below will not break out of the
reclaim loop if current_is_kswapd().
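
The check responsible for all three cases above, as added to shrink_zone()
in the hunk below:

    if (nr_reclaimed > swap_cluster_max &&
            priority < DEF_PRIORITY && !current_is_kswapd())
            break;

Direct reclaimers stop once they have freed swap_cluster_max pages, but only
after the first pass (priority < DEF_PRIORITY), and kswapd is never cut short.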

Test results: my desktop system at home, which has a 2GB dom0 and
3 Xen guests, would regularly get into swap storms, where multiple
processes ended up swapping out lots of memory each.  It would be
fairly common for the system to be stuck in the VM for over a minute
and end up with over 1GB of free memory in dom0 after such a swap
storm.

With the patch applied, I managed to get kswapd in a swap-crazy
mood a few times, with kswapd swapping out over 100MB in one go.
However, the rest of the processes in the system did not join
kswapd in the fray and I experienced no stalls at all.

Fixes bug 495442

Please review and ACK.

diff --git a/mm/vmscan.c b/mm/vmscan.c
index d374f6c..1f13601 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -47,6 +47,9 @@ struct scan_control {
 	/* Incremented by the number of inactive pages that were scanned */
 	unsigned long nr_scanned;
 
+	/* Number of pages freed so far during a call to shrink_zones() */
+	unsigned long nr_reclaimed;
+
 	/* This context's GFP mask */
 	gfp_t gfp_mask;
 
@@ -877,13 +880,14 @@ force_reclaim_mapped:
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
-static unsigned long shrink_zone(int priority, struct zone *zone,
+static void shrink_zone(int priority, struct zone *zone,
 				struct scan_control *sc)
 {
 	unsigned long nr_active;
 	unsigned long nr_inactive;
 	unsigned long nr_to_scan;
-	unsigned long nr_reclaimed = 0;
+	unsigned long nr_reclaimed = sc->nr_reclaimed;
+	unsigned long swap_cluster_max = sc->swap_cluster_max;
 
 	atomic_inc(&zone->reclaim_in_progress);
 
@@ -893,14 +897,14 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
 	 */
 	zone->nr_scan_active += (zone->nr_active >> priority) + 1;
 	nr_active = zone->nr_scan_active;
-	if (nr_active >= sc->swap_cluster_max)
+	if (nr_active >= swap_cluster_max)
 		zone->nr_scan_active = 0;
 	else
 		nr_active = 0;
 
 	zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1;
 	nr_inactive = zone->nr_scan_inactive;
-	if (nr_inactive >= sc->swap_cluster_max)
+	if (nr_inactive >= swap_cluster_max)
 		zone->nr_scan_inactive = 0;
 	else
 		nr_inactive = 0;
@@ -908,28 +912,40 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
 	while (nr_active || nr_inactive) {
 		if (test_thread_flag(TIF_MEMDIE)) {
 			atomic_dec(&zone->reclaim_in_progress);
-			return 0;
+			return;
 		}
 		if (nr_active) {
 			nr_to_scan = min(nr_active,
-					(unsigned long)sc->swap_cluster_max);
+					(unsigned long)swap_cluster_max);
 			nr_active -= nr_to_scan;
 			shrink_active_list(nr_to_scan, zone, sc, priority);
 		}
 
 		if (nr_inactive) {
 			nr_to_scan = min(nr_inactive,
-					(unsigned long)sc->swap_cluster_max);
+					(unsigned long)swap_cluster_max);
 			nr_inactive -= nr_to_scan;
-			nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
-								sc);
+			nr_reclaimed += shrink_inactive_list(nr_to_scan,
+						zone, sc);
 		}
+		/*
+		 * On large memory systems, scan >> priority can become
+		 * really large. This is fine for the starting priority;
+		 * we want to put equal scanning pressure on each zone.
+		 * However, if the VM has a harder time of freeing pages,
+		 * with multiple processes reclaiming pages, the total
+		 * freeing target can get unreasonably large.
+		 */
+		if (nr_reclaimed > swap_cluster_max &&
+			priority < DEF_PRIORITY && !current_is_kswapd())
+			break;
 	}
 
+	sc->nr_reclaimed = nr_reclaimed;
+
 	throttle_vm_writeout();
 
 	atomic_dec(&zone->reclaim_in_progress);
-	return nr_reclaimed;
 }
 
 /*
@@ -943,15 +959,12 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
  * b) The zones may be over pages_high but they must go *over* pages_high to
  *    satisfy the `incremental min' zone defense algorithm.
  *
- * Returns the number of reclaimed pages.
- *
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
  */
-static unsigned long shrink_zones(int priority, struct zone **zones,
+static void shrink_zones(int priority, struct zone **zones,
 					struct scan_control *sc)
 {
-	unsigned long nr_reclaimed = 0;
 	int i;
 
 	sc->all_unreclaimable = 1;
@@ -959,7 +972,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
 		struct zone *zone = zones[i];
 
 		if (test_thread_flag(TIF_MEMDIE))
-			return 0;
+			return;
 		if (!populated_zone(zone))
 			continue;
 
@@ -973,9 +986,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
 
 		sc->all_unreclaimable = 0;
 
-		nr_reclaimed += shrink_zone(priority, zone, sc);
+		shrink_zone(priority, zone, sc);
 	}
-	return nr_reclaimed;
 }
  
 /*
@@ -996,7 +1008,6 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 	int priority;
 	int ret = 0;
 	unsigned long total_scanned = 0;
-	unsigned long nr_reclaimed = 0;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long lru_pages = 0;
 	int i;
@@ -1025,14 +1036,14 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 		sc.nr_scanned = 0;
 		if (!priority)
 			disable_swap_token();
-		nr_reclaimed += shrink_zones(priority, zones, &sc);
+		shrink_zones(priority, zones, &sc);
 		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
 		if (reclaim_state) {
-			nr_reclaimed += reclaim_state->reclaimed_slab;
+			sc.nr_reclaimed += reclaim_state->reclaimed_slab;
 			reclaim_state->reclaimed_slab = 0;
 		}
 		total_scanned += sc.nr_scanned;
-		if (nr_reclaimed >= sc.swap_cluster_max) {
+		if (sc.nr_reclaimed >= sc.swap_cluster_max) {
 			ret = 1;
 			goto out;
 		}
@@ -1055,7 +1066,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 			blk_congestion_wait(WRITE, HZ/10);
 	}
 	/* top priority shrink_caches still had more to do? don't OOM, then */
-	if (!sc.all_unreclaimable || nr_reclaimed)
+	if (!sc.all_unreclaimable || sc.nr_reclaimed)
 		ret = 1;
 
 out:
@@ -1106,7 +1117,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 	int priority;
 	int i;
 	unsigned long total_scanned;
-	unsigned long nr_reclaimed;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
@@ -1122,7 +1132,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 
 loop_again:
 	total_scanned = 0;
-	nr_reclaimed = 0;
+	sc.nr_reclaimed = 0;
 	sc.may_writepage = !laptop_mode;
 	count_vm_event(PAGEOUTRUN);
 
@@ -1191,11 +1201,11 @@ scan:
 			temp_priority[i] = priority;
 			sc.nr_scanned = 0;
 			note_zone_scanning_priority(zone, priority);
-			nr_reclaimed += shrink_zone(priority, zone, &sc);
+			shrink_zone(priority, zone, &sc);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
 						lru_pages);
-			nr_reclaimed += reclaim_state->reclaimed_slab;
+			sc.nr_reclaimed += reclaim_state->reclaimed_slab;
 			total_scanned += sc.nr_scanned;
 			if (zone->all_unreclaimable)
 				continue;
@@ -1208,7 +1218,7 @@ scan:
 			 * even in laptop mode
 			 */
 			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
-			    total_scanned > nr_reclaimed + nr_reclaimed / 2)
+			    total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
 				sc.may_writepage = 1;
 		}
 		if (all_zones_ok)
@@ -1226,7 +1236,7 @@ scan:
 		 * matches the direct reclaim path behaviour in terms of impact
 		 * on zone->*_priority.
 		 */
-		if (nr_reclaimed >= SWAP_CLUSTER_MAX)
+		if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
 			break;
 	}
 out:
@@ -1245,7 +1255,7 @@ out:
 		goto loop_again;
 	}
 
-	return nr_reclaimed;
+	return sc.nr_reclaimed;
 }
 
 /*
@@ -1592,7 +1602,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	struct task_struct *p = current;
 	struct reclaim_state reclaim_state;
 	int priority;
-	unsigned long nr_reclaimed = 0;
 	struct scan_control sc = {
 		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
 		.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
@@ -1623,9 +1632,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		priority = ZONE_RECLAIM_PRIORITY;
 		do {
 			note_zone_scanning_priority(zone, priority);
-			nr_reclaimed += shrink_zone(priority, zone, &sc);
+			shrink_zone(priority, zone, &sc);
 			priority--;
-		} while (priority >= 0 && nr_reclaimed < nr_pages);
+		} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
 	}
 
 	if (zone_page_state(zone, NR_SLAB) > zone->min_slab_pages) {
@@ -1649,7 +1658,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 
 	p->reclaim_state = NULL;
 	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
-	return nr_reclaimed >= nr_pages;
+	return sc.nr_reclaimed >= nr_pages;
 }
 
 int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)