kernel-2.6.18-194.11.1.el5.src.rpm

From: Larry Woodman <lwoodman@redhat.com>
Date: Thu, 18 Sep 2008 13:40:13 -0400
Subject: Revert: [mm] NUMA: system is slow when over-committing memory
Message-id: 1221759613.3213.6.camel@dhcp-100-19-198.bos.redhat.com
O-Subject: Re: [RHEL5-U3 patch] Prevent 100% cpu time in RHEL5 kernel under NUMA when zone_reclaim_mode=1
Bugzilla: 457264

On Wed, 2008-07-30 at 12:31 -0400, Larry Woodman wrote:
> We received a report from Intel(Arjan) about the RHEL5 kernel running
> 1000 times slower than the upstream kernel for several seconds or even
> minutes when over-committing the memory on one node of a multi-core NUMA
> system.  This only happens when zone_reclaim_mode gets set to 1 by
> build_zonelists() when it determines another node is sufficiently
> "far away" and it is better to reclaim pages in a zone before going off
> node.  I verified this and determined that the upstream kernel does not
> let more than one core/cpu into __zone_reclaim() at a time.  Without this
> change we can have multiple cores performing direct reclaim on the same
> zone at the same time and this causes lots of
> spin_lock_irq(&zone->lru_lock) contention.
>
> The attached patch adds that logic to RHEL5 by overloading the
> zone->all_unreclaimable field, similar to the way zone->flags is handled
> upstream.  While this is a bit hacky, it preserves the kABI.
>
> Fixes BZ 457264.
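
For reference, a minimal, self-contained user-space sketch of the serialization
idea the quoted mail describes: an atomic test-and-set of a per-zone flag bit
admits one reclaimer at a time, and anyone who finds the bit already set simply
returns.  None of the names below (fake_zone, FAKE_ZONE_RECLAIM_LOCKED,
fake_zone_reclaim) exist in the kernel tree; this is not the kernel code, only
an illustration of the pattern.

/*
 * Illustrative user-space sketch only (not kernel code): an atomic
 * fetch-or on a flag bit plays the role of test_and_set_bit(), so
 * exactly one caller enters the reclaim path for a given zone.
 * Builds with any C11 compiler.
 */
#include <stdatomic.h>
#include <stdio.h>

struct fake_zone {			/* stand-in for struct zone */
	atomic_ulong flags;		/* plays the role of a flags word */
};

#define FAKE_ZONE_RECLAIM_LOCKED (1UL << 0)	/* hypothetical bit */

static int fake_zone_reclaim(struct fake_zone *z)
{
	/* like test_and_set_bit(): only the caller that finds the bit
	 * clear proceeds; everyone else returns immediately */
	if (atomic_fetch_or(&z->flags, FAKE_ZONE_RECLAIM_LOCKED) &
	    FAKE_ZONE_RECLAIM_LOCKED)
		return 0;		/* another CPU is already reclaiming this zone */

	puts("reclaiming (one caller at a time gets here)");

	/* like clear_bit(): drop the lock bit when reclaim is done */
	atomic_fetch_and(&z->flags, ~FAKE_ZONE_RECLAIM_LOCKED);
	return 1;
}

int main(void)
{
	struct fake_zone z = { .flags = 0 };

	fake_zone_reclaim(&z);				/* enters and reclaims */

	/* pretend another CPU currently holds the bit */
	atomic_fetch_or(&z.flags, FAKE_ZONE_RECLAIM_LOCKED);
	printf("second caller returned %d (skipped)\n", fake_zone_reclaim(&z));
	return 0;
}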

I am requesting that we remove this patch from RHEL5-U3 ASAP (before
beta).  First of all, this patch was one of several upstream backports
requested by Intel, many of which were NAK'd.  The problem with this
patch is that we have seen OOM kills running the RHTS workloads on the
PPC64 and s390x architectures.  The cause of the OOM kills was
determined to be the re-use of the zone->all_unreclaimable integer for
flags and the use of the bit-op macros on those flags.  The test_bit(),
clear_bit() and test_and_set_bit() macros expect unsigned longs while
all_unreclaimable is an int.  This causes failures on big-endian
architectures.  The second problem is that we could never reproduce the
original slowdown internally, even on large Intel x86_64 systems.  The
patch only closed a small window where multiple CPUs could end up in
__zone_reclaim() at the same time reclaiming from the same zone,
resulting in a performance degradation.  At this point I don't think
this patch is safe enough for RHEL5-U3.
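
The endianness problem can be seen in isolation with the short stand-alone
program below (user-space C, written purely to illustrate this point, not
kernel code): on a 64-bit big-endian machine, "bit 0" of an unsigned long
lives in a different byte than "bit 0" of an int, so a long-sized bit
operation aimed at an int-sized field misses the field entirely and touches
the memory that follows it.

/*
 * Stand-alone demonstration (user space, not kernel code) of why casting
 * a 32-bit int to (unsigned long *) for bit operations breaks on 64-bit
 * big-endian machines such as ppc64 and s390x.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned long l = 1UL;	/* "bit 0" set through a long-sized op */
	unsigned int  i = 1U;	/* "bit 0" set through an int-sized op  */
	unsigned char lb[sizeof(l)], ib[sizeof(i)];
	size_t n;

	memcpy(lb, &l, sizeof(l));
	memcpy(ib, &i, sizeof(i));

	for (n = 0; n < sizeof(l); n++)
		if (lb[n])
			printf("long: bit 0 is in byte %zu of %zu\n", n, sizeof(l));
	for (n = 0; n < sizeof(i); n++)
		if (ib[n])
			printf("int:  bit 0 is in byte %zu of %zu\n", n, sizeof(i));

	/*
	 * 64-bit little-endian prints byte 0 for both, so the
	 * (unsigned long *)&zone->all_unreclaimable cast happens to work.
	 * 64-bit big-endian prints byte 7 for the long and byte 3 for the
	 * int: a long-sized test_bit()/set_bit() on the int-sized
	 * all_unreclaimable field never sees the int's bits and instead
	 * reads and writes whatever sits in the next four bytes of
	 * struct zone.
	 */
	return 0;
}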

Larry Woodman

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d36cdb6..deb05bf 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -263,10 +263,6 @@ struct zone {
 	char			*name;
 } ____cacheline_internodealigned_in_smp;
 
-enum {
-        ZONE_ALL_UNRECLAIMABLE,         /* all pages pinned */
-        ZONE_RECLAIM_LOCKED,            /* prevents concurrent reclaim */
-};
 
 /*
  * The "priority" of VM scanning is how much of the queues we will scan in one
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7385c72..3eed821 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -423,7 +423,7 @@ static void free_pages_bulk(struct zone *zone, int count,
 					struct list_head *list, int order)
 {
 	spin_lock(&zone->lock);
-	clear_bit(ZONE_ALL_UNRECLAIMABLE, (unsigned long *)&zone->all_unreclaimable);
+	zone->all_unreclaimable = 0;
 	zone->pages_scanned = 0;
 	while (count--) {
 		struct page *page;
@@ -1344,8 +1344,7 @@ void show_free_areas(void)
 			K(zone->nr_inactive),
 			K(zone->present_pages),
 			zone->pages_scanned,
-			(test_bit(ZONE_ALL_UNRECLAIMABLE, (unsigned long *)&zone->all_unreclaimable) 
-				  ? "yes" : "no")
+			(zone->all_unreclaimable ? "yes" : "no")
 			);
 		printk("lowmem_reserve[]:");
 		for (i = 0; i < MAX_NR_ZONES; i++)
@@ -1374,7 +1373,7 @@ void show_free_areas(void)
 		printk("= %lukB\n", K(total));
 	}
 
-	printk("%ld pagecache pages\n", global_page_state(NR_FILE_PAGES));
+	printk("%d pagecache pages\n", global_page_state(NR_FILE_PAGES));
 
 	show_swap_cache_info();
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3740ad1..ea3b83d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -966,8 +966,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
 
 		note_zone_scanning_priority(zone, priority);
 
-		if (test_bit(ZONE_ALL_UNRECLAIMABLE, (unsigned long *)&zone->all_unreclaimable) 
-		    && priority != DEF_PRIORITY)
+		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 			continue;	/* Let kswapd poll it */
 
 		sc->all_unreclaimable = 0;
@@ -1148,8 +1147,7 @@ loop_again:
 			if (!populated_zone(zone))
 				continue;
 
-			if (test_bit(ZONE_ALL_UNRECLAIMABLE, (unsigned long *)&zone->all_unreclaimable) 
-			    && priority != DEF_PRIORITY)
+			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 				continue;
 
 			if (!zone_watermark_ok(zone, order, zone->pages_high,
@@ -1182,8 +1180,7 @@ scan:
 			if (!populated_zone(zone))
 				continue;
 
-			if (test_bit(ZONE_ALL_UNRECLAIMABLE, (unsigned long *)&zone->all_unreclaimable) 
-			    && priority != DEF_PRIORITY)
+			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 				continue;
 
 			if (!zone_watermark_ok(zone, order, zone->pages_high,
@@ -1198,11 +1195,11 @@ scan:
 						lru_pages);
 			nr_reclaimed += reclaim_state->reclaimed_slab;
 			total_scanned += sc.nr_scanned;
-			if (test_bit(ZONE_ALL_UNRECLAIMABLE, (unsigned long *)&zone->all_unreclaimable))
+			if (zone->all_unreclaimable)
 				continue;
 			if (nr_slab == 0 && zone->pages_scanned >=
 				    (zone->nr_active + zone->nr_inactive) * 6)
-				set_bit(ZONE_ALL_UNRECLAIMABLE, (unsigned long *)&zone->all_unreclaimable);
+				zone->all_unreclaimable = 1;
 			/*
 			 * If we've done a decent amount of scanning and
 			 * the reclaim ratio is low, start doing writepage
@@ -1359,8 +1356,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int pass,
 		if (!populated_zone(zone))
 			continue;
 
-		if (test_bit(ZONE_ALL_UNRECLAIMABLE, (unsigned long *)&zone->all_unreclaimable) 
-		    && prio != DEF_PRIORITY)
+		if (zone->all_unreclaimable && prio != DEF_PRIORITY)
 			continue;
 
 		/* For pass = 0 we don't shrink the active list */
@@ -1658,7 +1654,6 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 {
 	cpumask_t mask;
 	int node_id;
-	int ret;
 
 	/*
 	 * Zone reclaim reclaims unmapped file backed pages and
@@ -1682,25 +1677,21 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	 * then do not scan.
 	 */
 	if (!(gfp_mask & __GFP_WAIT) ||
-		test_bit(ZONE_ALL_UNRECLAIMABLE, (unsigned long *)&zone->all_unreclaimable) ||
+		zone->all_unreclaimable ||
 		atomic_read(&zone->reclaim_in_progress) > 0 ||
 		(current->flags & PF_MEMALLOC))
 			return 0;
 
 	/*
 	 * Only run zone reclaim on the local zone or on zones that do not
-	 * have associated processors and only allow one reclaim at a time. 
-	 * This will favor the local processor over remote processors and 
-	 * spread off node memory allocations as wide as possible.
+	 * have associated processors. This will favor the local processor
+	 * over remote processors and spread off node memory allocations
+	 * as wide as possible.
 	 */
 	node_id = zone->zone_pgdat->node_id;
 	mask = node_to_cpumask(node_id);
 	if (!cpus_empty(mask) && node_id != numa_node_id())
 		return 0;
-	if (test_and_set_bit(ZONE_RECLAIM_LOCKED, (unsigned long *)&zone->all_unreclaimable))
-		return 0;
-	ret = __zone_reclaim(zone, gfp_mask, order);
-	clear_bit(ZONE_RECLAIM_LOCKED, (unsigned long *)&zone->all_unreclaimable);
-	return ret;
+	return __zone_reclaim(zone, gfp_mask, order);
 }
 #endif