Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > fc11cd6e1c513a17304da94a5390f3cd > files > 1909

kernel-2.6.18-194.11.1.el5.src.rpm

From: Larry Woodman <lwoodman@redhat.com>
Date: Tue, 1 Dec 2009 21:16:53 -0500
Subject: [mm] SRAT and NUMA fixes for span and/or is discontig mem
Message-id: <1259702213.2345.23.camel@dhcp-100-19-198.bos.redhat.com>
Patchwork-id: 21622
O-Subject: Re: [RHEL5.5 PATCH] SRAT not properly built and NUMA disabled when
	node memory range spans other nodes and/or is discontiguous.
Bugzilla: 474097
RH-Acked-by: Christopher Lalancette <clalance@redhat.com>
RH-Acked-by: Rik van Riel <riel@redhat.com>

Some Intel systems have NUMA nodes with memory ranges that are
discontiguous and/or span other nodes.  Even though a pfn is valid and
between a node's start and end pfns, it may not reside on that node.

For example, if the physical address layout on a two-node system with 16
GB memory is something like:

    node 0: 0-4GB, 8-12GB
    node 1: 4-8GB, 12-16GB

Currently the RHEL5 kernel fails to detect this NUMA topology and disables
NUMA altogether.

With that 16 GB memory configuration:

On node 0, there are 2 memory ranges, 0-100000000 and 250000000-350000000.  But
in the RHEL 5.4 dmesg output, the second memory range is shown as starting from
base address 0 instead of 250000000:

-----------------------------------------------------------------------------
SRAT: Node 0 PXM 0 0-100000000
SRAT: Node 0 PXM 0 0-350000000
SRAT: Node 1 PXM 1 250000000-450000000
Your memory is not aligned you need to rebuild your kernel with a
bigger NODEMAPSIZE shift=20
SRAT: No NUMA node hash function found. Contact maintainer
SRAT: SRAT not used.
No NUMA configuration found
Faking a node at 0000000000000000-0000000640000000 Bootmem setup node
0 0000000000000000-0000000640000000
-----------------------------------------------------------------------------

With the upstream backport patch applied, the SRAT table is correct and NUMA
is enabled:
-----------------------------------------------------------------------------
SRAT: Node 0 PXM 0 0-100000000
SRAT: Node 0 PXM 0 250000000-350000000
SRAT: Node 1 PXM 1 100000000-250000000
SRAT: Node 1 PXM 1 350000000-450000000

The attached upstream backport patch fixes this problem and BZ474097.

Signed-off-by: Don Zickus <dzickus@redhat.com>

diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 383172c..dc98732 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -360,6 +360,15 @@ config X86_64_ACPI_NUMA
        help
 	 Enable ACPI SRAT based node topology detection.
 
+# Some NUMA nodes have memory ranges that span
+# other nodes.  Even though a pfn is valid and
+# between a node's start and end pfns, it may not
+# reside on that node.  See memmap_init_zone()
+# for details.
+config NODES_SPAN_OTHER_NODES
+	def_bool y
+	depends on X86_64_ACPI_NUMA
+
 config NUMA_EMU
 	bool "NUMA emulation"
 	depends on NUMA
diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c
index 7c45c2d..d32e4e6 100644
--- a/arch/x86_64/mm/k8topology.c
+++ b/arch/x86_64/mm/k8topology.c
@@ -155,7 +155,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 	if (!found)
 		return -1; 
 
-	memnode_shift = compute_hash_shift(nodes, 8);
+	memnode_shift = compute_hash_shift(nodes, 8, NULL);
 	if (memnode_shift < 0) { 
 		printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); 
 		return -1; 
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index b2fac14..bfbfe25 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -46,7 +46,8 @@ int numa_off __initdata;
  * -1 if node overlap or lost ram (shift too big)
  */
 static int __init
-populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
+populate_memnodemap(const struct bootnode *nodes,
+		     int numnodes, int shift, int *nodeids)
 {
 	int i; 
 	int res = -1;
@@ -65,7 +66,11 @@ populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
 		do {
 			if (memnodemap[addr >> shift] != 0xff)
 				return -1;
-			memnodemap[addr >> shift] = i;
+			if (!nodeids)
+				memnodemap[addr >> shift] = i;
+			else
+				memnodemap[addr >> shift] = nodeids[i];
+
                        addr += (1UL << shift);
 		} while (addr < end);
 		res = 1;
@@ -73,17 +78,18 @@ populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
 	return res;
 }
 
-int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
+int __init compute_hash_shift(struct bootnode *nodes, int numnodes,
+			      int *nodeids)
 {
 	int shift = 20;
 
-	while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
+	while (populate_memnodemap(nodes, numnodes, shift + 1, nodeids) >= 0)
 		shift++;
 
 	printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
 		shift);
 
-	if (populate_memnodemap(nodes, numnodes, shift) != 1) {
+	if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) {
 		printk(KERN_INFO
 	"Your memory is not aligned you need to rebuild your kernel "
 	"with a bigger NODEMAPSIZE shift=%d\n",
@@ -253,7 +259,7 @@ static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
  		       (nodes[i].end - nodes[i].start) >> 20);
 		node_set_online(i);
  	}
- 	memnode_shift = compute_hash_shift(nodes, numa_fake);
+ 	memnode_shift = compute_hash_shift(nodes, numa_fake, NULL);
  	if (memnode_shift < 0) {
  		memnode_shift = 0;
  		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index 371241a..2a5ac61 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -29,6 +29,10 @@ static struct bootnode nodes_add[MAX_NUMNODES];
 static int found_add_area __initdata;
 int hotadd_percent __initdata = 0;
 
+static int num_node_memblks __initdata;
+static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
+static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
+
 /* Too small nodes confuse the VM badly. Usually they result
    from BIOS bugs. */
 #define NODE_MIN_SIZE (4*1024*1024)
@@ -38,17 +42,17 @@ static __init int setup_node(int pxm)
 	return acpi_map_pxm_to_node(pxm);
 }
 
-static __init int conflicting_nodes(unsigned long start, unsigned long end)
+static __init int conflicting_memblks(unsigned long start, unsigned long end)
 {
 	int i;
-	for_each_node_mask(i, nodes_parsed) {
-		struct bootnode *nd = &nodes[i];
+	for (i = 0; i < num_node_memblks; i++) {
+		struct bootnode *nd = &node_memblk_range[i];
 		if (nd->start == nd->end)
 			continue;
 		if (nd->end > start && nd->start < end)
-			return i;
+			return memblk_nodeid[i];
 		if (nd->end == end && nd->start == start)
-			return i;
+			return memblk_nodeid[i];
 	}
 	return -1;
 }
@@ -273,7 +277,7 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
 		bad_srat();
 		return;
 	}
-	i = conflicting_nodes(start, end);
+	i = conflicting_memblks(start, end);
 	if (i == node) {
 		printk(KERN_WARNING
 		"SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
@@ -298,8 +302,8 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
 			nd->end = end;
 	}
 
-	printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
-	       nd->start, nd->end);
+	printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
+	       start, end);
 
  	if (ma->flags.hot_pluggable && (reserve_hotadd(node, start, end) < 0)) {
 		/* Ignore hotadd region. Undo damage */
@@ -308,6 +312,11 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
 		if ((nd->start | nd->end) == 0)
 			node_clear(node, nodes_parsed);
 	}
+
+	node_memblk_range[num_node_memblks].start = start;
+	node_memblk_range[num_node_memblks].end = end;
+	memblk_nodeid[num_node_memblks] = node;
+	num_node_memblks++;
 }
 
 /* Sanity check to catch more bad SRATs (they are amazingly common).
@@ -373,7 +382,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 		return -1;
 	}
 
-	memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
+	memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
+					   memblk_nodeid);
 	if (memnode_shift < 0) {
 		printk(KERN_ERR
 		     "SRAT: No NUMA node hash function found. Contact maintainer\n");
diff --git a/include/asm-x86_64/numa.h b/include/asm-x86_64/numa.h
index 933ff11..91caf37 100644
--- a/include/asm-x86_64/numa.h
+++ b/include/asm-x86_64/numa.h
@@ -7,7 +7,8 @@ struct bootnode {
 	u64 start,end; 
 };
 
-extern int compute_hash_shift(struct bootnode *nodes, int numnodes);
+extern int compute_hash_shift(struct bootnode *nodes, int numblks,
+			      int *nodeids);
 
 #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))