From: Hans-Joachim Picht <hpicht@redhat.com>
Date: Tue, 20 May 2008 10:51:17 +0200
Subject: [s390x] CPU Node Affinity
Message-id: 20080520085117.GC16866@redhat.com
O-Subject: [RHEL5 U3 PATCH] FEAT: s390x Linux CPU Node Affinity
Bugzilla: 447379
RH-Acked-by: Pete Zaitcev <zaitcev@redhat.com>

This is a late feature request which IBM overlooked submitting earlier.

Description
============

Optimize Linux scheduling according to the CPU topology. Newer hardware
(System z10 Enterprise Class) provides an interface which can be used to
obtain information about the CPU topology of an LPAR. With this
information the Linux scheduler, which decides which process gets
scheduled to which CPU, can take the CPU topology into account. This
should increase cache hits and therefore overall performance.
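
For illustration, this is roughly where the generic scheduler picks up the
core map once CONFIG_SCHED_MC is enabled. This is a simplified sketch of
the build_sched_domains() logic in kernel/sched.c of this kernel
generation, not part of this patch; sd, p and i are local variables of
that function:

	#ifdef CONFIG_SCHED_MC
		/*
		 * The span of the multi-core scheduling domain of CPU i is
		 * taken from cpu_coregroup_map(), so CPUs which this patch
		 * reports as one core group are load-balanced as a unit
		 * before work is moved between core groups.
		 */
		sd = &per_cpu(core_domains, i);
		*sd = SD_MC_INIT;
		sd->span = cpu_coregroup_map(i);
		sd->parent = p;
	#endif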

Bugzilla
=========

BZ 447379
https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=447379

Upstream status of the patch:
=============================

The patch is included upstream in git commit
dbd70fb499952d0ba282f0159dafacfc31d50313

Test status:
============

The patch has been tested.

Please ACK.

With best regards,

Hans

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 86bd7f6..8956569 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -3,6 +3,10 @@
 # see Documentation/kbuild/kconfig-language.txt.
 #
 
+config SCHED_MC
+	def_bool y
+	depends on SMP
+
 config MMU
 	bool
 	default y
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index e1cbac4..303d525 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -14,7 +14,7 @@ obj-y	+= $(if $(CONFIG_64BIT),reipl64.o,reipl.o)
 extra-y				+= head.o init_task.o vmlinux.lds
 
 obj-$(CONFIG_MODULES)		+= s390_ksyms.o module.o
-obj-$(CONFIG_SMP)		+= smp.o
+obj-$(CONFIG_SMP)		+= smp.o topology.o
 
 obj-$(CONFIG_AUDIT)		+= audit.o
 compat-obj-$(CONFIG_AUDIT)	+= compat_audit.o
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index b11747d..fe82c0f 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -37,6 +37,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/device.h>
 #include <linux/notifier.h>
+#include <linux/topology.h>
 
 #include <asm/ipl.h>
 #include <asm/uaccess.h>
@@ -766,6 +767,7 @@ setup_arch(char **cmdline_p)
         cpu_init();
         __cpu_logical_map[0] = S390_lowcore.cpu_data.cpu_addr;
 	smp_setup_cpu_possible_map();
+	s390_init_cpu_topology();
 
 	/*
 	 * Setup capabilities (ELF_HWCAP & ELF_PLATFORM).
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
new file mode 100644
index 0000000..9d18041
--- /dev/null
+++ b/arch/s390/kernel/topology.c
@@ -0,0 +1,307 @@
+/*
+ *  arch/s390/kernel/topology.c
+ *
+ *    Copyright IBM Corp. 2007
+ *    Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/bootmem.h>
+#include <linux/sched.h>
+#include <linux/kthread.h>
+#include <linux/workqueue.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <asm/delay.h>
+#include <asm/s390_ext.h>
+
+static DEFINE_MUTEX(smp_cpu_state_mutex);
+
+#define CPU_BITS 64
+
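+/*
+ * Layout of the topology information block returned by stsi() with
+ * function code 15, selector 1.2: a struct tl_info header followed by
+ * a list of container and CPU entries (union tl_entry).
+ */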
+struct tl_cpu {
+	unsigned char reserved[6];
+	unsigned short origin;
+	unsigned long mask[CPU_BITS / BITS_PER_LONG];
+};
+
+struct tl_container {
+	unsigned char reserved[8];
+};
+
+union tl_entry {
+	unsigned char nl;
+	struct tl_cpu cpu;
+	struct tl_container container;
+};
+
+#define NR_MAG 6
+
+struct tl_info {
+	unsigned char reserved0[2];
+	unsigned short length;
+	unsigned char mag[NR_MAG];
+	unsigned char reserved1;
+	unsigned char mnest;
+	unsigned char reserved2[4];
+	union tl_entry tle[0];
+};
+
+struct core_info {
+	struct core_info *next;
+	cpumask_t mask;
+};
+
+static void topology_work_fn(void *data);
+static struct tl_info *tl_info;
+static struct core_info core_info;
+static int machine_has_topology;
+static int machine_has_topology_irq;
+static struct timer_list topology_timer;
+static void set_topology_timer(void);
+static struct work_struct topology_work;
+
+cpumask_t cpu_coregroup_map(unsigned int cpu)
+{
+	struct core_info *core = &core_info;
+	cpumask_t mask;
+
+	cpus_clear(mask);
+	if (!machine_has_topology)
+		return cpu_present_map;
+	mutex_lock(&smp_cpu_state_mutex);
+	while (core) {
+		if (cpu_isset(cpu, core->mask)) {
+			mask = core->mask;
+			break;
+		}
+		core = core->next;
+	}
+	mutex_unlock(&smp_cpu_state_mutex);
+	if (cpus_empty(mask))
+		mask = cpumask_of_cpu(cpu);
+	return mask;
+}
+
+static void add_cpus_to_core(struct tl_cpu *tl_cpu, struct core_info *core)
+{
+	unsigned int cpu;
+
+	for (cpu = find_first_bit(&tl_cpu->mask[0], CPU_BITS);
+	     cpu < CPU_BITS;
+	     cpu = find_next_bit(&tl_cpu->mask[0], CPU_BITS, cpu + 1))
+	{
+		unsigned int rcpu, lcpu;
+
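+		/* The topology list numbers CPUs from the leftmost bit. */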
+		rcpu = CPU_BITS - 1 - cpu + tl_cpu->origin;
+		for_each_present_cpu(lcpu) {
+			if (__cpu_logical_map[lcpu] == rcpu)
+				cpu_set(lcpu, core->mask);
+		}
+	}
+}
+
+static void clear_cores(void)
+{
+	struct core_info *core = &core_info;
+
+	while (core) {
+		cpus_clear(core->mask);
+		core = core->next;
+	}
+}
+
+static union tl_entry *next_tle(union tl_entry *tle)
+{
+	if (tle->nl)
+		return (union tl_entry *)((struct tl_container *)tle + 1);
+	else
+		return (union tl_entry *)((struct tl_cpu *)tle + 1);
+}
+
+static void tl_to_cores(struct tl_info *info)
+{
+	union tl_entry *tle, *end;
+	struct core_info *core = &core_info;
+
+	mutex_lock(&smp_cpu_state_mutex);
+	clear_cores();
+	tle = (union tl_entry *)&info->tle;
+	end = (union tl_entry *)((unsigned long)info + info->length);
+	while (tle < end) {
+		switch (tle->nl) {
+		case 5:
+		case 4:
+		case 3:
+		case 2:
+			break;
+		case 1:
+			core = core->next;
+			break;
+		case 0:
+			add_cpus_to_core(&tle->cpu, core);
+			break;
+		default:
+			clear_cores();
+			machine_has_topology = 0;
+			goto out;
+		}
+		tle = next_tle(tle);
+	}
+out:
+	mutex_unlock(&smp_cpu_state_mutex);
+}
+
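+/*
+ * Perform topology function with function code 2: check whether a
+ * topology-change report is pending.  Returns the condition code.
+ */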
+static int ptf(void)
+{
+	int rc;
+
+	asm volatile(
+		"	.insn	rre,0xb9a20000,%1,%1\n"
+		"	ipm	%0\n"
+		"	srl	%0,28\n"
+		: "=d" (rc)
+		: "d" (2UL)  : "cc");
+	return rc;
+}
+
+void arch_update_cpu_topology(void)
+{
+	struct tl_info *info = tl_info;
+	struct sys_device *sysdev;
+	int cpu;
+
+	if (!machine_has_topology)
+		return;
+	ptf();
+	stsi(info, 15, 1, 2);
+	tl_to_cores(info);
+	for_each_online_cpu(cpu) {
+		sysdev = get_cpu_sysdev(cpu);
+		kobject_uevent(&sysdev->kobj, KOBJ_CHANGE);
+	}
+}
+
+static int topology_kthread(void *data)
+{
+	arch_reinit_sched_domains();
+	return 0;
+}
+
+static void topology_work_fn(void *data)
+{
+	/* We can't call arch_reinit_sched_domains() from a multi-threaded
+	 * workqueue context since it may deadlock in case of cpu hotplug.
+	 * So we have to create a kernel thread in order to call
+	 * arch_reinit_sched_domains().
+	 */
+	kthread_run(topology_kthread, NULL, "topology_update");
+}
+
+static void topology_timer_fn(unsigned long ignored)
+{
+	if (ptf())
+		schedule_work(&topology_work);
+	set_topology_timer();
+}
+
+static void set_topology_timer(void)
+{
+	topology_timer.function = topology_timer_fn;
+	topology_timer.data = 0;
+	topology_timer.expires = jiffies + 60 * HZ;
+	add_timer(&topology_timer);
+}
+
+static void topology_interrupt(struct pt_regs *regs, __u16 code)
+{
+	schedule_work(&topology_work);
+}
+
+static int __init init_topology_update(void)
+{
+	int rc;
+
+	if (!machine_has_topology)
+		return 0;
+	init_timer(&topology_timer);
+	if (machine_has_topology_irq) {
+		rc = register_external_interrupt(0x2005, topology_interrupt);
+		if (rc)
+			return rc;
+		ctl_set_bit(0, 8);
+	}
+	else
+		set_topology_timer();
+	return 0;
+}
+__initcall(init_topology_update);
+
+void __init s390_init_cpu_topology(void)
+{
+	unsigned long long facility_bits;
+	struct tl_info *info;
+	struct core_info *core;
+	int nr_cores;
+	int i;
+
+	INIT_WORK(&topology_work, topology_work_fn, NULL);
+	if (stfle(&facility_bits, 1) <= 0)
+		return;
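+	/*
+	 * stfle() numbers facility bits from the most significant bit
+	 * of the doubleword, so 1ULL << 52 tests facility bit 11, the
+	 * configuration-topology facility.
+	 */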
+	if (!(facility_bits & (1ULL << 52)))
+		return;
+	machine_has_topology = 1;
+
+	if (facility_bits & (1ULL << 51))
+		machine_has_topology_irq = 1;
+
+	tl_info = alloc_bootmem_pages(PAGE_SIZE);
+	if (!tl_info)
+		goto error;
+	info = tl_info;
+	stsi(info, 15, 1, 2);
+
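+	/*
+	 * mag[] holds the maximum number of topology-list entries per
+	 * nesting level; multiplying the levels above the CPU level
+	 * gives the maximum number of cores.
+	 */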
+	nr_cores = info->mag[NR_MAG - 2];
+	for (i = 0; i < info->mnest - 2; i++)
+		nr_cores *= info->mag[NR_MAG - 3 - i];
+
+	printk(KERN_INFO "CPU topology:");
+	for (i = 0; i < NR_MAG; i++)
+		printk(" %d", info->mag[i]);
+	printk(" / %d\n", info->mnest);
+
+	core = &core_info;
+	for (i = 0; i < nr_cores; i++) {
+		core->next = alloc_bootmem(sizeof(struct core_info));
+		core = core->next;
+		if (!core)
+			goto error;
+	}
+	return;
+error:
+	machine_has_topology = 0;
+	machine_has_topology_irq = 0;
+}
diff --git a/include/asm-s390/system.h b/include/asm-s390/system.h
index 1604004..589b7d4 100644
--- a/include/asm-s390/system.h
+++ b/include/asm-s390/system.h
@@ -434,6 +434,9 @@ __set_psw_mask(unsigned long mask)
 #define local_mcck_enable()  __set_psw_mask(PSW_KERNEL_BITS)
 #define local_mcck_disable() __set_psw_mask(PSW_KERNEL_BITS & ~PSW_MASK_MCHECK)
 
+int stsi(void *sysinfo, int fc, int sel1, int sel2);
+int stfle(unsigned long long *list, int doublewords);
+
 #ifdef CONFIG_SMP
 
 extern void smp_ctl_set_bit(int cr, int bit);
diff --git a/include/asm-s390/topology.h b/include/asm-s390/topology.h
index 613aa64..f4f57fa 100644
--- a/include/asm-s390/topology.h
+++ b/include/asm-s390/topology.h
@@ -1,6 +1,20 @@
 #ifndef _ASM_S390_TOPOLOGY_H
 #define _ASM_S390_TOPOLOGY_H
 
+#include <linux/cpumask.h>
+
+#define mc_capable()    (1)
+
+cpumask_t cpu_coregroup_map(unsigned int cpu);
+
+#ifdef CONFIG_SMP
+void s390_init_cpu_topology(void);
+#else
+static inline void s390_init_cpu_topology(void)
+{
+}
+#endif
+
 #include <asm-generic/topology.h>
 
 #endif /* _ASM_S390_TOPOLOGY_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c7502a6..a4c117d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -744,6 +744,8 @@ extern int partition_sched_domains(cpumask_t *partition1,
  */
 extern unsigned int max_cache_size;
 
+extern int arch_reinit_sched_domains(void);
+
 #endif	/* CONFIG_SMP */
 
 
diff --git a/include/linux/topology.h b/include/linux/topology.h
index ec1eca8..b47ae36 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -50,6 +50,8 @@
 	for_each_online_node(node)						\
 		if (nr_cpus_node(node))
 
+void arch_update_cpu_topology(void);
+
 #ifndef node_distance
 /* Conform to ACPI 2.0 SLIT distance definitions */
 #define LOCAL_DISTANCE		10
diff --git a/kernel/sched.c b/kernel/sched.c
index 91ae461..87f38c5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6625,6 +6625,10 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
 	arch_destroy_sched_domains(cpu_map);
 }
 
+void __attribute__((weak)) arch_update_cpu_topology(void)
+{
+}
+
 /*
  * Partition sched domains as specified by the cpumasks below.
  * This attaches all cpus from the cpumasks to the NULL domain,
@@ -6842,6 +6846,7 @@ void __init sched_init(void)
 	plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
 #endif
 
+	arch_update_cpu_topology();
 	/*
 	 * The boot idle thread does lazy MMU switching as well:
 	 */