kernel-2.6.18-128.1.10.el5.src.rpm

From: Kei Tokunaga <ktokunag@redhat.com>
Date: Fri, 16 May 2008 11:06:43 -0400
Subject: [sched] domain range tunable params for wake_idle
Message-id: 482DA303.80603@redhat.com
O-Subject: [RHEL5.3][PATCH] add tunable params to configure sched domain range for wake_idle()
Bugzilla: 426971
RH-Acked-by: Peter Zijlstra <pzijlstr@redhat.com>
RH-Acked-by: Larry Woodman <lwoodman@redhat.com>

bz426971
https://bugzilla.redhat.com/show_bug.cgi?id=426971

Description
===========
  WHAT THIS PATCH DOES: it adds tunable params (a cpuset interface and
  a boot parameter) so you can dynamically or statically configure the
  sched domain range in which wake_idle() searches for an idle CPU for
  a task being woken up.  The default behavior stays the same as in
  previous RHEL5 releases.

  THE BENEFIT OF THIS PATCH: you can expect better response times on
  your systems by configuring a wider search range.

  MORE BACKGROUND: The load balancer runs on idle CPUs and tries to
  balance the load system wide, but it is only invoked once per tick
  (1 msec on RHEL5.x).  The balancer is also invoked when a task is
  woken up, but in that case it only tries to balance the load within
  a limited sched domain range.  In some situations this increases
  system response time, because a woken-up task has to wait longer
  before it gets to run.  Making the search range wider gives the task
  a better chance of finding a better (or the best) CPU to run on
  sooner.

  The default range for searching is "cores in a package."  With the
  cpuset interface 'sched_relax_domain_level' or the boot parameter
  'relax_domain_level', you can change the range to one of the
  following levels (see the example sketch after this list):
    - no request.  use system default or follow request of others (-1, default)
    - no search. (0)
    - siblings. (1)
    - cores in a package. (2)
    - cpus in a node [= system wide on non-NUMA systems] (3)
   (- nodes in a chunk of node [on NUMA systems] (4))
   (- system wide [on NUMA systems] (5~))
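
  For illustration only (this is not part of the patch): a minimal
  user-space sketch that sets the level through the cpuset interface.
  It assumes the cpuset filesystem is mounted at /dev/cpuset; the path
  and the chosen level (3) are just examples.  The same value can be
  set statically at boot time by appending relax_domain_level=<N> to
  the kernel command line.

    /* set_relax_level.c -- hypothetical helper, not shipped with the patch */
    #include <stdio.h>
    #include <stdlib.h>

    int main(int argc, char **argv)
    {
        /* adjust the path to wherever the cpuset filesystem is mounted */
        const char *path = "/dev/cpuset/sched_relax_domain_level";
        int level = (argc > 1) ? atoi(argv[1]) : 3;  /* 3 = cpus in a node */
        FILE *f = fopen(path, "w");

        if (!f) {
            perror(path);
            return 1;
        }
        /* the kernel rejects values outside -1..5 with EINVAL; the
         * buffered write is flushed (and checked) at fclose() */
        fprintf(f, "%d\n", level);
        if (fclose(f) != 0) {
            perror(path);
            return 1;
        }
        return 0;
    }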

kABI status
===========
  No kABI breakage found.

Brew status
===========
  Built on all platforms.
  http://brewweb.devel.redhat.com/brew/taskinfo?taskID=1317110

Upstream status
===============
  Upstream (2.6.26-rc1).

Test status
===========
  Confirmed there is no functional or performance regression.
  We ran an in-house benchmark program, which spawns many short-lived
  threads, on a kernel booted with relax_domain_level=4.  The average
  processing time of the tasks was shortened in that case, because
  widening the search range improves both response time and CPU
  utilization.

Additional notes
================
  The patch applies to 2.6.18-92.el5.

  Special thanks to Peter Zijlstra, who gave us a lot of valuable
  advice, ideas, and help in implementing the patch and in working it
  upstream.

Thanks,
Kei

diff --git a/include/asm-ia64/topology.h b/include/asm-ia64/topology.h
index 937c212..7d12dc6 100644
--- a/include/asm-ia64/topology.h
+++ b/include/asm-ia64/topology.h
@@ -92,7 +92,7 @@ void build_cpu_to_node_map(void);
 	.cache_nice_tries	= 2,			\
 	.busy_idx		= 3,			\
 	.idle_idx		= 2,			\
-	.newidle_idx		= 0, /* unused */	\
+	.newidle_idx		= 2,			\
 	.wake_idx		= 1,			\
 	.forkexec_idx		= 1,			\
 	.per_cpu_gain		= 100,			\
diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index 6e7a2e9..a9ddeeb 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -39,7 +39,7 @@ extern int __node_distance(int, int);
 	.cache_nice_tries	= 2,			\
 	.busy_idx		= 3,			\
 	.idle_idx		= 2,			\
-	.newidle_idx		= 0, 			\
+	.newidle_idx		= 2, 			\
 	.wake_idx		= 1,			\
 	.forkexec_idx		= 1,			\
 	.per_cpu_gain		= 100,			\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a4c117d..1d451e0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -662,6 +662,7 @@ enum idle_type
 #define SD_WAKE_BALANCE		64	/* Perform balancing at task wakeup */
 #define SD_SHARE_CPUPOWER	128	/* Domain members share cpu power */
 #define SD_POWERSAVINGS_BALANCE	256	/* Balance for power savings */
+#define SD_WAKE_IDLE_FAR	2048	/* Gain latency sacrificing cache hit */
 
 #define BALANCE_FOR_POWER	((sched_mc_power_savings || sched_smt_power_savings) \
 				 ? SD_POWERSAVINGS_BALANCE : 0)
@@ -678,6 +679,16 @@ struct sched_group {
 	unsigned long cpu_power;
 };
 
+enum sched_domain_level {
+	SD_LV_NONE = 0,
+	SD_LV_SIBLING,
+	SD_LV_MC,
+	SD_LV_CPU,
+	SD_LV_NODE,
+	SD_LV_ALLNODES,
+	SD_LV_MAX
+};
+
 struct sched_domain {
 	/* These fields must be setup */
 	struct sched_domain *parent;	/* top domain must be null terminated */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index c14ebfe..038dc07 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -99,6 +99,9 @@ struct cpuset {
 	int mems_generation;
 
 	struct fmeter fmeter;		/* memory_pressure filter */
+
+	/* for custom sched domain */
+	int relax_domain_level;
 };
 
 /* bits in struct cpuset flags field */
@@ -751,6 +754,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
 	return 0;
 }
 
+extern int __partition_sched_domains(cpumask_t *dom1, cpumask_t *dom2,
+					int *attr1, int *attr2);
+
 /*
  * For a given cpuset cur, partition the system as follows
  * a. All cpus in the parent cpuset's cpus_allowed that are not part of any
@@ -802,7 +808,8 @@ static void update_cpu_domains(struct cpuset *cur)
 	}
 
 	lock_cpu_hotplug();
-	partition_sched_domains(&pspan, &cspan);
+	__partition_sched_domains(&pspan, &cspan, &par->relax_domain_level,
+						  &cur->relax_domain_level);
 	unlock_cpu_hotplug();
 }
 
@@ -1025,6 +1032,40 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
 	return 0;
 }
 
+static int update_relax_domain_level(struct cpuset *cs, char *buf)
+{
+	struct cpuset *c;
+	cpumask_t span, dummy;
+	int val = simple_strtol(buf, NULL, 10);
+
+	if (val < -1 || val >= SD_LV_MAX)
+		return -EINVAL;
+
+	if (val != cs->relax_domain_level) {
+		cs->relax_domain_level = val;
+
+		/* top_cpuset always has associated sched_domain */
+		if (cs == &top_cpuset) {
+			span = cs->cpus_allowed;
+			list_for_each_entry(c, &cs->children, sibling) {
+				if (is_cpu_exclusive(c))
+					cpus_andnot(span, span,
+							c->cpus_allowed);
+			}
+			dummy = CPU_MASK_NONE;
+			lock_cpu_hotplug();
+			__partition_sched_domains(&span, &dummy,
+						&cs->relax_domain_level, NULL);
+			unlock_cpu_hotplug();
+		} else {
+			/* cpuset with cpu_exclusive has sched_domain */
+			if (is_cpu_exclusive(cs))
+				update_cpu_domains(cs);
+		}
+	}
+	return 0;
+}
+
 /*
  * update_flag - read a 0 or a 1 in a file and update associated flag
  * bit:	the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
@@ -1264,6 +1305,7 @@ typedef enum {
 	FILE_MEMLIST,
 	FILE_CPU_EXCLUSIVE,
 	FILE_MEM_EXCLUSIVE,
+	FILE_SCHED_RELAX_DOMAIN_LEVEL,
 	FILE_NOTIFY_ON_RELEASE,
 	FILE_MEMORY_PRESSURE_ENABLED,
 	FILE_MEMORY_PRESSURE,
@@ -1316,6 +1358,9 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	case FILE_MEM_EXCLUSIVE:
 		retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer);
 		break;
+	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
+		retval = update_relax_domain_level(cs, buffer);
+		break;
 	case FILE_NOTIFY_ON_RELEASE:
 		retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer);
 		break;
@@ -1433,6 +1478,9 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 	case FILE_MEM_EXCLUSIVE:
 		*s++ = is_mem_exclusive(cs) ? '1' : '0';
 		break;
+	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
+		s += sprintf(s, "%d", cs->relax_domain_level);
+		break;
 	case FILE_NOTIFY_ON_RELEASE:
 		*s++ = notify_on_release(cs) ? '1' : '0';
 		break;
@@ -1789,6 +1837,11 @@ static struct cftype cft_mem_exclusive = {
 	.private = FILE_MEM_EXCLUSIVE,
 };
 
+static struct cftype cft_sched_relax_domain_level = {
+	.name = "sched_relax_domain_level",
+	.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
+};
+
 static struct cftype cft_notify_on_release = {
 	.name = "notify_on_release",
 	.private = FILE_NOTIFY_ON_RELEASE,
@@ -1831,6 +1884,9 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
 		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_mem_exclusive)) < 0)
 		return err;
+	if ((err = cpuset_add_file(cs_dentry,
+					&cft_sched_relax_domain_level)) < 0)
+		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0)
 		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0)
@@ -1880,6 +1936,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 	INIT_LIST_HEAD(&cs->children);
 	cs->mems_generation = cpuset_mems_generation++;
 	fmeter_init(&cs->fmeter);
+	cs->relax_domain_level = -1;
 
 	cs->parent = parent;
 
@@ -2004,6 +2061,7 @@ int __init cpuset_init(void)
 
 	fmeter_init(&top_cpuset.fmeter);
 	top_cpuset.mems_generation = cpuset_mems_generation++;
+	top_cpuset.relax_domain_level = -1;
 
 	init_task.cpuset = &top_cpuset;
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 87f38c5..5ba632f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1315,6 +1315,9 @@ nextlevel:
 	return cpu;
 }
 
+static inline int task_hot(struct task_struct *p, unsigned long long now,
+			   struct sched_domain *sd);
+
 #endif /* CONFIG_SMP */
 
 /*
@@ -1331,12 +1334,25 @@ static int wake_idle(int cpu, struct task_struct *p)
 	cpumask_t tmp;
 	struct sched_domain *sd;
 	int i;
+	unsigned long long now;
 
-	if (idle_cpu(cpu))
+	/*
+	 * If it is idle, then it is the best cpu to run this task.
+	 *
+	 * This cpu is also the best, if it has more than one task already.
+	 * Siblings must be also busy(in most cases) as they didn't already
+	 * pickup the extra load from this cpu and hence we need not check
+	 * sibling runqueue info. This will avoid the checks and cache miss
+	 * penalities associated with that.
+	 */
+	if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
 		return cpu;
 
+	now = sched_clock();
 	for_each_domain(cpu, sd) {
-		if (sd->flags & SD_WAKE_IDLE) {
+		if ((sd->flags & SD_WAKE_IDLE)
+		    || ((sd->flags & SD_WAKE_IDLE_FAR)
+			&& !task_hot(p, now, sd))) {
 			cpus_and(tmp, sd->span, p->cpus_allowed);
 			for_each_cpu_mask(i, tmp) {
 				if (idle_cpu(i))
@@ -6245,11 +6261,45 @@ next_sg:
 	}
 }
 
+static int default_relax_domain_level = -1;
+
+static int __init setup_relax_domain_level(char *str)
+{
+	unsigned long val;
+
+	val = simple_strtoul(str, NULL, 0);
+	if (val < SD_LV_MAX)
+		default_relax_domain_level = val;
+
+	return 1;
+}
+__setup("relax_domain_level=", setup_relax_domain_level);
+
+static void set_domain_attribute(struct sched_domain *sd, int level, int *attr)
+{
+	int request;
+
+	if (!attr || *attr < 0) {
+		if (default_relax_domain_level < 0)
+			return;
+		else
+			request = default_relax_domain_level;
+	} else
+		request = *attr;
+	if (request < level) {
+		/* turn off idle balance on this domain */
+		sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE);
+	} else {
+		/* turn on idle balance on this domain */
+		sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE);
+	}
+}
+
 /*
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
  */
-static int build_sched_domains(const cpumask_t *cpu_map)
+static int __build_sched_domains(const cpumask_t *cpu_map, int *attr)
 {
 	int i;
 	struct sched_group *sched_group_phys = NULL;
@@ -6300,6 +6350,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 			}
 			sd = &per_cpu(allnodes_domains, i);
 			*sd = SD_ALLNODES_INIT;
+			set_domain_attribute(sd, SD_LV_ALLNODES, attr);
 			sd->span = *cpu_map;
 			group = cpu_to_allnodes_group(i);
 			sd->groups = &sched_group_allnodes[group];
@@ -6309,6 +6360,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 
 		sd = &per_cpu(node_domains, i);
 		*sd = SD_NODE_INIT;
+		set_domain_attribute(sd, SD_LV_NODE, attr);
 		sd->span = sched_domain_node_span(cpu_to_node(i));
 		sd->parent = p;
 		cpus_and(sd->span, sd->span, *cpu_map);
@@ -6330,6 +6382,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 		sd = &per_cpu(phys_domains, i);
 		group = cpu_to_phys_group(i);
 		*sd = SD_CPU_INIT;
+		set_domain_attribute(sd, SD_LV_CPU, attr);
 		sd->span = nodemask;
 		sd->parent = p;
 		sd->groups = &sched_group_phys[group];
@@ -6351,6 +6404,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 		sd = &per_cpu(core_domains, i);
 		group = cpu_to_core_group(i);
 		*sd = SD_MC_INIT;
+		set_domain_attribute(sd, SD_LV_MC, attr);
 		sd->span = cpu_coregroup_map(i);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
@@ -6362,6 +6416,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 		sd = &per_cpu(cpu_domains, i);
 		group = cpu_to_cpu_group(i);
 		*sd = SD_SIBLING_INIT;
+		set_domain_attribute(sd, SD_LV_SIBLING, attr);
 		sd->span = cpu_sibling_map[i];
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
@@ -6586,6 +6641,12 @@ error:
 	free_sched_groups(cpu_map);
 	return -ENOMEM;
 }
+
+static int build_sched_domains(const cpumask_t *cpu_map)
+{
+	return __build_sched_domains(cpu_map, NULL);
+}
+
 /*
  * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
  */
@@ -6637,7 +6698,8 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
  * correct sched domains
  * Call with hotplug lock held
  */
-int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
+int __partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2,
+				int *attr1, int *attr2)
 {
 	cpumask_t change_map;
 	int err = 0;
@@ -6649,13 +6711,18 @@ int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
 	/* Detach sched domains from all of the affected cpus */
 	detach_destroy_domains(&change_map);
 	if (!cpus_empty(*partition1))
-		err = build_sched_domains(partition1);
+		err = __build_sched_domains(partition1, attr1);
 	if (!err && !cpus_empty(*partition2))
-		err = build_sched_domains(partition2);
+		err = __build_sched_domains(partition2, attr2);
 
 	return err;
 }
 
+int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
+{
+	return __partition_sched_domains(partition1, partition2, NULL, NULL);
+}
+
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 int arch_reinit_sched_domains(void)
 {