Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > 27922b4260f65d317aabda37e42bbbff > files > 498

kernel-2.6.18-238.el5.src.rpm

From: Doug Chapman <dchapman@redhat.com>
Date: Thu, 6 Dec 2007 16:14:01 -0500
Subject: [cpufreq] rewrite lock to eliminate hotplug issues
Message-id: 1196975641.1083.3.camel@athlon
O-Subject: [RHEL 5.2 PATCH 1/5] hang at 'Disabling ondemand cpu frequency scaling'
Bugzilla: 253416

Patch 1/5 for BZ 253416

Backport of:

commit 5a01f2e8f3ac134e24144d74bb48a60236f7024d
Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Date:   Mon Feb 5 16:12:44 2007 -0800

    [CPUFREQ] Rewrite lock in cpufreq to eliminate cpufreq/hotplug related issues

    Yet another attempt to resolve cpufreq and hotplug locking issues.

    Patchset has 3 patches:
    * Rewrite the lock infrastructure of cpufreq using a per cpu rwsem.
    * Minor restructuring of work callback in ondemand driver.
    * Use the new cpufreq rwsem infrastructure in ondemand work.

    This patch:

    Convert policy->lock to rwsem and move it to per_cpu area.
    This rwsem will protect against both changing/accessing policy
    related parameters and CPU hot plug/unplug.

Acked-by: Brian Maly <bmaly@redhat.com>
Acked-by: Prarit Bhargava <prarit@redhat.com>

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index b3df613..66b274d 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -40,8 +40,67 @@ static struct cpufreq_driver *cpufreq_driver;
 static struct cpufreq_policy *cpufreq_cpu_data[NR_CPUS];
 static DEFINE_SPINLOCK(cpufreq_driver_lock);
 
+/*
+ * cpu_policy_rwsem is a per CPU reader-writer semaphore designed to cure
+ * all cpufreq/hotplug/workqueue/etc related lock issues.
+ *
+ * The rules for this semaphore:
+ * - Any routine that wants to read from the policy structure will
+ *   do a down_read on this semaphore.
+ * - Any routine that will write to the policy structure and/or may take away
+ *   the policy altogether (eg. CPU hotplug), will hold this lock in write
+ *   mode before doing so.
+ *
+ * Additional rules:
+ * - All holders of the lock should check to make sure that the CPU they
+ *   are concerned with are online after they get the lock.
+ * - Governor routines that can be called in cpufreq hotplug path should not
+ *   take this sem as top level hotplug notifier handler takes this.
+ */
+static DEFINE_PER_CPU(int, policy_cpu);
+static DEFINE_PER_CPU(struct rw_semaphore, cpu_policy_rwsem);
+
+#define lock_policy_rwsem(mode, cpu)                                   \
+int lock_policy_rwsem_##mode                                           \
+(int cpu)                                                              \
+{                                                                      \
+	int policy_cpu = per_cpu(policy_cpu, cpu);                      \
+	BUG_ON(policy_cpu == -1);                                       \
+	down_##mode(&per_cpu(cpu_policy_rwsem, policy_cpu));            \
+	if (unlikely(!cpu_online(cpu))) {                               \
+		up_##mode(&per_cpu(cpu_policy_rwsem, policy_cpu));      \
+		return -1;                                              \
+	}                                                               \
+									\
+	return 0;                                                       \
+}
+
+lock_policy_rwsem(read, cpu);
+EXPORT_SYMBOL_GPL(lock_policy_rwsem_read);
+
+lock_policy_rwsem(write, cpu);
+EXPORT_SYMBOL_GPL(lock_policy_rwsem_write);
+
+void unlock_policy_rwsem_read(int cpu)
+{
+	int policy_cpu = per_cpu(policy_cpu, cpu);
+	BUG_ON(policy_cpu == -1);
+	up_read(&per_cpu(cpu_policy_rwsem, policy_cpu));
+}
+EXPORT_SYMBOL_GPL(unlock_policy_rwsem_read);
+
+void unlock_policy_rwsem_write(int cpu)
+{
+	int policy_cpu = per_cpu(policy_cpu, cpu);
+	BUG_ON(policy_cpu == -1);
+	up_write(&per_cpu(cpu_policy_rwsem, policy_cpu));
+}
+EXPORT_SYMBOL_GPL(unlock_policy_rwsem_write);
+
+
 /* internal prototypes */
 static int __cpufreq_governor(struct cpufreq_policy *policy, unsigned int event);
+static unsigned int __cpufreq_get(unsigned int cpu);
 static void handle_update(void *data);
 
 /**
@@ -395,10 +454,8 @@ static ssize_t store_##file_name					\
 		return -EINVAL;						\
 									\
 	lock_cpu_hotplug();						\
-	mutex_lock(&policy->lock);					\
 	ret = __cpufreq_set_policy(policy, &new_policy);		\
 	policy->user_policy.object = policy->object;			\
-	mutex_unlock(&policy->lock);					\
 	unlock_cpu_hotplug();						\
 									\
 	return ret ? ret : count;					\
@@ -412,7 +469,7 @@ store_one(scaling_max_freq,max);
  */
 static ssize_t show_cpuinfo_cur_freq (struct cpufreq_policy * policy, char *buf)
 {
-	unsigned int cur_freq = cpufreq_get(policy->cpu);
+	unsigned int cur_freq = __cpufreq_get(policy->cpu);
 	if (!cur_freq)
 		return sprintf(buf, "<unknown>");
 	return sprintf(buf, "%u\n", cur_freq);
@@ -459,12 +516,10 @@ static ssize_t store_scaling_governor (struct cpufreq_policy * policy,
 
 	/* Do not use cpufreq_set_policy here or the user_policy.max
 	   will be wrongly overridden */
-	mutex_lock(&policy->lock);
 	ret = __cpufreq_set_policy(policy, &new_policy);
 
 	policy->user_policy.policy = policy->policy;
 	policy->user_policy.governor = policy->governor;
-	mutex_unlock(&policy->lock);
 
 	unlock_cpu_hotplug();
 
@@ -568,7 +623,14 @@ static ssize_t show(struct kobject * kobj, struct attribute * attr ,char * buf)
 	policy = cpufreq_cpu_get(policy->cpu);
 	if (!policy)
 		return -EINVAL;
+
+	if (lock_policy_rwsem_read(policy->cpu) < 0)
+		return -EINVAL;
+
 	ret = fattr->show ? fattr->show(policy,buf) : -EIO;
+
+	unlock_policy_rwsem_read(policy->cpu);
+
 	cpufreq_cpu_put(policy);
 	return ret;
 }
@@ -582,7 +644,14 @@ static ssize_t store(struct kobject * kobj, struct attribute * attr,
 	policy = cpufreq_cpu_get(policy->cpu);
 	if (!policy)
 		return -EINVAL;
+
+	if (lock_policy_rwsem_write(policy->cpu) < 0)
+		return -EINVAL;
+
 	ret = fattr->store ? fattr->store(policy,buf,count) : -EIO;
+
+	unlock_policy_rwsem_write(policy->cpu);
+
 	cpufreq_cpu_put(policy);
 	return ret;
 }
@@ -656,8 +725,10 @@ static int cpufreq_add_dev (struct sys_device * sys_dev)
 	policy->cpu = cpu;
 	policy->cpus = cpumask_of_cpu(cpu);
 
-	mutex_init(&policy->lock);
-	mutex_lock(&policy->lock);
+	/* Initially set CPU itself as the policy_cpu */
+	per_cpu(policy_cpu, cpu) = cpu;
+	lock_policy_rwsem_write(cpu);
+
 	init_completion(&policy->kobj_unregister);
 	INIT_WORK(&policy->update, handle_update, (void *)(long)cpu);
 
@@ -667,7 +738,7 @@ static int cpufreq_add_dev (struct sys_device * sys_dev)
 	ret = cpufreq_driver->init(policy);
 	if (ret) {
 		dprintk("initialization failed\n");
-		mutex_unlock(&policy->lock);
+		unlock_policy_rwsem_write(cpu);
 		goto err_out;
 	}
 
@@ -681,6 +752,14 @@ static int cpufreq_add_dev (struct sys_device * sys_dev)
 		 */
 		managed_policy = cpufreq_cpu_get(j);
 		if (unlikely(managed_policy)) {
+
+			/* Set proper policy_cpu */
+			unlock_policy_rwsem_write(cpu);
+			per_cpu(policy_cpu, cpu) = managed_policy->cpu;
+
+			if (lock_policy_rwsem_write(cpu) < 0)
+				goto err_out_driver_exit;
+
 			spin_lock_irqsave(&cpufreq_driver_lock, flags);
 			managed_policy->cpus = policy->cpus;
 			cpufreq_cpu_data[cpu] = managed_policy;
@@ -691,8 +770,8 @@ static int cpufreq_add_dev (struct sys_device * sys_dev)
 					  &managed_policy->kobj, "cpufreq");
 
 			cpufreq_debug_enable_ratelimit();
-			mutex_unlock(&policy->lock);
 			ret = 0;
+			unlock_policy_rwsem_write(cpu);
 			goto err_out_driver_exit; /* call driver->exit() */
 		}
 	}
@@ -706,7 +785,7 @@ static int cpufreq_add_dev (struct sys_device * sys_dev)
 
 	ret = kobject_register(&policy->kobj);
 	if (ret) {
-		mutex_unlock(&policy->lock);
+		unlock_policy_rwsem_write(cpu);
 		goto err_out_driver_exit;
 	}
 	/* set up files for this cpu device */
@@ -721,8 +800,10 @@ static int cpufreq_add_dev (struct sys_device * sys_dev)
 		sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr);
 
 	spin_lock_irqsave(&cpufreq_driver_lock, flags);
-	for_each_cpu_mask(j, policy->cpus)
+	for_each_cpu_mask(j, policy->cpus) {
 		cpufreq_cpu_data[j] = policy;
+		per_cpu(policy_cpu, j) = policy->cpu;
+	}
 	spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
 
 	/* symlink affected CPUs */
@@ -741,7 +822,7 @@ static int cpufreq_add_dev (struct sys_device * sys_dev)
 
 	policy->governor = NULL; /* to assure that the starting sequence is
 				  * run in cpufreq_set_policy */
-	mutex_unlock(&policy->lock);
+	unlock_policy_rwsem_write(cpu);
 
 	/* set default policy */
 	ret = cpufreq_set_policy(&new_policy);
@@ -782,11 +863,13 @@ module_out:
 
 
 /**
- * cpufreq_remove_dev - remove a CPU device
+ * __cpufreq_remove_dev - remove a CPU device
  *
  * Removes the cpufreq interface for a CPU device.
+ * Caller should already have policy_rwsem in write mode for this CPU.
+ * This routine frees the rwsem before returning.
  */
-static int cpufreq_remove_dev (struct sys_device * sys_dev)
+static int __cpufreq_remove_dev (struct sys_device * sys_dev)
 {
 	unsigned int cpu = sys_dev->id;
 	unsigned long flags;
@@ -805,6 +888,7 @@ static int cpufreq_remove_dev (struct sys_device * sys_dev)
 	if (!data) {
 		spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
 		cpufreq_debug_enable_ratelimit();
+		unlock_policy_rwsem_write(cpu);
 		return -EINVAL;
 	}
 	cpufreq_cpu_data[cpu] = NULL;
@@ -821,6 +905,7 @@ static int cpufreq_remove_dev (struct sys_device * sys_dev)
 		sysfs_remove_link(&sys_dev->kobj, "cpufreq");
 		cpufreq_cpu_put(data);
 		cpufreq_debug_enable_ratelimit();
+		unlock_policy_rwsem_write(cpu);
 		return 0;
 	}
 #endif
@@ -829,6 +914,7 @@ static int cpufreq_remove_dev (struct sys_device * sys_dev)
 	if (!kobject_get(&data->kobj)) {
 		spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
 		cpufreq_debug_enable_ratelimit();
+		unlock_policy_rwsem_write(cpu);
 		return -EFAULT;
 	}
 
@@ -862,10 +948,10 @@ static int cpufreq_remove_dev (struct sys_device * sys_dev)
 	spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
 #endif
 
-	mutex_lock(&data->lock);
 	if (cpufreq_driver->target)
 		__cpufreq_governor(data, CPUFREQ_GOV_STOP);
-	mutex_unlock(&data->lock);
+
+	unlock_policy_rwsem_write(cpu);
 
 	kobject_unregister(&data->kobj);
 
@@ -888,6 +974,17 @@ static int cpufreq_remove_dev (struct sys_device * sys_dev)
 	return 0;
 }
 
+static int cpufreq_remove_dev (struct sys_device * sys_dev)
+{
+	unsigned int cpu = sys_dev->id;
+	int retval;
+	if (unlikely(lock_policy_rwsem_write(cpu)))
+		BUG();
+
+	retval = __cpufreq_remove_dev(sys_dev);
+	return retval;
+}
+
 
 static void handle_update(void *data)
 {
@@ -933,9 +1030,12 @@ unsigned int cpufreq_quick_get(unsigned int cpu)
 	unsigned int ret = 0;
 
 	if (policy) {
-		mutex_lock(&policy->lock);
+		if (unlikely(lock_policy_rwsem_read(cpu)))
+			return ret;
+
 		ret = policy->cur;
-		mutex_unlock(&policy->lock);
+
+		unlock_policy_rwsem_read(cpu);
 		cpufreq_cpu_put(policy);
 	}
 
@@ -944,24 +1044,13 @@ unsigned int cpufreq_quick_get(unsigned int cpu)
 EXPORT_SYMBOL(cpufreq_quick_get);
 
 
-/**
- * cpufreq_get - get the current CPU frequency (in kHz)
- * @cpu: CPU number
- *
- * Get the CPU current (static) CPU frequency
- */
-unsigned int cpufreq_get(unsigned int cpu)
+static unsigned int __cpufreq_get(unsigned int cpu)
 {
-	struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
+	struct cpufreq_policy *policy = cpufreq_cpu_data[cpu];
 	unsigned int ret = 0;
 
-	if (!policy)
-		return 0;
-
 	if (!cpufreq_driver->get)
-		goto out;
-
-	mutex_lock(&policy->lock);
+		return (ret);
 
 	ret = cpufreq_driver->get(cpu);
 
@@ -973,16 +1062,40 @@ unsigned int cpufreq_get(unsigned int cpu)
 		}
 	}
 
-	mutex_unlock(&policy->lock);
+	return (ret);
+}
 
-out:
-	cpufreq_cpu_put(policy);
 
-	return (ret);
+/**
+ * cpufreq_get - get the current CPU frequency (in kHz)
+ * @cpu: CPU number
+ *
+ * Get the CPU current (static) CPU frequency
+ */
+unsigned int cpufreq_get(unsigned int cpu)
+{
+	unsigned int ret_freq = 0;
+	struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
+
+	if (!policy)
+		goto out;
+
+	if (unlikely(lock_policy_rwsem_read(cpu)))
+		goto out_policy;
+
+	ret_freq = __cpufreq_get(cpu);
+
+	unlock_policy_rwsem_read(cpu);
+
+out_policy:
+	cpufreq_cpu_put(policy);
+out:
+	return (ret_freq);
 }
 EXPORT_SYMBOL(cpufreq_get);
 
 
+
 /**
  *	cpufreq_suspend - let the low level driver prepare for suspend
  */
@@ -1256,11 +1369,12 @@ int cpufreq_driver_target(struct cpufreq_policy *policy,
 		return -EINVAL;
 
 	lock_cpu_hotplug();
-	mutex_lock(&policy->lock);
+	if (unlikely(lock_policy_rwsem_write(policy->cpu)))
+		return -EINVAL;
 
 	ret = __cpufreq_driver_target(policy, target_freq, relation);
 
-	mutex_unlock(&policy->lock);
+	unlock_policy_rwsem_write(policy->cpu);
 	unlock_cpu_hotplug();
 
 	cpufreq_cpu_put(policy);
@@ -1348,9 +1462,7 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu)
 	if (!cpu_policy)
 		return -EINVAL;
 
-	mutex_lock(&cpu_policy->lock);
 	memcpy(policy, cpu_policy, sizeof(struct cpufreq_policy));
-	mutex_unlock(&cpu_policy->lock);
 
 	cpufreq_cpu_put(cpu_policy);
 	return 0;
@@ -1462,8 +1574,9 @@ int cpufreq_set_policy(struct cpufreq_policy *policy)
 
 	lock_cpu_hotplug();
 
-	/* lock this CPU */
-	mutex_lock(&data->lock);
+	if (unlikely(lock_policy_rwsem_write(policy->cpu)))
+		return -EINVAL;
+
 
 	ret = __cpufreq_set_policy(data, policy);
 	data->user_policy.min = data->min;
@@ -1471,7 +1584,7 @@ int cpufreq_set_policy(struct cpufreq_policy *policy)
 	data->user_policy.policy = data->policy;
 	data->user_policy.governor = data->governor;
 
-	mutex_unlock(&data->lock);
+	unlock_policy_rwsem_write(policy->cpu);
 
 	unlock_cpu_hotplug();
 	cpufreq_cpu_put(data);
@@ -1498,7 +1611,8 @@ int cpufreq_update_policy(unsigned int cpu)
 		return -ENODEV;
 
 	lock_cpu_hotplug();
-	mutex_lock(&data->lock);
+	if (unlikely(lock_policy_rwsem_write(cpu)))
+		return -EINVAL;
 
 	dprintk("updating policy for CPU %u\n", cpu);
 	memcpy(&policy, data, sizeof(struct cpufreq_policy));
@@ -1522,7 +1636,8 @@ int cpufreq_update_policy(unsigned int cpu)
 
 	ret = __cpufreq_set_policy(data, &policy);
 
-	mutex_unlock(&data->lock);
+	unlock_policy_rwsem_write(cpu);
+
 	unlock_cpu_hotplug();
 	cpufreq_cpu_put(data);
 	return ret;
@@ -1551,14 +1666,18 @@ static int cpufreq_cpu_callback(struct notifier_block *nfb,
 			 * hardware-managed P-State to switch other related
 			 * threads to min or higher speeds if possible.
 			 */
+			if (unlikely(lock_policy_rwsem_write(cpu)))
+				BUG();
+
 			policy = cpufreq_cpu_data[cpu];
 			if (policy) {
-				cpufreq_driver_target(policy, policy->min,
+				__cpufreq_driver_target(policy, policy->min,
 						CPUFREQ_RELATION_H);
 			}
+			__cpufreq_remove_dev(sys_dev);
 			break;
-		case CPU_DEAD:
-			cpufreq_remove_dev(sys_dev);
+		case CPU_DOWN_FAILED:
+			cpufreq_add_dev(sys_dev);
 			break;
 		}
 	}
@@ -1671,3 +1790,16 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver)
 	return 0;
 }
 EXPORT_SYMBOL_GPL(cpufreq_unregister_driver);
+
+static int __init cpufreq_core_init(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		per_cpu(policy_cpu, cpu) = -1;
+		init_rwsem(&per_cpu(cpu_policy_rwsem, cpu));
+	}
+	return 0;
+}
+
+core_initcall(cpufreq_core_init);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 4ea39fe..84fc92e 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -84,9 +84,6 @@ struct cpufreq_policy {
         unsigned int		policy; /* see above */
 	struct cpufreq_governor	*governor; /* see below */
 
- 	struct mutex		lock;   /* CPU ->setpolicy or ->target may
-					   only be called once a time */
-
 	struct work_struct	update; /* if update_policy() needs to be
 					 * called, but you're in IRQ context */
 
@@ -175,6 +172,11 @@ extern int __cpufreq_driver_target(struct cpufreq_policy *policy,
 int cpufreq_register_governor(struct cpufreq_governor *governor);
 void cpufreq_unregister_governor(struct cpufreq_governor *governor);
 
+int lock_policy_rwsem_read(int cpu);
+int lock_policy_rwsem_write(int cpu);
+void unlock_policy_rwsem_read(int cpu);
+void unlock_policy_rwsem_write(int cpu);
+
 
 /*********************************************************************
  *                      CPUFREQ DRIVER INTERFACE                     *