Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > eab357269fb8735c5e1a2938e6c77cae > files > 2366

kernel-2.6.18-164.10.1.el5.src.rpm

From: AMEET M. PARANJAPE <aparanja@redhat.com>
Date: Tue, 31 Mar 2009 11:05:30 -0400
Subject: [ppc] implement a quota system for MSIs
Message-id: 20090331150318.14954.49408.sendpatchset@squad5-lp1.lab.bos.redhat.com
O-Subject: [PATCH RHEL5.4 BZ492580 7/8]
Bugzilla: 492580
RH-Acked-by: David Howells <dhowells@redhat.com>

RHBZ#:
======
https://bugzilla.redhat.com/show_bug.cgi?id=492580

Description:
===========
There are hardware limitations on the number of available MSIs,
which firmware expresses using a property named "ibm,pe-total-#msi".
This property tells us how many MSIs are available for devices below
the point in the PCI tree where we find the property.

For old firmwares which don't have the property, we assume there are
8 MSIs available per "partitionable endpoint" (PE). The PE can be
found using existing EEH code, which uses the methods described in
PAPR. For our purposes we want the parent of the node that's
identified using this method.

When a driver requests n MSIs for a device, we first establish where
the "ibm,pe-total-#msi" property above that device is, or we find the
PE if the property is not found. In both cases we call this node
the "pe_dn".

We then count all non-bridge devices below the pe_dn, to establish
how many devices in total may need MSIs. The quota is then simply the
total available divided by the number of devices, if the request is
less than or equal to the quota, the request is fine and we're done.

If the request is greater than the quota, we try to determine if there
are any "spare" MSIs which we can give to this device. Spare MSIs are
found by looking for other devices which can never use their full
quota, because their "req#msi(-x)" property is less than the quota.

If we find any spare, we divide the spares by the number of devices
that could request more than their quota. This ensures the spare
MSIs are spread evenly amongst all over-quota requestors.

RHEL Version Found:
================
RHEL 5.3

kABI Status:
============
No symbols were harmed.

Brew:
=====
Built on all platforms.
http://brewweb.devel.redhat.com/brew/taskinfo?taskID=1744271

Upstream Status:
================
http://git.kernel.org/?p=linux/kernel/git/benh/powerpc.git;a=commit;h=448e2ca0e32a5c437650d634b6032ab732662338

Test Status:
============
There is a testcase provided in the Bugzilla.  A 'cat /proc/interrupts' should
show the device drivers has two interrupt entries for each port.  Run I/O
tests.  Then 'lspci -vv' should display the correct pci capabilities for the
installed adapter and show the adapter is using MSI-X interrupts opposed to MSO
or LSI interrupts.

===============================================================
Ameet Paranjape 978-392-3903 ext 23903
IBM on-site partner

Proposed Patch:
===============

diff --git a/arch/powerpc/kernel/msi-rtas.c b/arch/powerpc/kernel/msi-rtas.c
index 05e7b02..9de7a33 100644
--- a/arch/powerpc/kernel/msi-rtas.c
+++ b/arch/powerpc/kernel/msi-rtas.c
@@ -172,12 +172,189 @@ static int check_req_msix(struct pci_dev *pdev, int nvec)
 	return check_req(pdev, nvec, "ibm,req#msi-x");
 }
 
+/* Quota calculation */
+
+static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total)
+{
+	struct device_node *dn, *parent;
+	const u32 *p;
+
+	dn = of_node_get(pci_device_to_OF_node(dev));
+	while (dn) {
+		p = of_get_property(dn, "ibm,pe-total-#msi", NULL);
+		if (p) {
+			pr_debug("rtas_msi: found prop on dn %s\n",
+				dn->full_name);
+			*total = *p;
+			return dn;
+		}
+
+		parent = of_get_parent(dn);
+		of_node_put(dn);
+		dn = parent;
+	}
+
+	return NULL;
+}
+
+static struct device_node *find_pe_dn(struct pci_dev *dev, int *total)
+{
+	struct device_node *dn;
+
+	/* Found our PE and assume 8 at that point. */
+
+	dn = pci_device_to_OF_node(dev);
+	if (!dn)
+		return NULL;
+
+	dn = find_device_pe(dn);
+	if (!dn)
+		return NULL;
+
+	/* We actually want the parent */
+	dn = of_get_parent(dn);
+	if (!dn)
+		return NULL;
+
+	/* Hardcode of 8 for old firmwares */
+	*total = 8;
+	pr_debug("rtas_msi: using PE dn %s\n", dn->full_name);
+
+	return dn;
+}
+
+struct msi_counts {
+	struct device_node *requestor;
+	int num_devices;
+	int request;
+	int quota;
+	int spare;
+	int over_quota;
+};
+
+static void *count_non_bridge_devices(struct device_node *dn, void *data)
+{
+	struct msi_counts *counts = data;
+	const u32 *p;
+	u32 class;
+
+	pr_debug("rtas_msi: counting %s\n", dn->full_name);
+
+	p = of_get_property(dn, "class-code", NULL);
+	class = p ? *p : 0;
+
+	if ((class >> 8) != PCI_CLASS_BRIDGE_PCI)
+		counts->num_devices++;
+
+	return NULL;
+}
+
+static void *count_spare_msis(struct device_node *dn, void *data)
+{
+	struct msi_counts *counts = data;
+	const u32 *p;
+	int req;
+
+	if (dn == counts->requestor)
+		req = counts->request;
+	else {
+		/* We don't know if a driver will try to use MSI or MSI-X,
+		 * so we just have to punt and use the larger of the two. */
+		req = 0;
+		p = of_get_property(dn, "ibm,req#msi", NULL);
+		if (p)
+			req = *p;
+
+		p = of_get_property(dn, "ibm,req#msi-x", NULL);
+		if (p)
+			req = max(req, (int)*p);
+	}
+
+	if (req < counts->quota)
+		counts->spare += counts->quota - req;
+	else if (req > counts->quota)
+		counts->over_quota++;
+
+	return NULL;
+}
+
+static int msi_quota_for_device(struct pci_dev *dev, int request)
+{
+	struct device_node *pe_dn;
+	struct msi_counts counts;
+	int total;
+
+	pr_debug("rtas_msi: calc quota for %s, request %d\n", pci_name(dev),
+		  request);
+
+	pe_dn = find_pe_total_msi(dev, &total);
+	if (!pe_dn)
+		pe_dn = find_pe_dn(dev, &total);
+
+	if (!pe_dn) {
+		printk(KERN_ERR "rtas_msi: couldn't find PE for %s\n",
+		       pci_name(dev));
+		goto out;
+	}
+
+	pr_debug("rtas_msi: found PE %s\n", pe_dn->full_name);
+
+	memset(&counts, 0, sizeof(struct msi_counts));
+
+	/* Work out how many devices we have below this PE */
+	traverse_pci_devices(pe_dn, count_non_bridge_devices, &counts);
+
+	if (counts.num_devices == 0) {
+		printk(KERN_ERR "rtas_msi: found 0 devices under PE for %s\n",
+		       pci_name(dev));
+		goto out;
+	}
+
+	counts.quota = total / counts.num_devices;
+	if (request <= counts.quota)
+		goto out;
+
+	/* else, we have some more calculating to do */
+	counts.requestor = pci_device_to_OF_node(dev);
+	counts.request = request;
+	traverse_pci_devices(pe_dn, count_spare_msis, &counts);
+
+	/* If the quota isn't an integer multiple of the total, we can
+	 * use the remainder as spare MSIs for anyone that wants them. */
+	counts.spare += total % counts.num_devices;
+
+	/* Divide any spare by the number of over-quota requestors */
+	if (counts.over_quota)
+		counts.quota += counts.spare / counts.over_quota;
+
+	/* And finally clamp the request to the possibly adjusted quota */
+	request = min(counts.quota, request);
+
+	pr_debug("rtas_msi: request clamped to quota %d\n", request);
+out:
+	of_node_put(pe_dn);
+
+	return request;
+}
+
 static int rtas_msi_check_device(struct pci_dev *pdev, int nvec, int type)
 {
+	int quota, rc;
+
 	if (type == PCI_CAP_ID_MSIX)
-		return check_req_msix(pdev, nvec);
+		rc = check_req_msix(pdev, nvec);
+	else
+		rc = check_req_msi(pdev, nvec);
+
+	if (rc)
+		return rc;
 
-	return check_req_msi(pdev, nvec);
+	quota = msi_quota_for_device(pdev, nvec);
+
+	if (quota && quota < nvec)
+		return quota;
+
+	return 0;
 }
 
 static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)