kernel-2.6.18-194.11.1.el5.src.rpm

From: Doug Ledford <dledford@redhat.com>
Date: Tue, 14 Apr 2009 15:23:31 -0400
Subject: [openib] core: update core code to OFED 1.4.1-rc3
Message-id: 1239737023-31222-5-git-send-email-dledford@redhat.com
O-Subject: [Patch RHEL5.4 04/16] [infiniband/core] Update core code to OFED 1.4.1-rc3
Bugzilla: 476301

Signed-off-by: Doug Ledford <dledford@redhat.com>
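
One of the larger functional additions in the cm.c hunks below is a set of
per-port CM MAD counters, grouped under a new "infiniband_cm" sysfs class
into cm_tx_msgs, cm_tx_retries, cm_rx_msgs and cm_rx_duplicates kobjects,
each holding one counter per CM attribute (req, rep, rtu, dreq, ...).  As a
minimal illustration only (not part of the patch), the userspace sketch
below reads one such counter; the HCA name and port number are assumptions,
while the class, group and attribute names come from the code added in cm.c.

/*
 * Hypothetical example: read the received-REQ counter for port 1 of an
 * assumed HCA named "mlx4_0".  The path layout follows the kobjects
 * created by cm_add_one()/cm_create_port_fs() in this patch.
 */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *path = "/sys/class/infiniband_cm/mlx4_0/1/cm_rx_msgs/req";
	char buf[32];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("REQs received: %lu\n", strtoul(buf, NULL, 10));
	fclose(f);
	return 0;
}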

diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index f646040..b2e522d 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -30,3 +30,5 @@ ib_umad-y :=			user_mad.o
 ib_ucm-y :=			ucm.o
 
 ib_uverbs-y :=			uverbs_main.o uverbs_cmd.o uverbs_marshall.o
+
+ib_core-y +=			namespace.o writeback.o
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index 485f81e..09a2bec 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -4,28 +4,33 @@
  * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
  * Copyright (c) 2005 Intel Corporation.  All rights reserved.
  *
- * This Software is licensed under one of the following licenses:
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
  *
- * 1) under the terms of the "Common Public License 1.0" a copy of which is
- *    available from the Open Source Initiative, see
- *    http://www.opensource.org/licenses/cpl.php.
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
  *
- * 2) under the terms of the "The BSD License" a copy of which is
- *    available from the Open Source Initiative, see
- *    http://www.opensource.org/licenses/bsd-license.php.
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
  *
- * 3) under the terms of the "GNU General Public License (GPL) Version 2" a
- *    copy of which is available from the Open Source Initiative, see
- *    http://www.opensource.org/licenses/gpl-license.php.
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
  *
- * Licensee has the right to choose one of the above licenses.
- *
- * Redistributions of source code must retain the above copyright
- * notice and one of the license notices.
- *
- * Redistributions in binary form must reproduce both the above copyright
- * notice, one of the license notices in the documentation
- * and/or other materials provided with the distribution.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #include <linux/mutex.h>
@@ -55,7 +60,7 @@ struct addr_req {
 	int status;
 };
 
-static void process_req(void *work);
+static void process_req(struct work_struct *work);
 
 static DEFINE_MUTEX(lock);
 static LIST_HEAD(req_list);
@@ -100,6 +105,7 @@ int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev,
 	memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN);
 	if (dst_dev_addr)
 		memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN);
+	dev_addr->src_dev = dev;
 	return 0;
 }
 EXPORT_SYMBOL(rdma_copy_addr);
@@ -110,7 +116,7 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr)
 	__be32 ip = ((struct sockaddr_in *) addr)->sin_addr.s_addr;
 	int ret;
 
-	dev = ip_dev_find(ip);
+	dev = ip_dev_find(&init_net, ip);
 	if (!dev)
 		return -EADDRNOTAVAIL;
 
@@ -154,11 +160,11 @@ static void addr_send_arp(struct sockaddr_in *dst_in)
 {
 	struct rtable *rt;
 	struct flowi fl;
-	u32 dst_ip = dst_in->sin_addr.s_addr;
+	__be32 dst_ip = dst_in->sin_addr.s_addr;
 
 	memset(&fl, 0, sizeof fl);
 	fl.nl_u.ip4_u.daddr = dst_ip;
-	if (ip_route_output_key(&rt, &fl))
+	if (ip_route_output_key(&init_net, &rt, &fl))
 		return;
 
 	neigh_event_send(rt->u.dst.neighbour, NULL);
@@ -169,8 +175,8 @@ static int addr_resolve_remote(struct sockaddr_in *src_in,
 			       struct sockaddr_in *dst_in,
 			       struct rdma_dev_addr *addr)
 {
-	u32 src_ip = src_in->sin_addr.s_addr;
-	u32 dst_ip = dst_in->sin_addr.s_addr;
+	__be32 src_ip = src_in->sin_addr.s_addr;
+	__be32 dst_ip = dst_in->sin_addr.s_addr;
 	struct flowi fl;
 	struct rtable *rt;
 	struct neighbour *neigh;
@@ -179,7 +185,7 @@ static int addr_resolve_remote(struct sockaddr_in *src_in,
 	memset(&fl, 0, sizeof fl);
 	fl.nl_u.ip4_u.daddr = dst_ip;
 	fl.nl_u.ip4_u.saddr = src_ip;
-	ret = ip_route_output_key(&rt, &fl);
+	ret = ip_route_output_key(&init_net, &rt, &fl);
 	if (ret)
 		goto out;
 
@@ -214,7 +220,7 @@ out:
 	return ret;
 }
 
-static void process_req(void *work)
+static void process_req(struct work_struct *work)
 {
 	struct addr_req *req, *temp_req;
 	struct sockaddr_in *src_in, *dst_in;
@@ -257,19 +263,19 @@ static int addr_resolve_local(struct sockaddr_in *src_in,
 			      struct rdma_dev_addr *addr)
 {
 	struct net_device *dev;
-	u32 src_ip = src_in->sin_addr.s_addr;
+	__be32 src_ip = src_in->sin_addr.s_addr;
 	__be32 dst_ip = dst_in->sin_addr.s_addr;
 	int ret;
 
-	dev = ip_dev_find(dst_ip);
+	dev = ip_dev_find(&init_net, dst_ip);
 	if (!dev)
 		return -EADDRNOTAVAIL;
 
-	if (ZERONET(src_ip)) {
+	if (ipv4_is_zeronet(src_ip)) {
 		src_in->sin_family = dst_in->sin_family;
 		src_in->sin_addr.s_addr = dst_ip;
 		ret = rdma_copy_addr(addr, dev, dev->dev_addr);
-	} else if (LOOPBACK(src_ip)) {
+	} else if (ipv4_is_loopback(src_ip)) {
 		ret = rdma_translate_ip((struct sockaddr *)dst_in, addr);
 		if (!ret)
 			memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN);
diff --git a/drivers/infiniband/core/agent.h b/drivers/infiniband/core/agent.h
index fb9ed14..6669287 100644
--- a/drivers/infiniband/core/agent.h
+++ b/drivers/infiniband/core/agent.h
@@ -32,8 +32,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: agent.h 1389 2004-12-27 22:56:47Z roland $
  */
 
 #ifndef __AGENT_H_
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index e85f701..6888356 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -31,8 +31,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: cache.c 1349 2004-12-16 21:09:43Z roland $
  */
 
 #include <linux/module.h>
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index dc3bc17..73774b4 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2006 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004-2007 Intel Corporation.  All rights reserved.
  * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
  * Copyright (c) 2004, 2005 Voltaire Corporation.  All rights reserved.
  * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
@@ -31,19 +31,20 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: cm.c 4311 2005-12-05 18:42:01Z sean.hefty $
  */
 
 #include <linux/completion.h>
 #include <linux/dma-mapping.h>
+#include <linux/device.h>
 #include <linux/err.h>
 #include <linux/idr.h>
 #include <linux/interrupt.h>
 #include <linux/random.h>
 #include <linux/rbtree.h>
 #include <linux/spinlock.h>
+#include <linux/sysfs.h>
 #include <linux/workqueue.h>
+#include <linux/kdev_t.h>
 
 #include <rdma/ib_cache.h>
 #include <rdma/ib_cm.h>
@@ -89,17 +90,94 @@ static struct ib_cm {
 	struct workqueue_struct *wq;
 } cm;
 
+/* Counter indexes ordered by attribute ID */
+enum {
+	CM_REQ_COUNTER,
+	CM_MRA_COUNTER,
+	CM_REJ_COUNTER,
+	CM_REP_COUNTER,
+	CM_RTU_COUNTER,
+	CM_DREQ_COUNTER,
+	CM_DREP_COUNTER,
+	CM_SIDR_REQ_COUNTER,
+	CM_SIDR_REP_COUNTER,
+	CM_LAP_COUNTER,
+	CM_APR_COUNTER,
+	CM_ATTR_COUNT,
+	CM_ATTR_ID_OFFSET = 0x0010,
+};
+
+enum {
+	CM_XMIT,
+	CM_XMIT_RETRIES,
+	CM_RECV,
+	CM_RECV_DUPLICATES,
+	CM_COUNTER_GROUPS
+};
+
+static char const counter_group_names[CM_COUNTER_GROUPS]
+				     [sizeof("cm_rx_duplicates")] = {
+	"cm_tx_msgs", "cm_tx_retries",
+	"cm_rx_msgs", "cm_rx_duplicates"
+};
+
+struct cm_counter_group {
+	struct kobject obj;
+	atomic_long_t counter[CM_ATTR_COUNT];
+};
+
+struct cm_counter_attribute {
+	struct attribute attr;
+	int index;
+};
+
+#define CM_COUNTER_ATTR(_name, _index) \
+struct cm_counter_attribute cm_##_name##_counter_attr = { \
+	.attr = { .name = __stringify(_name), .mode = 0444, .owner = THIS_MODULE }, \
+	.index = _index \
+}
+
+static CM_COUNTER_ATTR(req, CM_REQ_COUNTER);
+static CM_COUNTER_ATTR(mra, CM_MRA_COUNTER);
+static CM_COUNTER_ATTR(rej, CM_REJ_COUNTER);
+static CM_COUNTER_ATTR(rep, CM_REP_COUNTER);
+static CM_COUNTER_ATTR(rtu, CM_RTU_COUNTER);
+static CM_COUNTER_ATTR(dreq, CM_DREQ_COUNTER);
+static CM_COUNTER_ATTR(drep, CM_DREP_COUNTER);
+static CM_COUNTER_ATTR(sidr_req, CM_SIDR_REQ_COUNTER);
+static CM_COUNTER_ATTR(sidr_rep, CM_SIDR_REP_COUNTER);
+static CM_COUNTER_ATTR(lap, CM_LAP_COUNTER);
+static CM_COUNTER_ATTR(apr, CM_APR_COUNTER);
+
+static struct attribute *cm_counter_default_attrs[] = {
+	&cm_req_counter_attr.attr,
+	&cm_mra_counter_attr.attr,
+	&cm_rej_counter_attr.attr,
+	&cm_rep_counter_attr.attr,
+	&cm_rtu_counter_attr.attr,
+	&cm_dreq_counter_attr.attr,
+	&cm_drep_counter_attr.attr,
+	&cm_sidr_req_counter_attr.attr,
+	&cm_sidr_rep_counter_attr.attr,
+	&cm_lap_counter_attr.attr,
+	&cm_apr_counter_attr.attr,
+	NULL
+};
+
 struct cm_port {
 	struct cm_device *cm_dev;
 	struct ib_mad_agent *mad_agent;
+	struct kobject port_obj;
 	u8 port_num;
+	struct cm_counter_group counter_group[CM_COUNTER_GROUPS];
 };
 
 struct cm_device {
 	struct list_head list;
-	struct ib_device *device;
+	struct ib_device *ib_device;
+	struct device *device;
 	u8 ack_delay;
-	struct cm_port port[0];
+	struct cm_port *port[0];
 };
 
 struct cm_av {
@@ -273,7 +351,7 @@ static void cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc,
 {
 	av->port = port;
 	av->pkey_index = wc->pkey_index;
-	ib_init_ah_from_wc(port->cm_dev->device, port->port_num, wc,
+	ib_init_ah_from_wc(port->cm_dev->ib_device, port->port_num, wc,
 			   grh, &av->ah_attr);
 }
 
@@ -287,9 +365,9 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av)
 
 	read_lock_irqsave(&cm.device_lock, flags);
 	list_for_each_entry(cm_dev, &cm.device_list, list) {
-		if (!ib_find_cached_gid(cm_dev->device, &path->sgid,
+		if (!ib_find_cached_gid(cm_dev->ib_device, &path->sgid,
 					&p, NULL)) {
-			port = &cm_dev->port[p-1];
+			port = cm_dev->port[p-1];
 			break;
 		}
 	}
@@ -298,13 +376,13 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av)
 	if (!port)
 		return -EINVAL;
 
-	ret = ib_find_cached_pkey(cm_dev->device, port->port_num,
+	ret = ib_find_cached_pkey(cm_dev->ib_device, port->port_num,
 				  be16_to_cpu(path->pkey), &av->pkey_index);
 	if (ret)
 		return ret;
 
 	av->port = port;
-	ib_init_ah_from_path(cm_dev->device, port->port_num, path,
+	ib_init_ah_from_path(cm_dev->ib_device, port->port_num, path,
 			     &av->ah_attr);
 	av->timeout = path->packet_life_time + 1;
 	return 0;
@@ -325,7 +403,7 @@ static int cm_alloc_id(struct cm_id_private *cm_id_priv)
 		spin_unlock_irqrestore(&cm.lock, flags);
 	} while( (ret == -EAGAIN) && idr_pre_get(&cm.local_id_table, GFP_KERNEL) );
 
-	cm_id_priv->id.local_id = (__force __be32) (id ^ cm.random_id_operand);
+	cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand;
 	return ret;
 }
 
@@ -399,6 +477,31 @@ static int cm_compare_private_data(u8 *private_data,
 	return memcmp(src, dst_data->data, IB_CM_COMPARE_SIZE);
 }
 
+/*
+ * Trivial helpers to strip endian annotation and compare; the
+ * endianness doesn't actually matter since we just need a stable
+ * order for the RB tree.
+ */
+static int be32_lt(__be32 a, __be32 b)
+{
+	return (__force u32) a < (__force u32) b;
+}
+
+static int be32_gt(__be32 a, __be32 b)
+{
+	return (__force u32) a > (__force u32) b;
+}
+
+static int be64_lt(__be64 a, __be64 b)
+{
+	return (__force u64) a < (__force u64) b;
+}
+
+static int be64_gt(__be64 a, __be64 b)
+{
+	return (__force u64) a > (__force u64) b;
+}
+
 static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
 {
 	struct rb_node **link = &cm.listen_service_table.rb_node;
@@ -424,9 +527,9 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
 			link = &(*link)->rb_left;
 		else if (cm_id_priv->id.device > cur_cm_id_priv->id.device)
 			link = &(*link)->rb_right;
-		else if (service_id < cur_cm_id_priv->id.service_id)
+		else if (be64_lt(service_id, cur_cm_id_priv->id.service_id))
 			link = &(*link)->rb_left;
-		else if (service_id > cur_cm_id_priv->id.service_id)
+		else if (be64_gt(service_id, cur_cm_id_priv->id.service_id))
 			link = &(*link)->rb_right;
 		else if (data_cmp < 0)
 			link = &(*link)->rb_left;
@@ -459,9 +562,9 @@ static struct cm_id_private * cm_find_listen(struct ib_device *device,
 			node = node->rb_left;
 		else if (device > cm_id_priv->id.device)
 			node = node->rb_right;
-		else if (service_id < cm_id_priv->id.service_id)
+		else if (be64_lt(service_id, cm_id_priv->id.service_id))
 			node = node->rb_left;
-		else if (service_id > cm_id_priv->id.service_id)
+		else if (be64_gt(service_id, cm_id_priv->id.service_id))
 			node = node->rb_right;
 		else if (data_cmp < 0)
 			node = node->rb_left;
@@ -484,13 +587,13 @@ static struct cm_timewait_info * cm_insert_remote_id(struct cm_timewait_info
 		parent = *link;
 		cur_timewait_info = rb_entry(parent, struct cm_timewait_info,
 					     remote_id_node);
-		if (remote_id < cur_timewait_info->work.remote_id)
+		if (be32_lt(remote_id, cur_timewait_info->work.remote_id))
 			link = &(*link)->rb_left;
-		else if (remote_id > cur_timewait_info->work.remote_id)
+		else if (be32_gt(remote_id, cur_timewait_info->work.remote_id))
 			link = &(*link)->rb_right;
-		else if (remote_ca_guid < cur_timewait_info->remote_ca_guid)
+		else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
 			link = &(*link)->rb_left;
-		else if (remote_ca_guid > cur_timewait_info->remote_ca_guid)
+		else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
 			link = &(*link)->rb_right;
 		else
 			return cur_timewait_info;
@@ -510,13 +613,13 @@ static struct cm_timewait_info * cm_find_remote_id(__be64 remote_ca_guid,
 	while (node) {
 		timewait_info = rb_entry(node, struct cm_timewait_info,
 					 remote_id_node);
-		if (remote_id < timewait_info->work.remote_id)
+		if (be32_lt(remote_id, timewait_info->work.remote_id))
 			node = node->rb_left;
-		else if (remote_id > timewait_info->work.remote_id)
+		else if (be32_gt(remote_id, timewait_info->work.remote_id))
 			node = node->rb_right;
-		else if (remote_ca_guid < timewait_info->remote_ca_guid)
+		else if (be64_lt(remote_ca_guid, timewait_info->remote_ca_guid))
 			node = node->rb_left;
-		else if (remote_ca_guid > timewait_info->remote_ca_guid)
+		else if (be64_gt(remote_ca_guid, timewait_info->remote_ca_guid))
 			node = node->rb_right;
 		else
 			return timewait_info;
@@ -537,13 +640,13 @@ static struct cm_timewait_info * cm_insert_remote_qpn(struct cm_timewait_info
 		parent = *link;
 		cur_timewait_info = rb_entry(parent, struct cm_timewait_info,
 					     remote_qp_node);
-		if (remote_qpn < cur_timewait_info->remote_qpn)
+		if (be32_lt(remote_qpn, cur_timewait_info->remote_qpn))
 			link = &(*link)->rb_left;
-		else if (remote_qpn > cur_timewait_info->remote_qpn)
+		else if (be32_gt(remote_qpn, cur_timewait_info->remote_qpn))
 			link = &(*link)->rb_right;
-		else if (remote_ca_guid < cur_timewait_info->remote_ca_guid)
+		else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
 			link = &(*link)->rb_left;
-		else if (remote_ca_guid > cur_timewait_info->remote_ca_guid)
+		else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
 			link = &(*link)->rb_right;
 		else
 			return cur_timewait_info;
@@ -567,9 +670,9 @@ static struct cm_id_private * cm_insert_remote_sidr(struct cm_id_private
 		parent = *link;
 		cur_cm_id_priv = rb_entry(parent, struct cm_id_private,
 					  sidr_id_node);
-		if (remote_id < cur_cm_id_priv->id.remote_id)
+		if (be32_lt(remote_id, cur_cm_id_priv->id.remote_id))
 			link = &(*link)->rb_left;
-		else if (remote_id > cur_cm_id_priv->id.remote_id)
+		else if (be32_gt(remote_id, cur_cm_id_priv->id.remote_id))
 			link = &(*link)->rb_right;
 		else {
 			int cmp;
@@ -906,6 +1009,9 @@ static void cm_format_req(struct cm_req_msg *req_msg,
 			  struct cm_id_private *cm_id_priv,
 			  struct ib_cm_req_param *param)
 {
+	struct ib_sa_path_rec *pri_path = param->primary_path;
+	struct ib_sa_path_rec *alt_path = param->alternate_path;
+
 	cm_format_mad_hdr(&req_msg->hdr, CM_REQ_ATTR_ID,
 			  cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_REQ));
 
@@ -941,35 +1047,46 @@ static void cm_format_req(struct cm_req_msg *req_msg,
 	cm_req_set_max_cm_retries(req_msg, param->max_cm_retries);
 	cm_req_set_srq(req_msg, param->srq);
 
-	req_msg->primary_local_lid = param->primary_path->slid;
-	req_msg->primary_remote_lid = param->primary_path->dlid;
-	req_msg->primary_local_gid = param->primary_path->sgid;
-	req_msg->primary_remote_gid = param->primary_path->dgid;
-	cm_req_set_primary_flow_label(req_msg, param->primary_path->flow_label);
-	cm_req_set_primary_packet_rate(req_msg, param->primary_path->rate);
-	req_msg->primary_traffic_class = param->primary_path->traffic_class;
-	req_msg->primary_hop_limit = param->primary_path->hop_limit;
-	cm_req_set_primary_sl(req_msg, param->primary_path->sl);
-	cm_req_set_primary_subnet_local(req_msg, 1); /* local only... */
+	if (pri_path->hop_limit <= 1) {
+		req_msg->primary_local_lid = pri_path->slid;
+		req_msg->primary_remote_lid = pri_path->dlid;
+	} else {
+		/* Work-around until there's a way to obtain remote LID info */
+		req_msg->primary_local_lid = IB_LID_PERMISSIVE;
+		req_msg->primary_remote_lid = IB_LID_PERMISSIVE;
+	}
+	req_msg->primary_local_gid = pri_path->sgid;
+	req_msg->primary_remote_gid = pri_path->dgid;
+	cm_req_set_primary_flow_label(req_msg, pri_path->flow_label);
+	cm_req_set_primary_packet_rate(req_msg, pri_path->rate);
+	req_msg->primary_traffic_class = pri_path->traffic_class;
+	req_msg->primary_hop_limit = pri_path->hop_limit;
+	cm_req_set_primary_sl(req_msg, pri_path->sl);
+	cm_req_set_primary_subnet_local(req_msg, (pri_path->hop_limit <= 1));
 	cm_req_set_primary_local_ack_timeout(req_msg,
 		cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
-			       param->primary_path->packet_life_time));
+			       pri_path->packet_life_time));
 
-	if (param->alternate_path) {
-		req_msg->alt_local_lid = param->alternate_path->slid;
-		req_msg->alt_remote_lid = param->alternate_path->dlid;
-		req_msg->alt_local_gid = param->alternate_path->sgid;
-		req_msg->alt_remote_gid = param->alternate_path->dgid;
+	if (alt_path) {
+		if (alt_path->hop_limit <= 1) {
+			req_msg->alt_local_lid = alt_path->slid;
+			req_msg->alt_remote_lid = alt_path->dlid;
+		} else {
+			req_msg->alt_local_lid = IB_LID_PERMISSIVE;
+			req_msg->alt_remote_lid = IB_LID_PERMISSIVE;
+		}
+		req_msg->alt_local_gid = alt_path->sgid;
+		req_msg->alt_remote_gid = alt_path->dgid;
 		cm_req_set_alt_flow_label(req_msg,
-					  param->alternate_path->flow_label);
-		cm_req_set_alt_packet_rate(req_msg, param->alternate_path->rate);
-		req_msg->alt_traffic_class = param->alternate_path->traffic_class;
-		req_msg->alt_hop_limit = param->alternate_path->hop_limit;
-		cm_req_set_alt_sl(req_msg, param->alternate_path->sl);
-		cm_req_set_alt_subnet_local(req_msg, 1); /* local only... */
+					  alt_path->flow_label);
+		cm_req_set_alt_packet_rate(req_msg, alt_path->rate);
+		req_msg->alt_traffic_class = alt_path->traffic_class;
+		req_msg->alt_hop_limit = alt_path->hop_limit;
+		cm_req_set_alt_sl(req_msg, alt_path->sl);
+		cm_req_set_alt_subnet_local(req_msg, (alt_path->hop_limit <= 1));
 		cm_req_set_alt_local_ack_timeout(req_msg,
 			cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
-				       param->alternate_path->packet_life_time));
+				       alt_path->packet_life_time));
 	}
 
 	if (param->private_data && param->private_data_len)
@@ -1298,6 +1415,9 @@ static void cm_dup_req_handler(struct cm_work *work,
 	struct ib_mad_send_buf *msg = NULL;
 	int ret;
 
+	atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+			counter[CM_REQ_COUNTER]);
+
 	/* Quick state check to discard duplicate REQs. */
 	if (cm_id_priv->id.state == IB_CM_REQ_RCVD)
 		return;
@@ -1387,6 +1507,34 @@ out:
 	return listen_cm_id_priv;
 }
 
+/*
+ * Work-around for inter-subnet connections.  If the LIDs are permissive,
+ * we need to override the LID/SL data in the REQ with the LID information
+ * in the work completion.
+ */
+static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc)
+{
+	if (!cm_req_get_primary_subnet_local(req_msg)) {
+		if (req_msg->primary_local_lid == IB_LID_PERMISSIVE) {
+			req_msg->primary_local_lid = cpu_to_be16(wc->slid);
+			cm_req_set_primary_sl(req_msg, wc->sl);
+		}
+
+		if (req_msg->primary_remote_lid == IB_LID_PERMISSIVE)
+			req_msg->primary_remote_lid = cpu_to_be16(wc->dlid_path_bits);
+	}
+
+	if (!cm_req_get_alt_subnet_local(req_msg)) {
+		if (req_msg->alt_local_lid == IB_LID_PERMISSIVE) {
+			req_msg->alt_local_lid = cpu_to_be16(wc->slid);
+			cm_req_set_alt_sl(req_msg, wc->sl);
+		}
+
+		if (req_msg->alt_remote_lid == IB_LID_PERMISSIVE)
+			req_msg->alt_remote_lid = cpu_to_be16(wc->dlid_path_bits);
+	}
+}
+
 static int cm_req_handler(struct cm_work *work)
 {
 	struct ib_cm_id *cm_id;
@@ -1396,7 +1544,7 @@ static int cm_req_handler(struct cm_work *work)
 
 	req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
 
-	cm_id = ib_create_cm_id(work->port->cm_dev->device, NULL, NULL);
+	cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL);
 	if (IS_ERR(cm_id))
 		return PTR_ERR(cm_id);
 
@@ -1427,10 +1575,11 @@ static int cm_req_handler(struct cm_work *work)
 	cm_id_priv->id.service_id = req_msg->service_id;
 	cm_id_priv->id.service_mask = __constant_cpu_to_be64(~0ULL);
 
+	cm_process_routed_req(req_msg, work->mad_recv_wc->wc);
 	cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]);
 	ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av);
 	if (ret) {
-		ib_get_cached_gid(work->port->cm_dev->device,
+		ib_get_cached_gid(work->port->cm_dev->ib_device,
 				  work->port->port_num, 0, &work->path[0].sgid);
 		ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID,
 			       &work->path[0].sgid, sizeof work->path[0].sgid,
@@ -1651,6 +1800,8 @@ static void cm_dup_rep_handler(struct cm_work *work)
 	if (!cm_id_priv)
 		return;
 
+	atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+			counter[CM_REP_COUNTER]);
 	ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg);
 	if (ret)
 		goto deref;
@@ -1816,6 +1967,8 @@ static int cm_rtu_handler(struct cm_work *work)
 	if (cm_id_priv->id.state != IB_CM_REP_SENT &&
 	    cm_id_priv->id.state != IB_CM_MRA_REP_RCVD) {
 		spin_unlock_irq(&cm_id_priv->lock);
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_RTU_COUNTER]);
 		goto out;
 	}
 	cm_id_priv->id.state = IB_CM_ESTABLISHED;
@@ -1993,6 +2146,8 @@ static int cm_dreq_handler(struct cm_work *work)
 	cm_id_priv = cm_acquire_id(dreq_msg->remote_comm_id,
 				   dreq_msg->local_comm_id);
 	if (!cm_id_priv) {
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_DREQ_COUNTER]);
 		cm_issue_drep(work->port, work->mad_recv_wc);
 		return -EINVAL;
 	}
@@ -2012,6 +2167,8 @@ static int cm_dreq_handler(struct cm_work *work)
 	case IB_CM_MRA_REP_RCVD:
 		break;
 	case IB_CM_TIMEWAIT:
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_DREQ_COUNTER]);
 		if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg))
 			goto unlock;
 
@@ -2023,6 +2180,10 @@ static int cm_dreq_handler(struct cm_work *work)
 		if (ib_post_send_mad(msg, NULL))
 			cm_free_msg(msg);
 		goto deref;
+	case IB_CM_DREQ_RCVD:
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_DREQ_COUNTER]);
+		goto unlock;
 	default:
 		goto unlock;
 	}
@@ -2380,10 +2541,20 @@ static int cm_mra_handler(struct cm_work *work)
 		if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_OTHER ||
 		    cm_id_priv->id.lap_state != IB_CM_LAP_SENT ||
 		    ib_modify_mad(cm_id_priv->av.port->mad_agent,
-				  cm_id_priv->msg, timeout))
+				  cm_id_priv->msg, timeout)) {
+			if (cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD)
+				atomic_long_inc(&work->port->
+						counter_group[CM_RECV_DUPLICATES].
+						counter[CM_MRA_COUNTER]);
 			goto out;
+		}
 		cm_id_priv->id.lap_state = IB_CM_MRA_LAP_RCVD;
 		break;
+	case IB_CM_MRA_REQ_RCVD:
+	case IB_CM_MRA_REP_RCVD:
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_MRA_COUNTER]);
+		/* fall through */
 	default:
 		goto out;
 	}
@@ -2543,6 +2714,8 @@ static int cm_lap_handler(struct cm_work *work)
 	case IB_CM_LAP_IDLE:
 		break;
 	case IB_CM_MRA_LAP_SENT:
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_LAP_COUNTER]);
 		if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg))
 			goto unlock;
 
@@ -2556,6 +2729,10 @@ static int cm_lap_handler(struct cm_work *work)
 		if (ib_post_send_mad(msg, NULL))
 			cm_free_msg(msg);
 		goto deref;
+	case IB_CM_LAP_RCVD:
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_LAP_COUNTER]);
+		goto unlock;
 	default:
 		goto unlock;
 	}
@@ -2736,7 +2913,7 @@ static void cm_format_sidr_req(struct cm_sidr_req_msg *sidr_req_msg,
 	cm_format_mad_hdr(&sidr_req_msg->hdr, CM_SIDR_REQ_ATTR_ID,
 			  cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_SIDR));
 	sidr_req_msg->request_id = cm_id_priv->id.local_id;
-	sidr_req_msg->pkey = cpu_to_be16(param->path->pkey);
+	sidr_req_msg->pkey = param->path->pkey;
 	sidr_req_msg->service_id = param->service_id;
 
 	if (param->private_data && param->private_data_len)
@@ -2821,7 +2998,7 @@ static int cm_sidr_req_handler(struct cm_work *work)
 	struct cm_sidr_req_msg *sidr_req_msg;
 	struct ib_wc *wc;
 
-	cm_id = ib_create_cm_id(work->port->cm_dev->device, NULL, NULL);
+	cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL);
 	if (IS_ERR(cm_id))
 		return PTR_ERR(cm_id);
 	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
@@ -2843,6 +3020,8 @@ static int cm_sidr_req_handler(struct cm_work *work)
 	cur_cm_id_priv = cm_insert_remote_sidr(cm_id_priv);
 	if (cur_cm_id_priv) {
 		spin_unlock_irq(&cm.lock);
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_SIDR_REQ_COUNTER]);
 		goto out; /* Duplicate message. */
 	}
 	cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD;
@@ -3037,6 +3216,27 @@ static void cm_send_handler(struct ib_mad_agent *mad_agent,
 			    struct ib_mad_send_wc *mad_send_wc)
 {
 	struct ib_mad_send_buf *msg = mad_send_wc->send_buf;
+	struct cm_port *port;
+	u16 attr_index;
+
+	port = mad_agent->context;
+	attr_index = be16_to_cpu(((struct ib_mad_hdr *)
+				  msg->mad)->attr_id) - CM_ATTR_ID_OFFSET;
+
+	/*
+	 * If the send was in response to a received message (context[0] is not
+	 * set to a cm_id), and is not a REJ, then it is a send that was
+	 * manually retried.
+	 */
+	if (!msg->context[0] && (attr_index != CM_REJ_COUNTER))
+		msg->retries = 1;
+
+	atomic_long_add(1 + msg->retries,
+			&port->counter_group[CM_XMIT].counter[attr_index]);
+	if (msg->retries)
+		atomic_long_add(msg->retries,
+				&port->counter_group[CM_XMIT_RETRIES].
+				counter[attr_index]);
 
 	switch (mad_send_wc->status) {
 	case IB_WC_SUCCESS:
@@ -3195,8 +3395,10 @@ EXPORT_SYMBOL(ib_cm_notify);
 static void cm_recv_handler(struct ib_mad_agent *mad_agent,
 			    struct ib_mad_recv_wc *mad_recv_wc)
 {
+	struct cm_port *port = mad_agent->context;
 	struct cm_work *work;
 	enum ib_cm_event_type event;
+	u16 attr_id;
 	int paths = 0;
 
 	switch (mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) {
@@ -3241,6 +3443,10 @@ static void cm_recv_handler(struct ib_mad_agent *mad_agent,
 		return;
 	}
 
+	attr_id = be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.attr_id);
+	atomic_long_inc(&port->counter_group[CM_RECV].
+			counter[attr_id - CM_ATTR_ID_OFFSET]);
+
 	work = kmalloc(sizeof *work + sizeof(struct ib_sa_path_rec) * paths,
 		       GFP_KERNEL);
 	if (!work) {
@@ -3251,7 +3457,7 @@ static void cm_recv_handler(struct ib_mad_agent *mad_agent,
 	INIT_DELAYED_WORK(&work->work, cm_work_handler);
 	work->cm_event.event = event;
 	work->mad_recv_wc = mad_recv_wc;
-	work->port = (struct cm_port *)mad_agent->context;
+	work->port = port;
 	queue_delayed_work(cm.wq, &work->work, 0);
 }
 
@@ -3420,13 +3626,93 @@ static void cm_get_ack_delay(struct cm_device *cm_dev)
 {
 	struct ib_device_attr attr;
 
-	if (ib_query_device(cm_dev->device, &attr))
+	if (ib_query_device(cm_dev->ib_device, &attr))
 		cm_dev->ack_delay = 0; /* acks will rely on packet life time */
 	else
 		cm_dev->ack_delay = attr.local_ca_ack_delay;
 }
 
-static void cm_add_one(struct ib_device *device)
+static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr,
+			       char *buf)
+{
+	struct cm_counter_group *group;
+	struct cm_counter_attribute *cm_attr;
+
+	group = container_of(obj, struct cm_counter_group, obj);
+	cm_attr = container_of(attr, struct cm_counter_attribute, attr);
+
+	return sprintf(buf, "%ld\n",
+		       atomic_long_read(&group->counter[cm_attr->index]));
+}
+
+static struct sysfs_ops cm_counter_ops = {
+	.show = cm_show_counter
+};
+
+static struct kobj_type cm_counter_obj_type = {
+	.sysfs_ops = &cm_counter_ops,
+	.default_attrs = cm_counter_default_attrs
+};
+
+static void cm_release_port_obj(struct kobject *obj)
+{
+	struct cm_port *cm_port;
+
+	cm_port = container_of(obj, struct cm_port, port_obj);
+	kfree(cm_port);
+}
+
+static struct kobj_type cm_port_obj_type = {
+	.release = cm_release_port_obj
+};
+
+struct class cm_class = {
+	.name    = "infiniband_cm",
+};
+EXPORT_SYMBOL(cm_class);
+
+static int cm_create_port_fs(struct cm_port *port)
+{
+	int i, ret;
+
+	ret = kobject_init_and_add(&port->port_obj, &cm_port_obj_type,
+				   &port->cm_dev->device->kobj,
+				   "%d", port->port_num);
+	if (ret) {
+		kfree(port);
+		return ret;
+	}
+
+	for (i = 0; i < CM_COUNTER_GROUPS; i++) {
+		ret = kobject_init_and_add(&port->counter_group[i].obj,
+					   &cm_counter_obj_type,
+					   &port->port_obj,
+					   "%s", counter_group_names[i]);
+		if (ret)
+			goto error;
+	}
+
+	return 0;
+
+error:
+	while (i--)
+		kobject_unregister(&port->counter_group[i].obj);
+	kobject_unregister(&port->port_obj);
+	return ret;
+
+}
+
+static void cm_remove_port_fs(struct cm_port *port)
+{
+	int i;
+
+	for (i = 0; i < CM_COUNTER_GROUPS; i++)
+		kobject_unregister(&port->counter_group[i].obj);
+
+	kobject_unregister(&port->port_obj);
+}
+
+static void cm_add_one(struct ib_device *ib_device)
 {
 	struct cm_device *cm_dev;
 	struct cm_port *port;
@@ -3441,23 +3727,40 @@ static void cm_add_one(struct ib_device *device)
 	int ret;
 	u8 i;
 
-	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+	if (rdma_node_get_transport(ib_device->node_type) != RDMA_TRANSPORT_IB)
 		return;
 
-	cm_dev = kmalloc(sizeof(*cm_dev) + sizeof(*port) *
-			 device->phys_port_cnt, GFP_KERNEL);
+	cm_dev = kzalloc(sizeof(*cm_dev) + sizeof(*port) *
+			 ib_device->phys_port_cnt, GFP_KERNEL);
 	if (!cm_dev)
 		return;
 
-	cm_dev->device = device;
+	cm_dev->ib_device = ib_device;
 	cm_get_ack_delay(cm_dev);
 
+	cm_dev->device = device_create(&cm_class, ib_device->class_dev.dev,
+					       MKDEV(0, 0),
+					       "%s", ib_device->name);
+	if (!cm_dev->device) {
+		kfree(cm_dev);
+		return;
+	}
+
 	set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask);
-	for (i = 1; i <= device->phys_port_cnt; i++) {
-		port = &cm_dev->port[i-1];
+	for (i = 1; i <= ib_device->phys_port_cnt; i++) {
+		port = kzalloc(sizeof *port, GFP_KERNEL);
+		if (!port)
+			goto error1;
+
+		cm_dev->port[i-1] = port;
 		port->cm_dev = cm_dev;
 		port->port_num = i;
-		port->mad_agent = ib_register_mad_agent(device, i,
+
+		ret = cm_create_port_fs(port);
+		if (ret)
+			goto error1;
+
+		port->mad_agent = ib_register_mad_agent(ib_device, i,
 							IB_QPT_GSI,
 							&reg_req,
 							0,
@@ -3465,33 +3768,37 @@ static void cm_add_one(struct ib_device *device)
 							cm_recv_handler,
 							port);
 		if (IS_ERR(port->mad_agent))
-			goto error1;
+			goto error2;
 
-		ret = ib_modify_port(device, i, 0, &port_modify);
+		ret = ib_modify_port(ib_device, i, 0, &port_modify);
 		if (ret)
-			goto error2;
+			goto error3;
 	}
-	ib_set_client_data(device, &cm_client, cm_dev);
+	ib_set_client_data(ib_device, &cm_client, cm_dev);
 
 	write_lock_irqsave(&cm.device_lock, flags);
 	list_add_tail(&cm_dev->list, &cm.device_list);
 	write_unlock_irqrestore(&cm.device_lock, flags);
 	return;
 
-error2:
+error3:
 	ib_unregister_mad_agent(port->mad_agent);
+error2:
+	cm_remove_port_fs(port);
 error1:
 	port_modify.set_port_cap_mask = 0;
 	port_modify.clr_port_cap_mask = IB_PORT_CM_SUP;
 	while (--i) {
-		port = &cm_dev->port[i-1];
-		ib_modify_port(device, port->port_num, 0, &port_modify);
+		port = cm_dev->port[i-1];
+		ib_modify_port(ib_device, port->port_num, 0, &port_modify);
 		ib_unregister_mad_agent(port->mad_agent);
+		cm_remove_port_fs(port);
 	}
+	device_unregister(cm_dev->device);
 	kfree(cm_dev);
 }
 
-static void cm_remove_one(struct ib_device *device)
+static void cm_remove_one(struct ib_device *ib_device)
 {
 	struct cm_device *cm_dev;
 	struct cm_port *port;
@@ -3501,7 +3808,7 @@ static void cm_remove_one(struct ib_device *device)
 	unsigned long flags;
 	int i;
 
-	cm_dev = ib_get_client_data(device, &cm_client);
+	cm_dev = ib_get_client_data(ib_device, &cm_client);
 	if (!cm_dev)
 		return;
 
@@ -3509,12 +3816,14 @@ static void cm_remove_one(struct ib_device *device)
 	list_del(&cm_dev->list);
 	write_unlock_irqrestore(&cm.device_lock, flags);
 
-	for (i = 1; i <= device->phys_port_cnt; i++) {
-		port = &cm_dev->port[i-1];
-		ib_modify_port(device, port->port_num, 0, &port_modify);
+	for (i = 1; i <= ib_device->phys_port_cnt; i++) {
+		port = cm_dev->port[i-1];
+		ib_modify_port(ib_device, port->port_num, 0, &port_modify);
 		ib_unregister_mad_agent(port->mad_agent);
 		flush_workqueue(cm.wq);
+		cm_remove_port_fs(port);
 	}
+	device_unregister(cm_dev->device);
 	kfree(cm_dev);
 }
 
@@ -3536,17 +3845,25 @@ static int __init ib_cm_init(void)
 	idr_pre_get(&cm.local_id_table, GFP_KERNEL);
 	INIT_LIST_HEAD(&cm.timewait_list);
 
-	cm.wq = create_workqueue("ib_cm");
-	if (!cm.wq)
+	ret = class_register(&cm_class);
+	if (ret)
 		return -ENOMEM;
 
+	cm.wq = create_workqueue("ib_cm");
+	if (!cm.wq) {
+		ret = -ENOMEM;
+		goto error1;
+	}
+
 	ret = ib_register_client(&cm_client);
 	if (ret)
-		goto error;
+		goto error2;
 
 	return 0;
-error:
+error2:
 	destroy_workqueue(cm.wq);
+error1:
+	class_unregister(&cm_class);
 	return ret;
 }
 
@@ -3567,6 +3884,7 @@ static void __exit ib_cm_cleanup(void)
 		kfree(timewait_info);
 	}
 
+	class_unregister(&cm_class);
 	idr_destroy(&cm.local_id_table);
 }
 
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 19f75d2..faef87b 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -4,29 +4,33 @@
  * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
  * Copyright (c) 2005-2006 Intel Corporation.  All rights reserved.
  *
- * This Software is licensed under one of the following licenses:
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
  *
- * 1) under the terms of the "Common Public License 1.0" a copy of which is
- *    available from the Open Source Initiative, see
- *    http://www.opensource.org/licenses/cpl.php.
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
  *
- * 2) under the terms of the "The BSD License" a copy of which is
- *    available from the Open Source Initiative, see
- *    http://www.opensource.org/licenses/bsd-license.php.
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
  *
- * 3) under the terms of the "GNU General Public License (GPL) Version 2" a
- *    copy of which is available from the Open Source Initiative, see
- *    http://www.opensource.org/licenses/gpl-license.php.
- *
- * Licensee has the right to choose one of the above licenses.
- *
- * Redistributions of source code must retain the above copyright
- * notice and one of the license notices.
- *
- * Redistributions in binary form must reproduce both the above copyright
- * notice, one of the license notices in the documentation
- * and/or other materials provided with the distribution.
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
  *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
 #include <linux/completion.h>
@@ -54,6 +58,11 @@ static int tavor_quirk = 0;
 module_param_named(tavor_quirk, tavor_quirk, int, 0644);
 MODULE_PARM_DESC(tavor_quirk, "Tavor performance quirk: limit MTU to 1K if > 0");
 
+int unify_tcp_port_space = 0;
+module_param(unify_tcp_port_space, int, 0644);
+MODULE_PARM_DESC(unify_tcp_port_space, "Unify the host TCP and RDMA port "
+		 "space allocation (default=0)");
+
 #define CMA_CM_RESPONSE_TIMEOUT 20
 #define CMA_MAX_CM_RETRIES 15
 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
@@ -121,6 +130,7 @@ struct rdma_id_private {
 	struct rdma_cm_id	id;
 
 	struct rdma_bind_list	*bind_list;
+	struct socket		*sock;
 	struct hlist_node	node;
 	struct list_head	list; /* listen_any_list or cma_device.list */
 	struct list_head	listen_list; /* per device listens */
@@ -134,8 +144,7 @@ struct rdma_id_private {
 
 	struct completion	comp;
 	atomic_t		refcount;
-	wait_queue_head_t	wait_remove;
-	atomic_t		dev_remove;
+	struct mutex		handler_mutex;
 
 	int			backlog;
 	int			timeout_ms;
@@ -160,9 +169,7 @@ struct cma_multicast {
 	} multicast;
 	struct list_head	list;
 	void			*context;
-	struct sockaddr		addr;
-	u8			pad[sizeof(struct sockaddr_in6) -
-				    sizeof(struct sockaddr)];
+	struct sockaddr_storage	addr;
 };
 
 struct cma_work {
@@ -173,18 +180,24 @@ struct cma_work {
 	struct rdma_cm_event	event;
 };
 
+struct cma_ndev_work {
+	struct work_struct	work;
+	struct rdma_id_private	*id;
+	struct rdma_cm_event	event;
+};
+
 union cma_ip_addr {
 	struct in6_addr ip6;
 	struct {
-		__u32 pad[3];
-		__u32 addr;
+		__be32 pad[3];
+		__be32 addr;
 	} ip4;
 };
 
 struct cma_hdr {
 	u8 cma_version;
 	u8 ip_version;	/* IP version: 7:4 */
-	__u16 port;
+	__be16 port;
 	union cma_ip_addr src_addr;
 	union cma_ip_addr dst_addr;
 };
@@ -194,8 +207,8 @@ struct sdp_hh {
 	u8 sdp_version; /* Major version: 7:4 */
 	u8 ip_version;	/* IP version: 7:4 */
 	u8 sdp_specific1[10];
-	__u16 port;
-	__u16 sdp_specific2;
+	__be16 port;
+	__be16 sdp_specific2;
 	union cma_ip_addr src_addr;
 	union cma_ip_addr dst_addr;
 };
@@ -297,21 +310,25 @@ static void cma_detach_from_dev(struct rdma_id_private *id_priv)
 	id_priv->cma_dev = NULL;
 }
 
-static int cma_set_qkey(struct ib_device *device, u8 port_num,
-			enum rdma_port_space ps,
-			struct rdma_dev_addr *dev_addr, u32 *qkey)
+static int cma_set_qkey(struct rdma_id_private *id_priv)
 {
 	struct ib_sa_mcmember_rec rec;
 	int ret = 0;
 
-	switch (ps) {
+	if (id_priv->qkey)
+		return 0;
+
+	switch (id_priv->id.ps) {
 	case RDMA_PS_UDP:
-		*qkey = RDMA_UDP_QKEY;
+		id_priv->qkey = RDMA_UDP_QKEY;
 		break;
 	case RDMA_PS_IPOIB:
-		ib_addr_get_mgid(dev_addr, &rec.mgid);
-		ret = ib_sa_get_mcmember_rec(device, port_num, &rec.mgid, &rec);
-		*qkey = be32_to_cpu(rec.qkey);
+		ib_addr_get_mgid(&id_priv->id.route.addr.dev_addr, &rec.mgid);
+		ret = ib_sa_get_mcmember_rec(id_priv->id.device,
+					     id_priv->id.port_num, &rec.mgid,
+					     &rec);
+		if (!ret)
+			id_priv->qkey = be32_to_cpu(rec.qkey);
 		break;
 	default:
 		break;
@@ -341,12 +358,7 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv)
 		ret = ib_find_cached_gid(cma_dev->device, &gid,
 					 &id_priv->id.port_num, NULL);
 		if (!ret) {
-			ret = cma_set_qkey(cma_dev->device,
-					   id_priv->id.port_num,
-					   id_priv->id.ps, dev_addr,
-					   &id_priv->qkey);
-			if (!ret)
-				cma_attach_to_dev(id_priv, cma_dev);
+			cma_attach_to_dev(id_priv, cma_dev);
 			break;
 		}
 	}
@@ -359,26 +371,15 @@ static void cma_deref_id(struct rdma_id_private *id_priv)
 		complete(&id_priv->comp);
 }
 
-static int cma_disable_remove(struct rdma_id_private *id_priv,
+static int cma_disable_callback(struct rdma_id_private *id_priv,
 			      enum cma_state state)
 {
-	unsigned long flags;
-	int ret;
-
-	spin_lock_irqsave(&id_priv->lock, flags);
-	if (id_priv->state == state) {
-		atomic_inc(&id_priv->dev_remove);
-		ret = 0;
-	} else
-		ret = -EINVAL;
-	spin_unlock_irqrestore(&id_priv->lock, flags);
-	return ret;
-}
-
-static void cma_enable_remove(struct rdma_id_private *id_priv)
-{
-	if (atomic_dec_and_test(&id_priv->dev_remove))
-		wake_up(&id_priv->wait_remove);
+	mutex_lock(&id_priv->handler_mutex);
+	if (id_priv->state != state) {
+		mutex_unlock(&id_priv->handler_mutex);
+		return -EINVAL;
+	}
+	return 0;
 }
 
 static int cma_has_cm_dev(struct rdma_id_private *id_priv)
@@ -403,8 +404,7 @@ struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
 	mutex_init(&id_priv->qp_mutex);
 	init_completion(&id_priv->comp);
 	atomic_set(&id_priv->refcount, 1);
-	init_waitqueue_head(&id_priv->wait_remove);
-	atomic_set(&id_priv->dev_remove, 0);
+	mutex_init(&id_priv->handler_mutex);
 	INIT_LIST_HEAD(&id_priv->listen_list);
 	INIT_LIST_HEAD(&id_priv->mc_list);
 	get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num);
@@ -496,7 +496,8 @@ void rdma_destroy_qp(struct rdma_cm_id *id)
 }
 EXPORT_SYMBOL(rdma_destroy_qp);
 
-static int cma_modify_qp_rtr(struct rdma_id_private *id_priv)
+static int cma_modify_qp_rtr(struct rdma_id_private *id_priv,
+			     struct rdma_conn_param *conn_param)
 {
 	struct ib_qp_attr qp_attr;
 	int qp_attr_mask, ret;
@@ -522,13 +523,16 @@ static int cma_modify_qp_rtr(struct rdma_id_private *id_priv)
 	if (ret)
 		goto out;
 
+	if (conn_param)
+		qp_attr.max_dest_rd_atomic = conn_param->responder_resources;
 	ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
 out:
 	mutex_unlock(&id_priv->qp_mutex);
 	return ret;
 }
 
-static int cma_modify_qp_rts(struct rdma_id_private *id_priv)
+static int cma_modify_qp_rts(struct rdma_id_private *id_priv,
+			     struct rdma_conn_param *conn_param)
 {
 	struct ib_qp_attr qp_attr;
 	int qp_attr_mask, ret;
@@ -544,6 +548,8 @@ static int cma_modify_qp_rts(struct rdma_id_private *id_priv)
 	if (ret)
 		goto out;
 
+	if (conn_param)
+		qp_attr.max_rd_atomic = conn_param->initiator_depth;
 	ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
 out:
 	mutex_unlock(&id_priv->qp_mutex);
@@ -584,6 +590,10 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv,
 	*qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT;
 
 	if (cma_is_ud_ps(id_priv->id.ps)) {
+		ret = cma_set_qkey(id_priv);
+		if (ret)
+			return ret;
+
 		qp_attr->qkey = id_priv->qkey;
 		*qp_attr_mask |= IB_QP_QKEY;
 	} else {
@@ -632,7 +642,8 @@ static inline int cma_zero_addr(struct sockaddr *addr)
 	struct in6_addr *ip6;
 
 	if (addr->sa_family == AF_INET)
-		return ZERONET(((struct sockaddr_in *) addr)->sin_addr.s_addr);
+		return ipv4_is_zeronet(
+			((struct sockaddr_in *)addr)->sin_addr.s_addr);
 	else {
 		ip6 = &((struct sockaddr_in6 *) addr)->sin6_addr;
 		return (ip6->s6_addr32[0] | ip6->s6_addr32[1] |
@@ -642,7 +653,7 @@ static inline int cma_zero_addr(struct sockaddr *addr)
 
 static inline int cma_loopback_addr(struct sockaddr *addr)
 {
-	return LOOPBACK(((struct sockaddr_in *) addr)->sin_addr.s_addr);
+	return ipv4_is_loopback(((struct sockaddr_in *) addr)->sin_addr.s_addr);
 }
 
 static inline int cma_any_addr(struct sockaddr *addr)
@@ -664,7 +675,7 @@ static inline int cma_any_port(struct sockaddr *addr)
 }
 
 static int cma_get_net_info(void *hdr, enum rdma_port_space ps,
-			    u8 *ip_ver, __u16 *port,
+			    u8 *ip_ver, __be16 *port,
 			    union cma_ip_addr **src, union cma_ip_addr **dst)
 {
 	switch (ps) {
@@ -696,7 +707,7 @@ static int cma_get_net_info(void *hdr, enum rdma_port_space ps,
 
 static void cma_save_net_info(struct rdma_addr *addr,
 			      struct rdma_addr *listen_addr,
-			      u8 ip_ver, __u16 port,
+			      u8 ip_ver, __be16 port,
 			      union cma_ip_addr *src, union cma_ip_addr *dst)
 {
 	struct sockaddr_in *listen4, *ip4;
@@ -790,8 +801,8 @@ static void cma_cancel_operation(struct rdma_id_private *id_priv,
 		cma_cancel_route(id_priv);
 		break;
 	case CMA_LISTEN:
-		if (cma_any_addr(&id_priv->id.route.addr.src_addr) &&
-		    !id_priv->cma_dev)
+		if (cma_any_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr)
+				&& !id_priv->cma_dev)
 			cma_cancel_listens(id_priv);
 		break;
 	default:
@@ -813,6 +824,8 @@ static void cma_release_port(struct rdma_id_private *id_priv)
 		kfree(bind_list);
 	}
 	mutex_unlock(&lock);
+	if (id_priv->sock)
+		sock_release(id_priv->sock);
 }
 
 static void cma_leave_mc_groups(struct rdma_id_private *id_priv)
@@ -874,11 +887,11 @@ static int cma_rep_recv(struct rdma_id_private *id_priv)
 {
 	int ret;
 
-	ret = cma_modify_qp_rtr(id_priv);
+	ret = cma_modify_qp_rtr(id_priv, NULL);
 	if (ret)
 		goto reject;
 
-	ret = cma_modify_qp_rts(id_priv);
+	ret = cma_modify_qp_rts(id_priv, NULL);
 	if (ret)
 		goto reject;
 
@@ -924,7 +937,10 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 	struct rdma_cm_event event;
 	int ret = 0;
 
-	if (cma_disable_remove(id_priv, CMA_CONNECT))
+	if ((ib_event->event != IB_CM_TIMEWAIT_EXIT &&
+		cma_disable_callback(id_priv, CMA_CONNECT)) ||
+	    (ib_event->event == IB_CM_TIMEWAIT_EXIT &&
+		cma_disable_callback(id_priv, CMA_DISCONNECT)))
 		return 0;
 
 	memset(&event, 0, sizeof event);
@@ -960,6 +976,8 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 		event.event = RDMA_CM_EVENT_DISCONNECTED;
 		break;
 	case IB_CM_TIMEWAIT_EXIT:
+		event.event = RDMA_CM_EVENT_TIMEWAIT_EXIT;
+		break;
 	case IB_CM_MRA_RECEIVED:
 		/* ignore event */
 		goto out;
@@ -971,7 +989,7 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 		event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE;
 		break;
 	default:
-		printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d",
+		printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n",
 		       ib_event->event);
 		goto out;
 	}
@@ -981,12 +999,12 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 		/* Destroy the CM ID by returning a non-zero value. */
 		id_priv->cm_id.ib = NULL;
 		cma_exch(id_priv, CMA_DESTROYING);
-		cma_enable_remove(id_priv);
+		mutex_unlock(&id_priv->handler_mutex);
 		rdma_destroy_id(&id_priv->id);
 		return ret;
 	}
 out:
-	cma_enable_remove(id_priv);
+	mutex_unlock(&id_priv->handler_mutex);
 	return ret;
 }
 
@@ -997,8 +1015,9 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
 	struct rdma_cm_id *id;
 	struct rdma_route *rt;
 	union cma_ip_addr *src, *dst;
-	__u16 port;
+	__be16 port;
 	u8 ip_ver;
+	int ret;
 
 	if (cma_get_net_info(ib_event->private_data, listen_id->ps,
 			     &ip_ver, &port, &src, &dst))
@@ -1023,10 +1042,11 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
 	if (rt->num_paths == 2)
 		rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path;
 
-	ib_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid);
 	ib_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid);
-	ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey));
-	rt->addr.dev_addr.dev_type = RDMA_NODE_IB_CA;
+	ret = rdma_translate_ip((struct sockaddr *) &id->route.addr.src_addr,
+				&id->route.addr.dev_addr);
+	if (ret)
+		goto destroy_id;
 
 	id_priv = container_of(id, struct rdma_id_private, id);
 	id_priv->state = CMA_CONNECT;
@@ -1044,7 +1064,7 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
 	struct rdma_id_private *id_priv;
 	struct rdma_cm_id *id;
 	union cma_ip_addr *src, *dst;
-	__u16 port;
+	__be16 port;
 	u8 ip_ver;
 	int ret;
 
@@ -1061,7 +1081,7 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
 	cma_save_net_info(&id->route.addr, &listen_id->route.addr,
 			  ip_ver, port, src, dst);
 
-	ret = rdma_translate_ip(&id->route.addr.src_addr,
+	ret = rdma_translate_ip((struct sockaddr *) &id->route.addr.src_addr,
 				&id->route.addr.dev_addr);
 	if (ret)
 		goto err;
@@ -1096,7 +1116,7 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 	int offset, ret;
 
 	listen_id = cm_id->context;
-	if (cma_disable_remove(listen_id, CMA_LISTEN))
+	if (cma_disable_callback(listen_id, CMA_LISTEN))
 		return -ECONNABORTED;
 
 	memset(&event, 0, sizeof event);
@@ -1117,7 +1137,7 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 		goto out;
 	}
 
-	atomic_inc(&conn_id->dev_remove);
+	mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
 	mutex_lock(&lock);
 	ret = cma_acquire_dev(conn_id);
 	mutex_unlock(&lock);
@@ -1139,7 +1159,7 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 		    !cma_is_ud_ps(conn_id->id.ps))
 			ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
 		mutex_unlock(&lock);
-		cma_enable_remove(conn_id);
+		mutex_unlock(&conn_id->handler_mutex);
 		goto out;
 	}
 
@@ -1148,11 +1168,11 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 
 release_conn_id:
 	cma_exch(conn_id, CMA_DESTROYING);
-	cma_enable_remove(conn_id);
+	mutex_unlock(&conn_id->handler_mutex);
 	rdma_destroy_id(&conn_id->id);
 
 out:
-	cma_enable_remove(listen_id);
+	mutex_unlock(&listen_id->handler_mutex);
 	return ret;
 }
 
@@ -1166,7 +1186,7 @@ static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr,
 {
 	struct cma_hdr *cma_data, *cma_mask;
 	struct sdp_hh *sdp_data, *sdp_mask;
-	__u32 ip4_addr;
+	__be32 ip4_addr;
 	struct in6_addr ip6_addr;
 
 	memset(compare, 0, sizeof *compare);
@@ -1182,12 +1202,12 @@ static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr,
 			sdp_set_ip_ver(sdp_data, 4);
 			sdp_set_ip_ver(sdp_mask, 0xF);
 			sdp_data->dst_addr.ip4.addr = ip4_addr;
-			sdp_mask->dst_addr.ip4.addr = ~0;
+			sdp_mask->dst_addr.ip4.addr = htonl(~0);
 		} else {
 			cma_set_ip_ver(cma_data, 4);
 			cma_set_ip_ver(cma_mask, 0xF);
 			cma_data->dst_addr.ip4.addr = ip4_addr;
-			cma_mask->dst_addr.ip4.addr = ~0;
+			cma_mask->dst_addr.ip4.addr = htonl(~0);
 		}
 		break;
 	case AF_INET6:
@@ -1218,7 +1238,7 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
 	struct sockaddr_in *sin;
 	int ret = 0;
 
-	if (cma_disable_remove(id_priv, CMA_CONNECT))
+	if (cma_disable_callback(id_priv, CMA_CONNECT))
 		return 0;
 
 	memset(&event, 0, sizeof event);
@@ -1262,12 +1282,12 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
 		/* Destroy the CM ID by returning a non-zero value. */
 		id_priv->cm_id.iw = NULL;
 		cma_exch(id_priv, CMA_DESTROYING);
-		cma_enable_remove(id_priv);
+		mutex_unlock(&id_priv->handler_mutex);
 		rdma_destroy_id(&id_priv->id);
 		return ret;
 	}
 
-	cma_enable_remove(id_priv);
+	mutex_unlock(&id_priv->handler_mutex);
 	return ret;
 }
 
@@ -1283,31 +1303,31 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
 	struct ib_device_attr attr;
 
 	listen_id = cm_id->context;
-	if (cma_disable_remove(listen_id, CMA_LISTEN))
+	if (cma_disable_callback(listen_id, CMA_LISTEN))
 		return -ECONNABORTED;
 
 	/* Create a new RDMA id for the new IW CM ID */
 	new_cm_id = rdma_create_id(listen_id->id.event_handler,
 				   listen_id->id.context,
 				   RDMA_PS_TCP);
-	if (!new_cm_id) {
+	if (IS_ERR(new_cm_id)) {
 		ret = -ENOMEM;
 		goto out;
 	}
 	conn_id = container_of(new_cm_id, struct rdma_id_private, id);
-	atomic_inc(&conn_id->dev_remove);
+	mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
 	conn_id->state = CMA_CONNECT;
 
-	dev = ip_dev_find(iw_event->local_addr.sin_addr.s_addr);
+	dev = ip_dev_find(&init_net, iw_event->local_addr.sin_addr.s_addr);
 	if (!dev) {
 		ret = -EADDRNOTAVAIL;
-		cma_enable_remove(conn_id);
+		mutex_unlock(&conn_id->handler_mutex);
 		rdma_destroy_id(new_cm_id);
 		goto out;
 	}
 	ret = rdma_copy_addr(&conn_id->id.route.addr.dev_addr, dev, NULL);
 	if (ret) {
-		cma_enable_remove(conn_id);
+		mutex_unlock(&conn_id->handler_mutex);
 		rdma_destroy_id(new_cm_id);
 		goto out;
 	}
@@ -1316,7 +1336,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
 	ret = cma_acquire_dev(conn_id);
 	mutex_unlock(&lock);
 	if (ret) {
-		cma_enable_remove(conn_id);
+		mutex_unlock(&conn_id->handler_mutex);
 		rdma_destroy_id(new_cm_id);
 		goto out;
 	}
@@ -1332,30 +1352,33 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
 
 	ret = ib_query_device(conn_id->id.device, &attr);
 	if (ret) {
-		cma_enable_remove(conn_id);
+		mutex_unlock(&conn_id->handler_mutex);
 		rdma_destroy_id(new_cm_id);
 		goto out;
 	}
-	
+
 	memset(&event, 0, sizeof event);
 	event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
 	event.param.conn.private_data = iw_event->private_data;
 	event.param.conn.private_data_len = iw_event->private_data_len;
-	event.param.conn.initiator_depth = attr.max_qp_init_rd_atom;	
+	event.param.conn.initiator_depth = attr.max_qp_init_rd_atom;
 	event.param.conn.responder_resources = attr.max_qp_rd_atom;
 	ret = conn_id->id.event_handler(&conn_id->id, &event);
 	if (ret) {
 		/* User wants to destroy the CM ID */
 		conn_id->cm_id.iw = NULL;
 		cma_exch(conn_id, CMA_DESTROYING);
-		cma_enable_remove(conn_id);
+		mutex_unlock(&conn_id->handler_mutex);
 		rdma_destroy_id(&conn_id->id);
+		goto out;
 	}
 
+	mutex_unlock(&conn_id->handler_mutex);
+
 out:
 	if (dev)
 		dev_put(dev);
-	cma_enable_remove(listen_id);
+	mutex_unlock(&listen_id->handler_mutex);
 	return ret;
 }
 
@@ -1371,7 +1394,7 @@ static int cma_ib_listen(struct rdma_id_private *id_priv)
 	if (IS_ERR(id_priv->cm_id.ib))
 		return PTR_ERR(id_priv->cm_id.ib);
 
-	addr = &id_priv->id.route.addr.src_addr;
+	addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr;
 	svc_id = cma_get_service_id(id_priv->id.ps, addr);
 	if (cma_any_addr(addr))
 		ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL);
@@ -1437,7 +1460,7 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv,
 
 	dev_id_priv->state = CMA_ADDR_BOUND;
 	memcpy(&id->route.addr.src_addr, &id_priv->id.route.addr.src_addr,
-	       ip_addr_size(&id_priv->id.route.addr.src_addr));
+	       ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr));
 
 	cma_attach_to_dev(dev_id_priv, cma_dev);
 	list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list);
@@ -1447,7 +1470,7 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv,
 	ret = rdma_listen(id, id_priv->backlog);
 	if (ret)
 		printk(KERN_WARNING "RDMA CMA: cma_listen_on_dev, error %d, "
-		       "listening on device %s", ret, cma_dev->device->name);
+		       "listening on device %s\n", ret, cma_dev->device->name);
 }
 
 static void cma_listen_on_all(struct rdma_id_private *id_priv)
@@ -1557,13 +1580,14 @@ static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms,
 	path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(&addr->dev_addr));
 	path_rec.numb_path = 1;
 	path_rec.reversible = 1;
-	path_rec.service_id = cma_get_service_id(id_priv->id.ps, &addr->dst_addr);
+	path_rec.service_id = cma_get_service_id(id_priv->id.ps,
+							(struct sockaddr *) &addr->dst_addr);
 
 	comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID |
 		    IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH |
 		    IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID;
 
-	if (addr->src_addr.sa_family == AF_INET) {
+	if (addr->src_addr.ss_family == AF_INET) {
 		path_rec.qos_class = cpu_to_be16((u16) id_priv->tos);
 		comp_mask |= IB_SA_PATH_REC_QOS_CLASS;
 	} else {
@@ -1592,7 +1616,7 @@ static void cma_work_handler(struct work_struct *_work)
 	struct rdma_id_private *id_priv = work->id;
 	int destroy = 0;
 
-	atomic_inc(&id_priv->dev_remove);
+	mutex_lock(&id_priv->handler_mutex);
 	if (!cma_comp_exch(id_priv, work->old_state, work->new_state))
 		goto out;
 
@@ -1601,7 +1625,31 @@ static void cma_work_handler(struct work_struct *_work)
 		destroy = 1;
 	}
 out:
-	cma_enable_remove(id_priv);
+	mutex_unlock(&id_priv->handler_mutex);
+	cma_deref_id(id_priv);
+	if (destroy)
+		rdma_destroy_id(&id_priv->id);
+	kfree(work);
+}
+
+static void cma_ndev_work_handler(struct work_struct *_work)
+{
+	struct cma_ndev_work *work = container_of(_work, struct cma_ndev_work, work);
+	struct rdma_id_private *id_priv = work->id;
+	int destroy = 0;
+
+	mutex_lock(&id_priv->handler_mutex);
+	if (id_priv->state == CMA_DESTROYING ||
+	    id_priv->state == CMA_DEVICE_REMOVAL)
+		goto out;
+
+	if (id_priv->id.event_handler(&id_priv->id, &work->event)) {
+		cma_exch(id_priv, CMA_DESTROYING);
+		destroy = 1;
+	}
+
+out:
+	mutex_unlock(&id_priv->handler_mutex);
 	cma_deref_id(id_priv);
 	if (destroy)
 		rdma_destroy_id(&id_priv->id);
@@ -1764,7 +1812,7 @@ static void addr_handler(int status, struct sockaddr *src_addr,
 	struct rdma_cm_event event;
 
 	memset(&event, 0, sizeof event);
-	atomic_inc(&id_priv->dev_remove);
+	mutex_lock(&id_priv->handler_mutex);
 
 	/*
 	 * Grab mutex to block rdma_destroy_id() from removing the device while
@@ -1793,13 +1841,13 @@ static void addr_handler(int status, struct sockaddr *src_addr,
 
 	if (id_priv->id.event_handler(&id_priv->id, &event)) {
 		cma_exch(id_priv, CMA_DESTROYING);
-		cma_enable_remove(id_priv);
+		mutex_unlock(&id_priv->handler_mutex);
 		cma_deref_id(id_priv);
 		rdma_destroy_id(&id_priv->id);
 		return;
 	}
 out:
-	cma_enable_remove(id_priv);
+	mutex_unlock(&id_priv->handler_mutex);
 	cma_deref_id(id_priv);
 }
 
@@ -1823,7 +1871,7 @@ static int cma_resolve_loopback(struct rdma_id_private *id_priv)
 	ib_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid);
 	ib_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid);
 
-	if (cma_zero_addr(&id_priv->id.route.addr.src_addr)) {
+	if (cma_zero_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr)) {
 		src_in = (struct sockaddr_in *)&id_priv->id.route.addr.src_addr;
 		dst_in = (struct sockaddr_in *)&id_priv->id.route.addr.dst_addr;
 		src_in->sin_family = dst_in->sin_family;
@@ -1872,7 +1920,7 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
 	if (cma_any_addr(dst_addr))
 		ret = cma_resolve_loopback(id_priv);
 	else
-		ret = rdma_resolve_ip(&addr_client, &id->route.addr.src_addr,
+		ret = rdma_resolve_ip(&addr_client, (struct sockaddr *) &id->route.addr.src_addr,
 				      dst_addr, &id->route.addr.dev_addr,
 				      timeout_ms, addr_handler, id_priv);
 	if (ret)
@@ -1996,11 +2044,11 @@ static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
 	 * We don't support binding to any address if anyone is bound to
 	 * a specific address on the same port.
 	 */
-	if (cma_any_addr(&id_priv->id.route.addr.src_addr))
+	if (cma_any_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr))
 		return -EADDRNOTAVAIL;
 
 	hlist_for_each_entry(cur_id, node, &bind_list->owners, node) {
-		if (cma_any_addr(&cur_id->id.route.addr.src_addr))
+		if (cma_any_addr((struct sockaddr *) &cur_id->id.route.addr.src_addr))
 			return -EADDRNOTAVAIL;
 
 		cur_sin = (struct sockaddr_in *) &cur_id->id.route.addr.src_addr;
@@ -2012,6 +2060,34 @@ static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
 	return 0;
 }
 
+static int cma_get_tcp_port(struct rdma_id_private *id_priv)
+{
+	int ret;
+	int size;
+	struct socket *sock;
+
+	ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (ret)
+		return ret;
+	ret = sock->ops->bind(sock,
+			(struct sockaddr *) &id_priv->id.route.addr.src_addr,
+			ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr));
+	if (ret) {
+		sock_release(sock);
+		return ret;
+	}
+	size = ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr);
+	ret = sock->ops->getname(sock,
+			(struct sockaddr *) &id_priv->id.route.addr.src_addr,
+			&size, 0);
+	if (ret) {
+		sock_release(sock);
+		return ret;
+	}
+	id_priv->sock = sock;
+	return 0;
+}
+
 static int cma_get_port(struct rdma_id_private *id_priv)
 {
 	struct idr *ps;
@@ -2023,6 +2099,11 @@ static int cma_get_port(struct rdma_id_private *id_priv)
 		break;
 	case RDMA_PS_TCP:
 		ps = &tcp_ps;
+		if (unify_tcp_port_space) {
+			ret = cma_get_tcp_port(id_priv);
+			if (ret)
+				goto out;
+		}
 		break;
 	case RDMA_PS_UDP:
 		ps = &udp_ps;
@@ -2035,12 +2116,12 @@ static int cma_get_port(struct rdma_id_private *id_priv)
 	}
 
 	mutex_lock(&lock);
-	if (cma_any_port(&id_priv->id.route.addr.src_addr))
+	if (cma_any_port((struct sockaddr *) &id_priv->id.route.addr.src_addr))
 		ret = cma_alloc_any_port(ps, id_priv);
 	else
 		ret = cma_use_port(ps, id_priv);
 	mutex_unlock(&lock);
-
+out:
 	return ret;
 }
 
@@ -2126,7 +2207,7 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
 	struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd;
 	int ret = 0;
 
-	if (cma_disable_remove(id_priv, CMA_CONNECT))
+	if (cma_disable_callback(id_priv, CMA_CONNECT))
 		return 0;
 
 	memset(&event, 0, sizeof event);
@@ -2143,6 +2224,12 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
 			event.status = ib_event->param.sidr_rep_rcvd.status;
 			break;
 		}
+		ret = cma_set_qkey(id_priv);
+		if (ret) {
+			event.event = RDMA_CM_EVENT_ADDR_ERROR;
+			event.status = -EINVAL;
+			break;
+		}
 		if (id_priv->qkey != rep->qkey) {
 			event.event = RDMA_CM_EVENT_UNREACHABLE;
 			event.status = -EINVAL;
@@ -2157,7 +2244,7 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
 		event.status = 0;
 		break;
 	default:
-		printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d",
+		printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n",
 		       ib_event->event);
 		goto out;
 	}
@@ -2167,12 +2254,12 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
 		/* Destroy the CM ID by returning a non-zero value. */
 		id_priv->cm_id.ib = NULL;
 		cma_exch(id_priv, CMA_DESTROYING);
-		cma_enable_remove(id_priv);
+		mutex_unlock(&id_priv->handler_mutex);
 		rdma_destroy_id(&id_priv->id);
 		return ret;
 	}
 out:
-	cma_enable_remove(id_priv);
+	mutex_unlock(&id_priv->handler_mutex);
 	return ret;
 }
 
@@ -2207,8 +2294,8 @@ static int cma_resolve_ib_udp(struct rdma_id_private *id_priv,
 
 	req.path = route->path_rec;
 	req.service_id = cma_get_service_id(id_priv->id.ps,
-					    &route->addr.dst_addr);
-       req.timeout_ms = 1 << (cma_response_timeout - 8);
+					    (struct sockaddr *) &route->addr.dst_addr);
+	req.timeout_ms = 1 << (cma_response_timeout - 8);
 	req.max_cm_retries = CMA_MAX_CM_RETRIES;
 
 	ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req);
@@ -2258,7 +2345,7 @@ static int cma_connect_ib(struct rdma_id_private *id_priv,
 		req.alternate_path = &route->path_rec[1];
 
 	req.service_id = cma_get_service_id(id_priv->id.ps,
-					    &route->addr.dst_addr);
+					    (struct sockaddr *) &route->addr.dst_addr);
 	req.qp_num = id_priv->qp_num;
 	req.qp_type = IB_QPT_RC;
 	req.starting_psn = id_priv->seq_num;
@@ -2305,7 +2392,7 @@ static int cma_connect_iw(struct rdma_id_private *id_priv,
 	sin = (struct sockaddr_in*) &id_priv->id.route.addr.dst_addr;
 	cm_id->remote_addr = *sin;
 
-	ret = cma_modify_qp_rtr(id_priv);
+	ret = cma_modify_qp_rtr(id_priv, conn_param);
 	if (ret)
 		goto out;
 
@@ -2368,25 +2455,15 @@ static int cma_accept_ib(struct rdma_id_private *id_priv,
 			 struct rdma_conn_param *conn_param)
 {
 	struct ib_cm_rep_param rep;
-	struct ib_qp_attr qp_attr;
-	int qp_attr_mask, ret;
-
-	if (id_priv->id.qp) {
-		ret = cma_modify_qp_rtr(id_priv);
-		if (ret)
-			goto out;
+	int ret;
 
-		qp_attr.qp_state = IB_QPS_RTS;
-		ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, &qp_attr,
-					 &qp_attr_mask);
-		if (ret)
-			goto out;
+	ret = cma_modify_qp_rtr(id_priv, conn_param);
+	if (ret)
+		goto out;
 
-		qp_attr.max_rd_atomic = conn_param->initiator_depth;
-		ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
-		if (ret)
-			goto out;
-	}
+	ret = cma_modify_qp_rts(id_priv, conn_param);
+	if (ret)
+		goto out;
 
 	memset(&rep, 0, sizeof rep);
 	rep.qp_num = id_priv->qp_num;
@@ -2411,7 +2488,7 @@ static int cma_accept_iw(struct rdma_id_private *id_priv,
 	struct iw_cm_conn_param iw_param;
 	int ret;
 
-	ret = cma_modify_qp_rtr(id_priv);
+	ret = cma_modify_qp_rtr(id_priv, conn_param);
 	if (ret)
 		return ret;
 
@@ -2432,10 +2509,14 @@ static int cma_send_sidr_rep(struct rdma_id_private *id_priv,
 			     const void *private_data, int private_data_len)
 {
 	struct ib_cm_sidr_rep_param rep;
+	int ret;
 
 	memset(&rep, 0, sizeof rep);
 	rep.status = status;
 	if (status == IB_SIDR_SUCCESS) {
+		ret = cma_set_qkey(id_priv);
+		if (ret)
+			return ret;
 		rep.qp_num = id_priv->qp_num;
 		rep.qkey = id_priv->qkey;
 	}
@@ -2580,8 +2661,8 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
 	int ret;
 
 	id_priv = mc->id_priv;
-	if (cma_disable_remove(id_priv, CMA_ADDR_BOUND) &&
-	    cma_disable_remove(id_priv, CMA_ADDR_RESOLVED))
+	if (cma_disable_callback(id_priv, CMA_ADDR_BOUND) &&
+	    cma_disable_callback(id_priv, CMA_ADDR_RESOLVED))
 		return 0;
 
 	mutex_lock(&id_priv->qp_mutex);
@@ -2606,12 +2687,12 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
 	ret = id_priv->id.event_handler(&id_priv->id, &event);
 	if (ret) {
 		cma_exch(id_priv, CMA_DESTROYING);
-		cma_enable_remove(id_priv);
+		mutex_unlock(&id_priv->handler_mutex);
 		rdma_destroy_id(&id_priv->id);
 		return 0;
 	}
 
-	cma_enable_remove(id_priv);
+	mutex_unlock(&id_priv->handler_mutex);
 	return 0;
 }
 
@@ -2631,11 +2712,9 @@ static void cma_set_mgid(struct rdma_id_private *id_priv,
 		/* IPv6 address is an SA assigned MGID. */
 		memcpy(mgid, &sin6->sin6_addr, sizeof *mgid);
 	} else {
-		ip_ib_mc_map(sin->sin_addr.s_addr, mc_map);
+		ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map);
 		if (id_priv->id.ps == RDMA_PS_UDP)
 			mc_map[7] = 0x01;	/* Use RDMA CM signature */
-		mc_map[8] = ib_addr_get_pkey(dev_addr) >> 8;
-		mc_map[9] = (unsigned char) ib_addr_get_pkey(dev_addr);
 		*mgid = *(union ib_gid *) (mc_map + 4);
 	}
 }
@@ -2654,7 +2733,7 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv,
 	if (ret)
 		return ret;
 
-	cma_set_mgid(id_priv, &mc->addr, &rec.mgid);
+	cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid);
 	if (id_priv->id.ps == RDMA_PS_UDP)
 		rec.qkey = cpu_to_be32(RDMA_UDP_QKEY);
 	ib_addr_get_sgid(dev_addr, &rec.port_gid);
@@ -2667,6 +2746,10 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv,
 		    IB_SA_MCMEMBER_REC_FLOW_LABEL |
 		    IB_SA_MCMEMBER_REC_TRAFFIC_CLASS;
 
+	if (id_priv->id.ps == RDMA_PS_IPOIB)
+		comp_mask |= IB_SA_MCMEMBER_REC_RATE |
+			     IB_SA_MCMEMBER_REC_RATE_SELECTOR;
+
 	mc->multicast.ib = ib_sa_join_multicast(&sa_client, id_priv->id.device,
 						id_priv->id.port_num, &rec,
 						comp_mask, GFP_KERNEL,
@@ -2745,6 +2828,62 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr)
 }
 EXPORT_SYMBOL(rdma_leave_multicast);
 
+static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv)
+{
+	struct rdma_dev_addr *dev_addr;
+	struct cma_ndev_work *work;
+
+	dev_addr = &id_priv->id.route.addr.dev_addr;
+
+	if ((dev_addr->src_dev == ndev) &&
+	    memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) {
+		printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n",
+		       ndev->name, &id_priv->id);
+		work = kzalloc(sizeof *work, GFP_KERNEL);
+		if (!work)
+			return -ENOMEM;
+
+		INIT_WORK(&work->work, cma_ndev_work_handler);
+		work->id = id_priv;
+		work->event.event = RDMA_CM_EVENT_ADDR_CHANGE;
+		atomic_inc(&id_priv->refcount);
+		queue_work(cma_wq, &work->work);
+	}
+
+	return 0;
+}
+
+static int cma_netdev_callback(struct notifier_block *self, unsigned long event,
+			       void *ctx)
+{
+	struct net_device *ndev = (struct net_device *)ctx;
+	struct cma_device *cma_dev;
+	struct rdma_id_private *id_priv;
+	int ret = NOTIFY_DONE;
+
+	if (event != NETDEV_BONDING_FAILOVER)
+		return NOTIFY_DONE;
+
+	if (!(ndev->flags & IFF_MASTER))
+		return NOTIFY_DONE;
+
+	mutex_lock(&lock);
+	list_for_each_entry(cma_dev, &dev_list, list)
+		list_for_each_entry(id_priv, &cma_dev->id_list, list) {
+			ret = cma_netdev_change(ndev, id_priv);
+			if (ret)
+				goto out;
+		}
+
+out:
+	mutex_unlock(&lock);
+	return ret;
+}
+
+static struct notifier_block cma_nb = {
+	.notifier_call = cma_netdev_callback
+};
+
 static void cma_add_one(struct ib_device *device)
 {
 	struct cma_device *cma_dev;
@@ -2772,6 +2911,7 @@ static int cma_remove_id_dev(struct rdma_id_private *id_priv)
 {
 	struct rdma_cm_event event;
 	enum cma_state state;
+	int ret = 0;
 
 	/* Record that we want to remove the device */
 	state = cma_exch(id_priv, CMA_DEVICE_REMOVAL);
@@ -2779,15 +2919,18 @@ static int cma_remove_id_dev(struct rdma_id_private *id_priv)
 		return 0;
 
 	cma_cancel_operation(id_priv, state);
-	wait_event(id_priv->wait_remove, !atomic_read(&id_priv->dev_remove));
+	mutex_lock(&id_priv->handler_mutex);
 
 	/* Check for destruction from another callback. */
 	if (!cma_comp(id_priv, CMA_DEVICE_REMOVAL))
-		return 0;
+		goto out;
 
 	memset(&event, 0, sizeof event);
 	event.event = RDMA_CM_EVENT_DEVICE_REMOVAL;
-	return id_priv->id.event_handler(&id_priv->id, &event);
+	ret = id_priv->id.event_handler(&id_priv->id, &event);
+out:
+	mutex_unlock(&id_priv->handler_mutex);
+	return ret;
 }
 
 static void cma_process_remove(struct cma_device *cma_dev)
@@ -2849,6 +2992,7 @@ static int cma_init(void)
 
 	ib_sa_register_client(&sa_client);
 	rdma_addr_register_client(&addr_client);
+	register_netdevice_notifier(&cma_nb);
 
 	ret = ib_register_client(&cma_client);
 	if (ret)
@@ -2856,6 +3000,7 @@ static int cma_init(void)
 	return 0;
 
 err:
+	unregister_netdevice_notifier(&cma_nb);
 	rdma_addr_unregister_client(&addr_client);
 	ib_sa_unregister_client(&sa_client);
 	destroy_workqueue(cma_wq);
@@ -2865,6 +3010,7 @@ err:
 static void cma_cleanup(void)
 {
 	ib_unregister_client(&cma_client);
+	unregister_netdevice_notifier(&cma_nb);
 	rdma_addr_unregister_client(&addr_client);
 	ib_sa_unregister_client(&sa_client);
 	destroy_workqueue(cma_wq);
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 7ad47a4..05ac36e 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -28,8 +28,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: core_priv.h 1349 2004-12-16 21:09:43Z roland $
  */
 
 #ifndef _CORE_PRIV_H
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 858b6ea..179f753 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -29,8 +29,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: device.c 1349 2004-12-16 21:09:43Z roland $
  */
 
 #include <linux/module.h>
@@ -180,9 +178,14 @@ static int end_port(struct ib_device *device)
  */
 struct ib_device *ib_alloc_device(size_t size)
 {
+	struct ib_device *ibdev;
+
 	BUG_ON(size < sizeof (struct ib_device));
 
-	return kzalloc(size, GFP_KERNEL);
+	ibdev = kzalloc(size, GFP_KERNEL);
+	if (ibdev)
+		mutex_init(&ibdev->sysfs_mutex);
+	return ibdev;
 }
 EXPORT_SYMBOL(ib_alloc_device);
 
@@ -313,9 +316,10 @@ int ib_register_device(struct ib_device *device)
 		goto out;
 	}
 
+	mutex_lock(&device->sysfs_mutex);
 	list_add_tail(&device->core_list, &device_list);
-
 	device->reg_state = IB_DEV_REGISTERED;
+	mutex_unlock(&device->sysfs_mutex);
 
 	{
 		struct ib_client *client;
@@ -361,7 +365,9 @@ void ib_unregister_device(struct ib_device *device)
 		kfree(context);
 	spin_unlock_irqrestore(&device->client_data_lock, flags);
 
+	mutex_lock(&device->sysfs_mutex);
 	device->reg_state = IB_DEV_UNREGISTERED;
+	mutex_unlock(&device->sysfs_mutex);
 }
 EXPORT_SYMBOL(ib_unregister_device);
 
@@ -727,6 +733,10 @@ static int __init ib_core_init(void)
 		dma_map_sg_hp_wa = 1;
 #endif
 
+	ret = init_mnt_writers();
+	if (ret)
+		printk(KERN_WARNING "Couldn't init mnt_writers\n");
+
 	ret = ib_sysfs_setup();
 	if (ret)
 		printk(KERN_WARNING "Couldn't create InfiniBand device class\n");
diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c
index bbfc713..4507043 100644
--- a/drivers/infiniband/core/fmr_pool.c
+++ b/drivers/infiniband/core/fmr_pool.c
@@ -29,8 +29,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: fmr_pool.c 2730 2005-06-28 16:43:03Z sean.hefty $
  */
 
 #include <linux/errno.h>
@@ -158,8 +156,7 @@ static void ib_fmr_batch_release(struct ib_fmr_pool *pool)
 #endif
 	}
 
-	list_splice(&pool->dirty_list, &unmap_list);
-	INIT_LIST_HEAD(&pool->dirty_list);
+	list_splice_init(&pool->dirty_list, &unmap_list);
 	pool->dirty_len = 0;
 
 	spin_unlock_irq(&pool->pool_lock);
@@ -182,8 +179,7 @@ static int ib_fmr_cleanup_thread(void *pool_ptr)
 	struct ib_fmr_pool *pool = pool_ptr;
 
 	do {
-		if (pool->dirty_len >= pool->dirty_watermark ||
-		    atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) < 0) {
+		if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) < 0) {
 			ib_fmr_batch_release(pool);
 
 			atomic_inc(&pool->flush_ser);
@@ -194,8 +190,7 @@ static int ib_fmr_cleanup_thread(void *pool_ptr)
 		}
 
 		set_current_state(TASK_INTERRUPTIBLE);
-		if (pool->dirty_len < pool->dirty_watermark &&
-		    atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) >= 0 &&
+		if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) >= 0 &&
 		    !kthread_should_stop())
 			schedule();
 		__set_current_state(TASK_RUNNING);
@@ -308,10 +303,13 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd             *pd,
 			.max_maps   = pool->max_remaps,
 			.page_shift = params->page_shift
 		};
+		int bytes_per_fmr = sizeof *fmr;
+
+		if (pool->cache_bucket)
+			bytes_per_fmr += params->max_pages_per_fmr * sizeof (u64);
 
 		for (i = 0; i < params->pool_size; ++i) {
-			fmr = kmalloc(sizeof *fmr + params->max_pages_per_fmr * sizeof (u64),
-				      GFP_KERNEL);
+			fmr = kmalloc(bytes_per_fmr, GFP_KERNEL);
 			if (!fmr) {
 				printk(KERN_WARNING PFX "failed to allocate fmr "
 				       "struct for FMR %d\n", i);
@@ -526,8 +524,10 @@ int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr)
 			list_add_tail(&fmr->list, &pool->free_list);
 		} else {
 			list_add_tail(&fmr->list, &pool->dirty_list);
-			++pool->dirty_len;
-			wake_up_process(pool->thread);
+			if (++pool->dirty_len >= pool->dirty_watermark) {
+				atomic_inc(&pool->req_ser);
+				wake_up_process(pool->thread);
+			}
 		}
 	}
 
diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
index 81c9195..8f9509e 100644
--- a/drivers/infiniband/core/iwcm.c
+++ b/drivers/infiniband/core/iwcm.c
@@ -942,8 +942,7 @@ static int iwcm_init_qp_init_attr(struct iwcm_id_private *cm_id_priv,
 	case IW_CM_STATE_CONN_RECV:
 	case IW_CM_STATE_ESTABLISHED:
 		*qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS;
-		qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE |
-					   IB_ACCESS_REMOTE_WRITE|
+		qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE|
 					   IB_ACCESS_REMOTE_READ;
 		ret = 0;
 		break;
diff --git a/drivers/infiniband/core/local_sa.c b/drivers/infiniband/core/local_sa.c
index 49ca262..eb62c42 100644
--- a/drivers/infiniband/core/local_sa.c
+++ b/drivers/infiniband/core/local_sa.c
@@ -978,7 +978,7 @@ static struct ib_sa_path_rec *get_next_path(struct ib_sa_attr_iter *iter,
 static void report_path(struct work_struct *work)
 {
 	struct sa_path_request *req;
-	
+
 	req = container_of(work, struct sa_path_request, work);
 	req->callback(0, &req->path_rec, req->context);
 	ib_sa_client_put(req->client);
@@ -1058,7 +1058,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,
 	req->context = context;
 
 	ib_sa_client_get(client);
-	queue_work(sa_wq, &req->work);	
+	queue_work(sa_wq, &req->work);
 	*sa_query = ERR_PTR(-EEXIST);
 	return 0;
 
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 4402d03..7934a4c 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -301,6 +301,16 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
 	mad_agent_priv->agent.context = context;
 	mad_agent_priv->agent.qp = port_priv->qp_info[qpn].qp;
 	mad_agent_priv->agent.port_num = port_num;
+	spin_lock_init(&mad_agent_priv->lock);
+	INIT_LIST_HEAD(&mad_agent_priv->send_list);
+	INIT_LIST_HEAD(&mad_agent_priv->wait_list);
+	INIT_LIST_HEAD(&mad_agent_priv->done_list);
+	INIT_LIST_HEAD(&mad_agent_priv->rmpp_list);
+	INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends);
+	INIT_LIST_HEAD(&mad_agent_priv->local_list);
+	INIT_WORK(&mad_agent_priv->local_work, local_completions);
+	atomic_set(&mad_agent_priv->refcount, 1);
+	init_completion(&mad_agent_priv->comp);
 
 	spin_lock_irqsave(&port_priv->reg_lock, flags);
 	mad_agent_priv->agent.hi_tid = ++ib_mad_client_id;
@@ -350,17 +360,6 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
 	list_add_tail(&mad_agent_priv->agent_list, &port_priv->agent_list);
 	spin_unlock_irqrestore(&port_priv->reg_lock, flags);
 
-	spin_lock_init(&mad_agent_priv->lock);
-	INIT_LIST_HEAD(&mad_agent_priv->send_list);
-	INIT_LIST_HEAD(&mad_agent_priv->wait_list);
-	INIT_LIST_HEAD(&mad_agent_priv->done_list);
-	INIT_LIST_HEAD(&mad_agent_priv->rmpp_list);
-	INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends);
-	INIT_LIST_HEAD(&mad_agent_priv->local_list);
-	INIT_WORK(&mad_agent_priv->local_work, local_completions);
-	atomic_set(&mad_agent_priv->refcount, 1);
-	init_completion(&mad_agent_priv->comp);
-
 	return &mad_agent_priv->agent;
 
 error4:
@@ -747,9 +746,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
 		break;
 	case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED:
 		kmem_cache_free(ib_mad_cache, mad_priv);
-		kfree(local);
-		ret = 1;
-		goto out;
+		break;
 	case IB_MAD_RESULT_SUCCESS:
 		/* Treat like an incoming receive MAD */
 		port_priv = ib_get_mad_port(mad_agent_priv->agent.device,
@@ -760,10 +757,12 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
 						        &mad_priv->mad.mad);
 		}
 		if (!port_priv || !recv_mad_agent) {
+			/*
+			 * No receiving agent so drop packet and
+			 * generate send completion.
+			 */
 			kmem_cache_free(ib_mad_cache, mad_priv);
-			kfree(local);
-			ret = 0;
-			goto out;
+			break;
 		}
 		local->mad_priv = mad_priv;
 		local->recv_mad_agent = recv_mad_agent;
@@ -1102,7 +1101,9 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
 		mad_send_wr->tid = ((struct ib_mad_hdr *) send_buf->mad)->tid;
 		/* Timeout will be updated after send completes */
 		mad_send_wr->timeout = msecs_to_jiffies(send_buf->timeout_ms);
-		mad_send_wr->retries = send_buf->retries;
+		mad_send_wr->max_retries = send_buf->retries;
+		mad_send_wr->retries_left = send_buf->retries;
+		send_buf->retries = 0;
 		/* Reference for work request to QP + response */
 		mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
 		mad_send_wr->status = IB_WC_SUCCESS;
@@ -1695,9 +1696,8 @@ static inline int rcv_has_same_gid(struct ib_mad_agent_private *mad_agent_priv,
 	u8 port_num = mad_agent_priv->agent.port_num;
 	u8 lmc;
 
-	send_resp = ((struct ib_mad *)(wr->send_buf.mad))->
-		     mad_hdr.method & IB_MGMT_METHOD_RESP;
-	rcv_resp = rwc->recv_buf.mad->mad_hdr.method & IB_MGMT_METHOD_RESP;
+	send_resp = ib_response_mad((struct ib_mad *)wr->send_buf.mad);
+	rcv_resp = ib_response_mad(rwc->recv_buf.mad);
 
 	if (send_resp == rcv_resp)
 		/* both requests, or both responses. GIDs different */
@@ -1933,15 +1933,6 @@ local:
 	if (port_priv->device->process_mad) {
 		int ret;
 
-		if (!response) {
-			printk(KERN_ERR PFX "No memory for response MAD\n");
-			/*
-			 * Is it better to assume that
-			 * it wouldn't be processed ?
-			 */
-			goto out;
-		}
-
 		ret = port_priv->device->process_mad(port_priv->device, 0,
 						     port_priv->port_num,
 						     wc, &recv->grh,
@@ -2368,7 +2359,7 @@ static void local_completions(struct work_struct *work)
 	struct ib_mad_local_private *local;
 	struct ib_mad_agent_private *recv_mad_agent;
 	unsigned long flags;
-	int recv = 0;
+	int free_mad;
 	struct ib_wc wc;
 	struct ib_mad_send_wc mad_send_wc;
 
@@ -2382,14 +2373,15 @@ static void local_completions(struct work_struct *work)
 				   completion_list);
 		list_del(&local->completion_list);
 		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+		free_mad = 0;
 		if (local->mad_priv) {
 			recv_mad_agent = local->recv_mad_agent;
 			if (!recv_mad_agent) {
 				printk(KERN_ERR PFX "No receive MAD agent for local completion\n");
+				free_mad = 1;
 				goto local_send_completion;
 			}
 
-			recv = 1;
 			/*
 			 * Defined behavior is to complete response
 			 * before request
@@ -2434,7 +2426,7 @@ local_send_completion:
 
 		spin_lock_irqsave(&mad_agent_priv->lock, flags);
 		atomic_dec(&mad_agent_priv->refcount);
-		if (!recv)
+		if (free_mad)
 			kmem_cache_free(ib_mad_cache, local->mad_priv);
 		kfree(local);
 	}
@@ -2445,9 +2437,12 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
 {
 	int ret;
 
-	if (!mad_send_wr->retries--)
+	if (!mad_send_wr->retries_left)
 		return -ETIMEDOUT;
 
+	mad_send_wr->retries_left--;
+	mad_send_wr->send_buf.retries++;
+
 	mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
 
 	if (mad_send_wr->mad_agent_priv->agent.rmpp_version) {
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index 9be5cc0..05ce331 100644
--- a/drivers/infiniband/core/mad_priv.h
+++ b/drivers/infiniband/core/mad_priv.h
@@ -30,8 +30,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: mad_priv.h 5596 2006-03-03 01:00:07Z sean.hefty $
  */
 
 #ifndef __IB_MAD_PRIV_H__
@@ -131,7 +129,8 @@ struct ib_mad_send_wr_private {
 	struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG];
 	__be64 tid;
 	unsigned long timeout;
-	int retries;
+	int max_retries;
+	int retries_left;
 	int retry;
 	int refcount;
 	enum ib_wc_status status;
diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c
index d43bc62..3af2b84 100644
--- a/drivers/infiniband/core/mad_rmpp.c
+++ b/drivers/infiniband/core/mad_rmpp.c
@@ -29,8 +29,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: mad_rmpp.c 1921 2005-03-02 22:58:44Z sean.hefty $
  */
 
 #include "mad_priv.h"
@@ -135,7 +133,7 @@ static void ack_recv(struct mad_rmpp_recv *rmpp_recv,
 	msg = ib_create_send_mad(&rmpp_recv->agent->agent, recv_wc->wc->src_qp,
 				 recv_wc->wc->pkey_index, 1, hdr_len,
 				 0, GFP_KERNEL);
-	if (!msg)
+	if (IS_ERR(msg))
 		return;
 
 	format_ack(msg, (struct ib_rmpp_mad *) recv_wc->recv_buf.mad, rmpp_recv);
@@ -684,7 +682,7 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent,
 
 	if (seg_num > mad_send_wr->last_ack) {
 		adjust_last_ack(mad_send_wr, seg_num);
-		mad_send_wr->retries = mad_send_wr->send_buf.retries;
+		mad_send_wr->retries_left = mad_send_wr->max_retries;
 	}
 	mad_send_wr->newwin = newwin;
 	if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) {
diff --git a/drivers/infiniband/core/mad_rmpp.h b/drivers/infiniband/core/mad_rmpp.h
index f0616fd..3d336bf 100644
--- a/drivers/infiniband/core/mad_rmpp.h
+++ b/drivers/infiniband/core/mad_rmpp.h
@@ -28,8 +28,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: mad_rmpp.h 1921 2005-02-25 22:58:44Z sean.hefty $
  */
 
 #ifndef __MAD_RMPP_H__
diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c
index dda056c..ca95426 100644
--- a/drivers/infiniband/core/multicast.c
+++ b/drivers/infiniband/core/multicast.c
@@ -73,11 +73,20 @@ struct mcast_device {
 };
 
 enum mcast_state {
-	MCAST_IDLE,
 	MCAST_JOINING,
 	MCAST_MEMBER,
+	MCAST_ERROR,
+};
+
+enum mcast_group_state {
+	MCAST_IDLE,
 	MCAST_BUSY,
-	MCAST_ERROR
+	MCAST_GROUP_ERROR,
+	MCAST_PKEY_EVENT
+};
+
+enum {
+	MCAST_INVALID_PKEY_INDEX = 0xFFFF
 };
 
 struct mcast_member;
@@ -93,9 +102,12 @@ struct mcast_group {
 	struct mcast_member	*last_join;
 	int			members[3];
 	atomic_t		refcount;
-	enum mcast_state	state;
+	enum mcast_group_state	state;
 	struct ib_sa_query	*query;
 	int			query_id;
+	u16			pkey_index;
+	u8			leave_state;
+	int			retries;
 };
 
 struct mcast_member {
@@ -312,6 +324,7 @@ static int send_leave(struct mcast_group *group, u8 leave_state)
 
 	rec = group->rec;
 	rec.join_state = leave_state;
+	group->leave_state = leave_state;
 
 	ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device,
 				       port->port_num, IB_SA_METHOD_DELETE, &rec,
@@ -350,9 +363,19 @@ static int fail_join(struct mcast_group *group, struct mcast_member *member,
 static void process_group_error(struct mcast_group *group)
 {
 	struct mcast_member *member;
-	int ret;
+	int ret = 0;
+	u16 pkey_index;
+
+	if (group->state == MCAST_PKEY_EVENT)
+		ret = ib_find_pkey(group->port->dev->device,
+				   group->port->port_num,
+				   be16_to_cpu(group->rec.pkey), &pkey_index);
 
 	spin_lock_irq(&group->lock);
+	if (group->state == MCAST_PKEY_EVENT && !ret &&
+	    group->pkey_index == pkey_index)
+		goto out;
+
 	while (!list_empty(&group->active_list)) {
 		member = list_entry(group->active_list.next,
 				    struct mcast_member, list);
@@ -371,6 +394,7 @@ static void process_group_error(struct mcast_group *group)
 	}
 
 	group->rec.join_state = 0;
+out:
 	group->state = MCAST_BUSY;
 	spin_unlock_irq(&group->lock);
 }
@@ -387,9 +411,9 @@ static void mcast_work_handler(struct work_struct *work)
 retest:
 	spin_lock_irq(&group->lock);
 	while (!list_empty(&group->pending_list) ||
-	       (group->state == MCAST_ERROR)) {
+	       (group->state != MCAST_BUSY)) {
 
-		if (group->state == MCAST_ERROR) {
+		if (group->state != MCAST_BUSY) {
 			spin_unlock_irq(&group->lock);
 			process_group_error(group);
 			goto retest;
@@ -466,12 +490,19 @@ static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
 			 void *context)
 {
 	struct mcast_group *group = context;
+	u16 pkey_index = MCAST_INVALID_PKEY_INDEX;
 
 	if (status)
 		process_join_error(group, status);
 	else {
+		ib_find_pkey(group->port->dev->device, group->port->port_num,
+			     be16_to_cpu(rec->pkey), &pkey_index);
+
 		spin_lock_irq(&group->port->lock);
 		group->rec = *rec;
+		if (group->state == MCAST_BUSY &&
+		    group->pkey_index == MCAST_INVALID_PKEY_INDEX)
+			group->pkey_index = pkey_index;
 		if (!memcmp(&mgid0, &group->rec.mgid, sizeof mgid0)) {
 			rb_erase(&group->node, &group->port->table);
 			mcast_insert(group->port, group, 1);
@@ -486,7 +517,11 @@ static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
 {
 	struct mcast_group *group = context;
 
-	mcast_work_handler(&group->work);
+	if (status && (group->retries > 0) &&
+	    !send_leave(group, group->leave_state))
+		group->retries--;
+	else
+		mcast_work_handler(&group->work);
 }
 
 static struct mcast_group *acquire_group(struct mcast_port *port,
@@ -509,8 +544,10 @@ static struct mcast_group *acquire_group(struct mcast_port *port,
 	if (!group)
 		return NULL;
 
+	group->retries = 3;
 	group->port = port;
 	group->rec.mgid = *mgid;
+	group->pkey_index = MCAST_INVALID_PKEY_INDEX;
 	INIT_LIST_HEAD(&group->pending_list);
 	INIT_LIST_HEAD(&group->active_list);
 	INIT_WORK(&group->work, mcast_work_handler);
@@ -679,7 +716,8 @@ int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
 }
 EXPORT_SYMBOL(ib_init_ah_from_mcmember);
 
-static void mcast_groups_lost(struct mcast_port *port)
+static void mcast_groups_event(struct mcast_port *port,
+			       enum mcast_group_state state)
 {
 	struct mcast_group *group;
 	struct rb_node *node;
@@ -693,7 +731,8 @@ static void mcast_groups_lost(struct mcast_port *port)
 			atomic_inc(&group->refcount);
 			queue_work(mcast_wq, &group->work);
 		}
-		group->state = MCAST_ERROR;
+		if (group->state != MCAST_GROUP_ERROR)
+			group->state = state;
 		spin_unlock(&group->lock);
 	}
 	spin_unlock_irqrestore(&port->lock, flags);
@@ -703,16 +742,20 @@ static void mcast_event_handler(struct ib_event_handler *handler,
 				struct ib_event *event)
 {
 	struct mcast_device *dev;
+	int index;
 
 	dev = container_of(handler, struct mcast_device, event_handler);
+	index = event->element.port_num - dev->start_port;
 
 	switch (event->event) {
 	case IB_EVENT_PORT_ERR:
 	case IB_EVENT_LID_CHANGE:
 	case IB_EVENT_SM_CHANGE:
 	case IB_EVENT_CLIENT_REREGISTER:
-		mcast_groups_lost(&dev->port[event->element.port_num -
-					     dev->start_port]);
+		mcast_groups_event(&dev->port[index], MCAST_GROUP_ERROR);
+		break;
+	case IB_EVENT_PKEY_CHANGE:
+		mcast_groups_event(&dev->port[index], MCAST_PKEY_EVENT);
 		break;
 	default:
 		break;
diff --git a/drivers/infiniband/core/namespace.c b/drivers/infiniband/core/namespace.c
new file mode 100644
index 0000000..a33f9e9
--- /dev/null
+++ b/drivers/infiniband/core/namespace.c
@@ -0,0 +1,95 @@
+#include <linux/spinlock_types.h>
+#include <linux/percpu.h>
+#include <linux/mount.h>
+#include <linux/module.h>
+
+struct mnt_writer {
+	/*
+	 * If holding multiple instances of this lock, they
+	 * must be ordered by cpu number.
+	 */
+	spinlock_t lock;
+	struct lock_class_key lock_class; /* compiles out with !lockdep */
+	unsigned long count;
+	struct vfsmount *mnt;
+} ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+
+int __init init_mnt_writers(void)
+{
+	int cpu;
+	for_each_possible_cpu(cpu) {
+		struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
+		spin_lock_init(&writer->lock);
+		lockdep_set_class(&writer->lock, &writer->lock_class);
+		writer->count = 0;
+	}
+	return 0;
+}
+
+static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
+{
+	if (!cpu_writer->mnt)
+		return;
+	/*
+	 * This is in case anyone ever leaves an invalid,
+	 * old ->mnt and a count of 0.
+	 */
+	if (!cpu_writer->count)
+		return;
+	cpu_writer->count = 0;
+}
+
+static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
+					  struct vfsmount *mnt)
+{
+	if (cpu_writer->mnt == mnt)
+		return;
+	__clear_mnt_count(cpu_writer);
+	cpu_writer->mnt = mnt;
+}
+
+int mnt_want_write(struct vfsmount *mnt)
+{
+	int ret = 0;
+	struct mnt_writer *cpu_writer;
+
+	cpu_writer = &get_cpu_var(mnt_writers);
+	spin_lock(&cpu_writer->lock);
+	if (__mnt_is_readonly(mnt)) {
+		ret = -EROFS;
+		goto out;
+	}
+	use_cpu_writer_for_mount(cpu_writer, mnt);
+	cpu_writer->count++;
+out:
+	spin_unlock(&cpu_writer->lock);
+	put_cpu_var(mnt_writers);
+	return ret;
+}
+EXPORT_SYMBOL(mnt_want_write);
+
+void mnt_drop_write(struct vfsmount *mnt)
+{
+	struct mnt_writer *cpu_writer;
+
+	cpu_writer = &get_cpu_var(mnt_writers);
+	spin_lock(&cpu_writer->lock);
+
+	use_cpu_writer_for_mount(cpu_writer, mnt);
+	if (cpu_writer->count > 0) {
+		cpu_writer->count--;
+	}
+
+	spin_unlock(&cpu_writer->lock);
+	/*
+	 * This could be done right after the spinlock
+	 * is taken because the spinlock keeps us on
+	 * the cpu, and disables preemption.  However,
+	 * putting it here bounds the amount that
+	 * __mnt_writers can underflow.  Without it,
+	 * we could theoretically wrap __mnt_writers.
+	 */
+	put_cpu_var(mnt_writers);
+}
+EXPORT_SYMBOL(mnt_drop_write);
diff --git a/drivers/infiniband/core/packer.c b/drivers/infiniband/core/packer.c
index c972d72..019bd4b 100644
--- a/drivers/infiniband/core/packer.c
+++ b/drivers/infiniband/core/packer.c
@@ -29,8 +29,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: packer.c 1349 2004-12-16 21:09:43Z roland $
  */
 
 #include <linux/string.h>
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index a2b1c4a..59ebdd7 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -30,8 +30,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: sa_query.c 2811 2005-07-06 18:11:43Z halr $
  */
 
 #include <linux/module.h>
@@ -561,6 +559,8 @@ static void update_sm_ah(struct work_struct *work)
 	}
 
 	spin_lock_irq(&port->ah_lock);
+	if (port->sm_ah)
+		kref_put(&port->sm_ah->ref, free_sm_ah);
 	port->sm_ah = new_ah;
 	spin_unlock_irq(&port->ah_lock);
 
@@ -1188,6 +1188,7 @@ static void ib_sa_notice_resp(struct ib_sa_port *port,
 	struct ib_mad_send_buf *mad_buf;
 	struct ib_sa_mad *mad;
 	int ret;
+	unsigned long flags;
 
 	mad_buf = ib_create_send_mad(port->notice_agent, 1, 0, 0,
 				     IB_MGMT_SA_HDR, IB_MGMT_SA_DATA,
@@ -1199,11 +1200,16 @@ static void ib_sa_notice_resp(struct ib_sa_port *port,
 	memcpy(mad, mad_recv_wc->recv_buf.mad, sizeof *mad);
 	mad->mad_hdr.method = IB_MGMT_METHOD_REPORT_RESP;
 
-	spin_lock_irq(&port->ah_lock);
+	spin_lock_irqsave(&port->ah_lock, flags);
+	if (!port->sm_ah) {
+		spin_unlock_irqrestore(&port->ah_lock, flags);
+		ib_free_send_mad(mad_buf);
+		return;
+	}
 	kref_get(&port->sm_ah->ref);
 	mad_buf->context[0] = &port->sm_ah->ref;
 	mad_buf->ah = port->sm_ah->ah;
-	spin_unlock_irq(&port->ah_lock);
+	spin_unlock_irqrestore(&port->ah_lock, flags);
 
 	ret = ib_post_send_mad(mad_buf, NULL);
 	if (ret)
@@ -1392,7 +1398,8 @@ static void ib_sa_remove_one(struct ib_device *device)
 	for (i = 0; i <= sa_dev->end_port - sa_dev->start_port; ++i) {
 		ib_unregister_mad_agent(sa_dev->port[i].notice_agent);
 		ib_unregister_mad_agent(sa_dev->port[i].agent);
-		kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah);
+		if (sa_dev->port[i].sm_ah)
+			kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah);
 	}
 
 	kfree(sa_dev);
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 70b77ae..1091946 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -30,8 +30,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: sysfs.c 1349 2004-12-16 21:09:43Z roland $
  */
 
 #include "core_priv.h"
@@ -96,7 +94,7 @@ static ssize_t state_show(struct ib_port *p, struct port_attribute *unused,
 			  char *buf)
 {
 	struct ib_port_attr attr;
-	ssize_t ret;
+	ssize_t ret = -ENODEV;
 
 	static const char *state_name[] = {
 		[IB_PORT_NOP]		= "NOP",
@@ -107,26 +105,33 @@ static ssize_t state_show(struct ib_port *p, struct port_attribute *unused,
 		[IB_PORT_ACTIVE_DEFER]	= "ACTIVE_DEFER"
 	};
 
-	ret = ib_query_port(p->ibdev, p->port_num, &attr);
-	if (ret)
-		return ret;
-
-	return sprintf(buf, "%d: %s\n", attr.state,
-		       attr.state >= 0 && attr.state < ARRAY_SIZE(state_name) ?
-		       state_name[attr.state] : "UNKNOWN");
+	mutex_lock(&p->ibdev->sysfs_mutex);
+	if (ibdev_is_alive(p->ibdev)) {
+		ret = ib_query_port(p->ibdev, p->port_num, &attr);
+		if (!ret)
+			ret = sprintf(buf, "%d: %s\n", attr.state,
+				      attr.state >= 0 &&
+				      attr.state < ARRAY_SIZE(state_name) ?
+				      state_name[attr.state] : "UNKNOWN");
+	}
+	mutex_unlock(&p->ibdev->sysfs_mutex);
+	return ret;
 }
 
 static ssize_t lid_show(struct ib_port *p, struct port_attribute *unused,
 			char *buf)
 {
 	struct ib_port_attr attr;
-	ssize_t ret;
+	ssize_t ret = -ENODEV;
 
-	ret = ib_query_port(p->ibdev, p->port_num, &attr);
-	if (ret)
-		return ret;
-
-	return sprintf(buf, "0x%x\n", attr.lid);
+	mutex_lock(&p->ibdev->sysfs_mutex);
+	if (ibdev_is_alive(p->ibdev)) {
+		ret = ib_query_port(p->ibdev, p->port_num, &attr);
+		if (!ret)
+			ret = sprintf(buf, "0x%x\n", attr.lid);
+	}
+	mutex_unlock(&p->ibdev->sysfs_mutex);
+	return ret;
 }
 
 static ssize_t lid_mask_count_show(struct ib_port *p,
@@ -134,52 +139,64 @@ static ssize_t lid_mask_count_show(struct ib_port *p,
 				   char *buf)
 {
 	struct ib_port_attr attr;
-	ssize_t ret;
-
-	ret = ib_query_port(p->ibdev, p->port_num, &attr);
-	if (ret)
-		return ret;
+	ssize_t ret = -ENODEV;
 
-	return sprintf(buf, "%d\n", attr.lmc);
+	mutex_lock(&p->ibdev->sysfs_mutex);
+	if (ibdev_is_alive(p->ibdev)) {
+		ret = ib_query_port(p->ibdev, p->port_num, &attr);
+		if (!ret)
+			ret = sprintf(buf, "%d\n", attr.lmc);
+	}
+	mutex_unlock(&p->ibdev->sysfs_mutex);
+	return ret;
 }
 
 static ssize_t sm_lid_show(struct ib_port *p, struct port_attribute *unused,
 			   char *buf)
 {
 	struct ib_port_attr attr;
-	ssize_t ret;
-
-	ret = ib_query_port(p->ibdev, p->port_num, &attr);
-	if (ret)
-		return ret;
+	ssize_t ret = -ENODEV;
 
-	return sprintf(buf, "0x%x\n", attr.sm_lid);
+	mutex_lock(&p->ibdev->sysfs_mutex);
+	if (ibdev_is_alive(p->ibdev)) {
+		ret = ib_query_port(p->ibdev, p->port_num, &attr);
+		if (!ret)
+			ret = sprintf(buf, "0x%x\n", attr.sm_lid);
+	}
+	mutex_unlock(&p->ibdev->sysfs_mutex);
+	return ret;
 }
 
 static ssize_t sm_sl_show(struct ib_port *p, struct port_attribute *unused,
 			  char *buf)
 {
 	struct ib_port_attr attr;
-	ssize_t ret;
+	ssize_t ret = -ENODEV;
 
-	ret = ib_query_port(p->ibdev, p->port_num, &attr);
-	if (ret)
-		return ret;
-
-	return sprintf(buf, "%d\n", attr.sm_sl);
+	mutex_lock(&p->ibdev->sysfs_mutex);
+	if (ibdev_is_alive(p->ibdev)) {
+		ret = ib_query_port(p->ibdev, p->port_num, &attr);
+		if (!ret)
+			ret = sprintf(buf, "%d\n", attr.sm_sl);
+	}
+	mutex_unlock(&p->ibdev->sysfs_mutex);
+	return ret;
 }
 
 static ssize_t cap_mask_show(struct ib_port *p, struct port_attribute *unused,
 			     char *buf)
 {
 	struct ib_port_attr attr;
-	ssize_t ret;
-
-	ret = ib_query_port(p->ibdev, p->port_num, &attr);
-	if (ret)
-		return ret;
+	ssize_t ret = -ENODEV;
 
-	return sprintf(buf, "0x%08x\n", attr.port_cap_flags);
+	mutex_lock(&p->ibdev->sysfs_mutex);
+	if (ibdev_is_alive(p->ibdev)) {
+		ret = ib_query_port(p->ibdev, p->port_num, &attr);
+		if (!ret)
+			ret = sprintf(buf, "0x%08x\n", attr.port_cap_flags);
+	}
+	mutex_unlock(&p->ibdev->sysfs_mutex);
+	return ret;
 }
 
 static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused,
@@ -188,24 +205,33 @@ static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused,
 	struct ib_port_attr attr;
 	char *speed = "";
 	int rate;
-	ssize_t ret;
-
-	ret = ib_query_port(p->ibdev, p->port_num, &attr);
-	if (ret)
-		return ret;
-
-	switch (attr.active_speed) {
-	case 2: speed = " DDR"; break;
-	case 4: speed = " QDR"; break;
+	ssize_t ret = -ENODEV;
+
+	mutex_lock(&p->ibdev->sysfs_mutex);
+	if (ibdev_is_alive(p->ibdev)) {
+		ret = ib_query_port(p->ibdev, p->port_num, &attr);
+		if (!ret) {
+			switch (attr.active_speed) {
+			case 2: speed = " DDR"; break;
+			case 4: speed = " QDR"; break;
+			}
+
+			rate = 25 * ib_width_enum_to_int(attr.active_width) *
+				attr.active_speed;
+			if (rate < 0) {
+				ret = -EINVAL;
+				goto out;
+			}
+
+			ret = sprintf(buf, "%d%s Gb/sec (%dX%s)\n",
+				      rate / 10, rate % 10 ? ".5" : "",
+				      ib_width_enum_to_int(attr.active_width),
+				      speed);
+		}
 	}
-
-	rate = 25 * ib_width_enum_to_int(attr.active_width) * attr.active_speed;
-	if (rate < 0)
-		return -EINVAL;
-
-	return sprintf(buf, "%d%s Gb/sec (%dX%s)\n",
-		       rate / 10, rate % 10 ? ".5" : "",
-		       ib_width_enum_to_int(attr.active_width), speed);
+out:
+	mutex_unlock(&p->ibdev->sysfs_mutex);
+	return ret;
 }
 
 static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused,
@@ -213,22 +239,26 @@ static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused,
 {
 	struct ib_port_attr attr;
 
-	ssize_t ret;
-
-	ret = ib_query_port(p->ibdev, p->port_num, &attr);
-	if (ret)
-		return ret;
-
-	switch (attr.phys_state) {
-	case 1:  return sprintf(buf, "1: Sleep\n");
-	case 2:  return sprintf(buf, "2: Polling\n");
-	case 3:  return sprintf(buf, "3: Disabled\n");
-	case 4:  return sprintf(buf, "4: PortConfigurationTraining\n");
-	case 5:  return sprintf(buf, "5: LinkUp\n");
-	case 6:  return sprintf(buf, "6: LinkErrorRecovery\n");
-	case 7:  return sprintf(buf, "7: Phy Test\n");
-	default: return sprintf(buf, "%d: <unknown>\n", attr.phys_state);
+	ssize_t ret = -ENODEV;
+
+	mutex_lock(&p->ibdev->sysfs_mutex);
+	if (ibdev_is_alive(p->ibdev)) {
+		ret = ib_query_port(p->ibdev, p->port_num, &attr);
+		if (!ret) {
+			switch (attr.phys_state) {
+			case 1:  ret = sprintf(buf, "1: Sleep\n"); break;
+			case 2:  ret = sprintf(buf, "2: Polling\n"); break;
+			case 3:  ret = sprintf(buf, "3: Disabled\n"); break;
+			case 4:  ret = sprintf(buf, "4: PortConfigurationTraining\n"); break;
+			case 5:  ret = sprintf(buf, "5: LinkUp\n"); break;
+			case 6:  ret = sprintf(buf, "6: LinkErrorRecovery\n"); break;
+			case 7:  ret = sprintf(buf, "7: Phy Test\n"); break;
+			default: ret = sprintf(buf, "%d: <unknown>\n", attr.phys_state);
+			}
+		}
 	}
+	mutex_unlock(&p->ibdev->sysfs_mutex);
+	return ret;
 }
 
 static PORT_ATTR_RO(state);
@@ -258,21 +288,24 @@ static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr,
 	struct port_table_attribute *tab_attr =
 		container_of(attr, struct port_table_attribute, attr);
 	union ib_gid gid;
-	ssize_t ret;
-
-	ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid);
-	if (ret)
-		return ret;
-
-	return sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
-		       be16_to_cpu(((__be16 *) gid.raw)[0]),
-		       be16_to_cpu(((__be16 *) gid.raw)[1]),
-		       be16_to_cpu(((__be16 *) gid.raw)[2]),
-		       be16_to_cpu(((__be16 *) gid.raw)[3]),
-		       be16_to_cpu(((__be16 *) gid.raw)[4]),
-		       be16_to_cpu(((__be16 *) gid.raw)[5]),
-		       be16_to_cpu(((__be16 *) gid.raw)[6]),
-		       be16_to_cpu(((__be16 *) gid.raw)[7]));
+	ssize_t ret = -ENODEV;
+
+	mutex_lock(&p->ibdev->sysfs_mutex);
+	if (ibdev_is_alive(p->ibdev)) {
+		ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid);
+		if (!ret)
+			ret = sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+				      be16_to_cpu(((__be16 *) gid.raw)[0]),
+				      be16_to_cpu(((__be16 *) gid.raw)[1]),
+				      be16_to_cpu(((__be16 *) gid.raw)[2]),
+				      be16_to_cpu(((__be16 *) gid.raw)[3]),
+				      be16_to_cpu(((__be16 *) gid.raw)[4]),
+				      be16_to_cpu(((__be16 *) gid.raw)[5]),
+				      be16_to_cpu(((__be16 *) gid.raw)[6]),
+				      be16_to_cpu(((__be16 *) gid.raw)[7]));
+	}
+	mutex_unlock(&p->ibdev->sysfs_mutex);
+	return ret;
 }
 
 static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr,
@@ -281,13 +314,16 @@ static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr,
 	struct port_table_attribute *tab_attr =
 		container_of(attr, struct port_table_attribute, attr);
 	u16 pkey;
-	ssize_t ret;
-
-	ret = ib_query_pkey(p->ibdev, p->port_num, tab_attr->index, &pkey);
-	if (ret)
-		return ret;
+	ssize_t ret = -ENODEV;
 
-	return sprintf(buf, "0x%04x\n", pkey);
+	mutex_lock(&p->ibdev->sysfs_mutex);
+	if (ibdev_is_alive(p->ibdev)) {
+		ret = ib_query_pkey(p->ibdev, p->port_num, tab_attr->index, &pkey);
+		if (!ret)
+			ret = sprintf(buf, "0x%04x\n", pkey);
+	}
+	mutex_unlock(&p->ibdev->sysfs_mutex);
+	return ret;
 }
 
 #define PORT_PMA_ATTR(_name, _counter, _width, _offset)			\
@@ -310,6 +346,12 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr,
 	if (!p->ibdev->process_mad)
 		return sprintf(buf, "N/A (no PMA)\n");
 
+	mutex_lock(&p->ibdev->sysfs_mutex);
+	if (!ibdev_is_alive(p->ibdev)) {
+		ret = -ENODEV;
+		goto out;
+	}
+
 	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
 	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
 	if (!in_mad || !out_mad) {
@@ -356,7 +398,7 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr,
 out:
 	kfree(in_mad);
 	kfree(out_mad);
-
+	mutex_unlock(&p->ibdev->sysfs_mutex);
 	return ret;
 }
 
@@ -376,6 +418,12 @@ static PORT_PMA_ATTR(port_xmit_data		    , 12, 32, 192);
 static PORT_PMA_ATTR(port_rcv_data		    , 13, 32, 224);
 static PORT_PMA_ATTR(port_xmit_packets		    , 14, 32, 256);
 static PORT_PMA_ATTR(port_rcv_packets		    , 15, 32, 288);
+/*
+ * There is no bit allocated for port_xmit_wait in the CounterSelect field
+ * (IB spec). However, since this bit is ignored when reading
+ * (show_pma_counter), the _counter field of port_xmit_wait can be set to zero.
+ */
+static PORT_PMA_ATTR(port_xmit_wait		    ,  0, 32, 320);
 
 static struct attribute *pma_attrs[] = {
 	&port_pma_attr_symbol_error.attr.attr,
@@ -394,6 +442,7 @@ static struct attribute *pma_attrs[] = {
 	&port_pma_attr_port_rcv_data.attr.attr,
 	&port_pma_attr_port_xmit_packets.attr.attr,
 	&port_pma_attr_port_rcv_packets.attr.attr,
+	&port_pma_attr_port_xmit_wait.attr.attr,
 	NULL
 };
 
@@ -511,19 +560,10 @@ static int add_port(struct ib_device *device, int port_num)
 
 	p->ibdev      = device;
 	p->port_num   = port_num;
-	p->kobj.ktype = &port_type;
-
-	p->kobj.parent = kobject_get(&device->ports_parent);
-	if (!p->kobj.parent) {
-		ret = -EBUSY;
-		goto err;
-	}
-
-	ret = kobject_set_name(&p->kobj, "%d", port_num);
-	if (ret)
-		goto err_put;
 
-	ret = kobject_register(&p->kobj);
+	ret = kobject_init_and_add(&p->kobj, &port_type,
+				   kobject_get(device->ports_parent),
+				   "%d", port_num);
 	if (ret)
 		goto err_put;
 
@@ -552,6 +592,7 @@ static int add_port(struct ib_device *device, int port_num)
 
 	list_add_tail(&p->kobj.entry, &device->port_list);
 
+	kobject_uevent(&p->kobj, KOBJ_ADD);
 	return 0;
 
 err_free_pkey:
@@ -573,9 +614,7 @@ err_remove_pma:
 	sysfs_remove_group(&p->kobj, &pma_group);
 
 err_put:
-	kobject_put(&device->ports_parent);
-
-err:
+	kobject_put(device->ports_parent);
 	kfree(p);
 	return ret;
 }
@@ -600,20 +639,20 @@ static ssize_t show_sys_image_guid(struct class_device *cdev, char *buf)
 {
 	struct ib_device *dev = container_of(cdev, struct ib_device, class_dev);
 	struct ib_device_attr attr;
-	ssize_t ret;
-
-	if (!ibdev_is_alive(dev))
-		return -ENODEV;
-
-	ret = ib_query_device(dev, &attr);
-	if (ret)
-		return ret;
-
-	return sprintf(buf, "%04x:%04x:%04x:%04x\n",
-		       be16_to_cpu(((__be16 *) &attr.sys_image_guid)[0]),
-		       be16_to_cpu(((__be16 *) &attr.sys_image_guid)[1]),
-		       be16_to_cpu(((__be16 *) &attr.sys_image_guid)[2]),
-		       be16_to_cpu(((__be16 *) &attr.sys_image_guid)[3]));
+	ssize_t ret = -ENODEV;
+
+	mutex_lock(&dev->sysfs_mutex);
+	if (ibdev_is_alive(dev)) {
+		ret = ib_query_device(dev, &attr);
+		if (!ret)
+			ret = sprintf(buf, "%04x:%04x:%04x:%04x\n",
+				      be16_to_cpu(((__be16 *) &attr.sys_image_guid)[0]),
+				      be16_to_cpu(((__be16 *) &attr.sys_image_guid)[1]),
+				      be16_to_cpu(((__be16 *) &attr.sys_image_guid)[2]),
+				      be16_to_cpu(((__be16 *) &attr.sys_image_guid)[3]));
+	}
+	mutex_unlock(&dev->sysfs_mutex);
+	return ret;
 }
 
 static ssize_t show_node_guid(struct class_device *cdev, char *buf)
@@ -642,24 +681,26 @@ static ssize_t set_node_desc(struct class_device *cdev, const char *buf,
 {
 	struct ib_device *dev = container_of(cdev, struct ib_device, class_dev);
 	struct ib_device_modify desc = {};
-	int ret;
+	int ret = -ENODEV;
 
 	if (!dev->modify_device)
 		return -EIO;
 
 	memcpy(desc.node_desc, buf, min_t(int, count, 64));
-	ret = ib_modify_device(dev, IB_DEVICE_MODIFY_NODE_DESC, &desc);
-	if (ret)
-		return ret;
-
-	return count;
+	mutex_lock(&dev->sysfs_mutex);
+	if (ibdev_is_alive(dev)) {
+		ret = ib_modify_device(dev, IB_DEVICE_MODIFY_NODE_DESC, &desc);
+		if (!ret)
+			ret = count;
+	}
+	mutex_unlock(&dev->sysfs_mutex);
+	return ret;
 }
 
 static CLASS_DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL);
 static CLASS_DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL);
 static CLASS_DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL);
-static CLASS_DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc,
-			 set_node_desc);
+static CLASS_DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc);
 
 static struct class_device_attribute *ib_class_attributes[] = {
 	&class_device_attr_node_type,
@@ -674,6 +715,124 @@ static struct class ib_class = {
 	.uevent = ib_device_uevent,
 };
 
+/* Show a given attribute in the statistics group */
+static ssize_t show_protocol_stat(struct class_device *cdev,
+			    char *buf,
+			    unsigned offset)
+{
+	struct ib_device *dev = container_of(cdev, struct ib_device, class_dev);
+	union rdma_protocol_stats stats;
+	ssize_t ret = -ENODEV;
+
+	mutex_lock(&dev->sysfs_mutex);
+	if (ibdev_is_alive(dev)) {
+		ret = dev->get_protocol_stats(dev, &stats);
+		if (!ret)
+			ret = sprintf(buf, "%llu\n",
+				      (unsigned long long)
+				      ((u64 *) &stats)[offset]);
+	}
+	mutex_unlock(&dev->sysfs_mutex);
+	return ret;
+}
+
+/* Generate a read-only iWARP statistics attribute */
+#define IW_STATS_ENTRY(name)						\
+static ssize_t show_##name(struct class_device *cdev,			\
+			   char *buf)					\
+{									\
+	return show_protocol_stat(cdev, buf,			\
+				  offsetof(struct iw_protocol_stats, name) / \
+				  sizeof (u64));			\
+}									\
+static CLASS_DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
+IW_STATS_ENTRY(ipInReceives);
+IW_STATS_ENTRY(ipInHdrErrors);
+IW_STATS_ENTRY(ipInTooBigErrors);
+IW_STATS_ENTRY(ipInNoRoutes);
+IW_STATS_ENTRY(ipInAddrErrors);
+IW_STATS_ENTRY(ipInUnknownProtos);
+IW_STATS_ENTRY(ipInTruncatedPkts);
+IW_STATS_ENTRY(ipInDiscards);
+IW_STATS_ENTRY(ipInDelivers);
+IW_STATS_ENTRY(ipOutForwDatagrams);
+IW_STATS_ENTRY(ipOutRequests);
+IW_STATS_ENTRY(ipOutDiscards);
+IW_STATS_ENTRY(ipOutNoRoutes);
+IW_STATS_ENTRY(ipReasmTimeout);
+IW_STATS_ENTRY(ipReasmReqds);
+IW_STATS_ENTRY(ipReasmOKs);
+IW_STATS_ENTRY(ipReasmFails);
+IW_STATS_ENTRY(ipFragOKs);
+IW_STATS_ENTRY(ipFragFails);
+IW_STATS_ENTRY(ipFragCreates);
+IW_STATS_ENTRY(ipInMcastPkts);
+IW_STATS_ENTRY(ipOutMcastPkts);
+IW_STATS_ENTRY(ipInBcastPkts);
+IW_STATS_ENTRY(ipOutBcastPkts);
+IW_STATS_ENTRY(tcpRtoAlgorithm);
+IW_STATS_ENTRY(tcpRtoMin);
+IW_STATS_ENTRY(tcpRtoMax);
+IW_STATS_ENTRY(tcpMaxConn);
+IW_STATS_ENTRY(tcpActiveOpens);
+IW_STATS_ENTRY(tcpPassiveOpens);
+IW_STATS_ENTRY(tcpAttemptFails);
+IW_STATS_ENTRY(tcpEstabResets);
+IW_STATS_ENTRY(tcpCurrEstab);
+IW_STATS_ENTRY(tcpInSegs);
+IW_STATS_ENTRY(tcpOutSegs);
+IW_STATS_ENTRY(tcpRetransSegs);
+IW_STATS_ENTRY(tcpInErrs);
+IW_STATS_ENTRY(tcpOutRsts);
+
+static struct attribute *iw_proto_stats_attrs[] = {
+	&class_device_attr_ipInReceives.attr,
+	&class_device_attr_ipInHdrErrors.attr,
+	&class_device_attr_ipInTooBigErrors.attr,
+	&class_device_attr_ipInNoRoutes.attr,
+	&class_device_attr_ipInAddrErrors.attr,
+	&class_device_attr_ipInUnknownProtos.attr,
+	&class_device_attr_ipInTruncatedPkts.attr,
+	&class_device_attr_ipInDiscards.attr,
+	&class_device_attr_ipInDelivers.attr,
+	&class_device_attr_ipOutForwDatagrams.attr,
+	&class_device_attr_ipOutRequests.attr,
+	&class_device_attr_ipOutDiscards.attr,
+	&class_device_attr_ipOutNoRoutes.attr,
+	&class_device_attr_ipReasmTimeout.attr,
+	&class_device_attr_ipReasmReqds.attr,
+	&class_device_attr_ipReasmOKs.attr,
+	&class_device_attr_ipReasmFails.attr,
+	&class_device_attr_ipFragOKs.attr,
+	&class_device_attr_ipFragFails.attr,
+	&class_device_attr_ipFragCreates.attr,
+	&class_device_attr_ipInMcastPkts.attr,
+	&class_device_attr_ipOutMcastPkts.attr,
+	&class_device_attr_ipInBcastPkts.attr,
+	&class_device_attr_ipOutBcastPkts.attr,
+	&class_device_attr_tcpRtoAlgorithm.attr,
+	&class_device_attr_tcpRtoMin.attr,
+	&class_device_attr_tcpRtoMax.attr,
+	&class_device_attr_tcpMaxConn.attr,
+	&class_device_attr_tcpActiveOpens.attr,
+	&class_device_attr_tcpPassiveOpens.attr,
+	&class_device_attr_tcpAttemptFails.attr,
+	&class_device_attr_tcpEstabResets.attr,
+	&class_device_attr_tcpCurrEstab.attr,
+	&class_device_attr_tcpInSegs.attr,
+	&class_device_attr_tcpOutSegs.attr,
+	&class_device_attr_tcpRetransSegs.attr,
+	&class_device_attr_tcpInErrs.attr,
+	&class_device_attr_tcpOutRsts.attr,
+	NULL
+};
+
+static struct attribute_group iw_stats_group = {
+	.name	= "proto_stats",
+	.attrs	= iw_proto_stats_attrs,
+};
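/*
 * A minimal sketch of what one IW_STATS_ENTRY() invocation above expands
 * to, assuming (as the offsetof()/sizeof(u64) arithmetic requires) that
 * struct iw_protocol_stats is laid out as consecutive u64 counters:
 *
 *	static ssize_t show_tcpInSegs(struct class_device *cdev, char *buf)
 *	{
 *		return show_protocol_stat(cdev, buf,
 *			offsetof(struct iw_protocol_stats, tcpInSegs) /
 *			sizeof(u64));
 *	}
 *	static CLASS_DEVICE_ATTR(tcpInSegs, S_IRUGO, show_tcpInSegs, NULL);
 *
 * show_protocol_stat() then treats union rdma_protocol_stats as a flat
 * array of u64 and prints element [offset], so each file created under
 * proto_stats/ reads back exactly one counter.
 */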
+
 int ib_device_register_sysfs(struct ib_device *device)
 {
 	struct class_device *class_dev = &device->class_dev;
@@ -697,17 +856,12 @@ int ib_device_register_sysfs(struct ib_device *device)
 			goto err_unregister;
 	}
 
-	device->ports_parent.parent = kobject_get(&class_dev->kobj);
-	if (!device->ports_parent.parent) {
-		ret = -EBUSY;
-		goto err_unregister;
-	}
-	ret = kobject_set_name(&device->ports_parent, "ports");
-	if (ret)
-		goto err_put;
-	ret = kobject_register(&device->ports_parent);
-	if (ret)
+	device->ports_parent = kobject_create_and_add("ports",
+					kobject_get(&class_dev->kobj));
+	if (!device->ports_parent) {
+		ret = -ENOMEM;
 		goto err_put;
+	}
 
 	if (device->node_type == RDMA_NODE_IB_SWITCH) {
 		ret = add_port(device, 0);
@@ -721,6 +875,12 @@ int ib_device_register_sysfs(struct ib_device *device)
 		}
 	}
 
+	if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats) {
+		ret = sysfs_create_group(&class_dev->kobj, &iw_stats_group);
+		if (ret)
+			goto err_put;
+	}
+
 	return 0;
 
 err_put:
@@ -738,7 +898,7 @@ err_put:
 		}
 	}
 
-	kobject_put(&class_dev->kobj);
+	kobject_unregister(&class_dev->kobj);
 
 err_unregister:
 	class_device_unregister(class_dev);
@@ -761,7 +921,9 @@ void ib_device_unregister_sysfs(struct ib_device *device)
 		kobject_unregister(p);
 	}
 
-	kobject_unregister(&device->ports_parent);
+	kobject_put(device->ports_parent);
+	/* Workaround for memory leak */
+	kfree(device->ports_parent);
 	class_device_unregister(&device->class_dev);
 }
 
diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c
index 424983f..1c967e1 100644
--- a/drivers/infiniband/core/ucm.c
+++ b/drivers/infiniband/core/ucm.c
@@ -29,8 +29,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ucm.c 4311 2005-12-05 18:42:01Z sean.hefty $
  */
 
 #include <linux/completion.h>
@@ -106,6 +104,9 @@ enum {
 	IB_UCM_MAX_DEVICES = 32
 };
 
+/* ib_cm and ib_user_cm modules share /sys/class/infiniband_cm */
+extern struct class cm_class;
+
 #define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR)
 
 static void ib_ucm_add_one(struct ib_device *device);
@@ -1152,6 +1153,14 @@ static unsigned int ib_ucm_poll(struct file *filp,
 	return mask;
 }
 
+/*
+ * ib_ucm_open() does not need the BKL:
+ *
+ *  - no global state is referred to;
+ *  - there is no ioctl method to race against;
+ *  - no further module initialization is required for open to work
+ *    after the device is registered.
+ */
 static int ib_ucm_open(struct inode *inode, struct file *filp)
 {
 	struct ib_ucm_file *file;
@@ -1199,7 +1208,7 @@ static int ib_ucm_close(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-static void ib_ucm_release_class_dev(struct class_device *class_dev)
+static void ucm_release_class_dev(struct class_device *class_dev)
 {
 	struct ib_ucm_device *dev;
 
@@ -1217,11 +1226,6 @@ static const struct file_operations ucm_fops = {
 	.poll    = ib_ucm_poll,
 };
 
-static struct class ucm_class = {
-	.name    = "infiniband_cm",
-	.release = ib_ucm_release_class_dev
-};
-
 static ssize_t show_ibdev(struct class_device *class_dev, char *buf)
 {
 	struct ib_ucm_device *dev;
@@ -1257,9 +1261,10 @@ static void ib_ucm_add_one(struct ib_device *device)
 	if (cdev_add(&ucm_dev->dev, IB_UCM_BASE_DEV + ucm_dev->devnum, 1))
 		goto err;
 
-	ucm_dev->class_dev.class = &ucm_class;
+	ucm_dev->class_dev.class = &cm_class;
 	ucm_dev->class_dev.dev = device->dma_device;
 	ucm_dev->class_dev.devt = ucm_dev->dev.dev;
+	ucm_dev->class_dev.release = ucm_release_class_dev;
 	snprintf(ucm_dev->class_dev.class_id, BUS_ID_SIZE, "ucm%d",
 		 ucm_dev->devnum);
 	if (class_device_register(&ucm_dev->class_dev))
@@ -1306,40 +1311,34 @@ static int __init ib_ucm_init(void)
 				     "infiniband_cm");
 	if (ret) {
 		printk(KERN_ERR "ucm: couldn't register device number\n");
-		goto err;
+		goto error1;
 	}
 
-	ret = class_register(&ucm_class);
-	if (ret) {
-		printk(KERN_ERR "ucm: couldn't create class infiniband_cm\n");
-		goto err_chrdev;
-	}
-
-	ret = class_create_file(&ucm_class, &class_attr_abi_version);
+	ret = class_create_file(&cm_class, &class_attr_abi_version);
 	if (ret) {
 		printk(KERN_ERR "ucm: couldn't create abi_version attribute\n");
-		goto err_class;
+		goto error2;
 	}
 
 	ret = ib_register_client(&ucm_client);
 	if (ret) {
 		printk(KERN_ERR "ucm: couldn't register client\n");
-		goto err_class;
+		goto error3;
 	}
 	return 0;
 
-err_class:
-	class_unregister(&ucm_class);
-err_chrdev:
+error3:
+	class_remove_file(&cm_class, &class_attr_abi_version);
+error2:
 	unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES);
-err:
+error1:
 	return ret;
 }
 
 static void __exit ib_ucm_cleanup(void)
 {
 	ib_unregister_client(&ucm_client);
-	class_unregister(&ucm_class);
+	class_remove_file(&cm_class, &class_attr_abi_version);
 	unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES);
 	idr_destroy(&ctx_id_table);
 }
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index ef52dbb..31f3671 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -31,6 +31,7 @@
  */
 
 #include <linux/completion.h>
+#include <linux/file.h>
 #include <linux/mutex.h>
 #include <linux/poll.h>
 #include <linux/idr.h>
@@ -80,9 +81,7 @@ struct ucma_multicast {
 
 	u64			uid;
 	struct list_head	list;
-	struct sockaddr		addr;
-	u8			pad[sizeof(struct sockaddr_in6) -
-				    sizeof(struct sockaddr)];
+	struct sockaddr_storage	addr;
 };
 
 struct ucma_event {
@@ -602,18 +601,18 @@ static ssize_t ucma_query_route(struct ucma_file *file,
 		return PTR_ERR(ctx);
 
 	memset(&resp, 0, sizeof resp);
-	addr = &ctx->cm_id->route.addr.src_addr;
+	addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr;
 	memcpy(&resp.src_addr, addr, addr->sa_family == AF_INET ?
 				     sizeof(struct sockaddr_in) :
 				     sizeof(struct sockaddr_in6));
-	addr = &ctx->cm_id->route.addr.dst_addr;
+	addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr;
 	memcpy(&resp.dst_addr, addr, addr->sa_family == AF_INET ?
 				     sizeof(struct sockaddr_in) :
 				     sizeof(struct sockaddr_in6));
 	if (!ctx->cm_id->device)
 		goto out;
 
-	resp.node_guid = ctx->cm_id->device->node_guid;
+	resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid;
 	resp.port_num = ctx->cm_id->port_num;
 	switch (rdma_node_get_transport(ctx->cm_id->device->node_type)) {
 	case RDMA_TRANSPORT_IB:
@@ -905,14 +904,14 @@ static ssize_t ucma_join_multicast(struct ucma_file *file,
 
 	mutex_lock(&file->mut);
 	mc = ucma_alloc_multicast(ctx);
-	if (IS_ERR(mc)) {
-		ret = PTR_ERR(mc);
+	if (mc == NULL) {
+		ret = -ENOMEM;
 		goto err1;
 	}
 
 	mc->uid = cmd.uid;
 	memcpy(&mc->addr, &cmd.addr, sizeof cmd.addr);
-	ret = rdma_join_multicast(ctx->cm_id, &mc->addr, mc);
+	ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr, mc);
 	if (ret)
 		goto err2;
 
@@ -928,7 +927,7 @@ static ssize_t ucma_join_multicast(struct ucma_file *file,
 	return 0;
 
 err3:
-	rdma_leave_multicast(ctx->cm_id, &mc->addr);
+	rdma_leave_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr);
 	ucma_cleanup_mc_events(mc);
 err2:
 	mutex_lock(&mut);
@@ -974,7 +973,7 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file,
 		goto out;
 	}
 
-	rdma_leave_multicast(mc->ctx->cm_id, &mc->addr);
+	rdma_leave_multicast(mc->ctx->cm_id, (struct sockaddr *) &mc->addr);
 	mutex_lock(&mc->ctx->file->mut);
 	ucma_cleanup_mc_events(mc);
 	list_del(&mc->list);
@@ -991,6 +990,96 @@ out:
 	return ret;
 }
 
+static void ucma_lock_files(struct ucma_file *file1, struct ucma_file *file2)
+{
+	/* Acquire mutexes based on pointer comparison to prevent deadlock. */
+	if (file1 < file2) {
+		mutex_lock(&file1->mut);
+		mutex_lock(&file2->mut);
+	} else {
+		mutex_lock(&file2->mut);
+		mutex_lock(&file1->mut);
+	}
+}
+
+static void ucma_unlock_files(struct ucma_file *file1, struct ucma_file *file2)
+{
+	if (file1 < file2) {
+		mutex_unlock(&file2->mut);
+		mutex_unlock(&file1->mut);
+	} else {
+		mutex_unlock(&file1->mut);
+		mutex_unlock(&file2->mut);
+	}
+}
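/*
 * Why the pointer comparison above matters (illustrative note): if two
 * threads migrated ids between the same pair of files in opposite
 * directions and each locked "its own" file first, thread A could hold
 * file1->mut while waiting for file2->mut and thread B the reverse
 * (a classic ABBA deadlock).  Always taking the lower-addressed mutex
 * first imposes one global order, so one thread wins and the other waits.
 */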
+
+static void ucma_move_events(struct ucma_context *ctx, struct ucma_file *file)
+{
+	struct ucma_event *uevent, *tmp;
+
+	list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list)
+		if (uevent->ctx == ctx)
+			list_move_tail(&uevent->list, &file->event_list);
+}
+
+static ssize_t ucma_migrate_id(struct ucma_file *new_file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct rdma_ucm_migrate_id cmd;
+	struct rdma_ucm_migrate_resp resp;
+	struct ucma_context *ctx;
+	struct file *filp;
+	struct ucma_file *cur_file;
+	int ret = 0;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	/* Get current fd to protect against it being closed */
+	filp = fget(cmd.fd);
+	if (!filp)
+		return -ENOENT;
+
+	/* Validate current fd and prevent destruction of id. */
+	ctx = ucma_get_ctx(filp->private_data, cmd.id);
+	if (IS_ERR(ctx)) {
+		ret = PTR_ERR(ctx);
+		goto file_put;
+	}
+
+	cur_file = ctx->file;
+	if (cur_file == new_file) {
+		resp.events_reported = ctx->events_reported;
+		goto response;
+	}
+
+	/*
+	 * Migrate events between fds, maintaining order, and avoiding new
+	 * events being added before existing events.
+	 */
+	ucma_lock_files(cur_file, new_file);
+	mutex_lock(&mut);
+
+	list_move_tail(&ctx->list, &new_file->ctx_list);
+	ucma_move_events(ctx, new_file);
+	ctx->file = new_file;
+	resp.events_reported = ctx->events_reported;
+
+	mutex_unlock(&mut);
+	ucma_unlock_files(cur_file, new_file);
+
+response:
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		ret = -EFAULT;
+
+	ucma_put_ctx(ctx);
+file_put:
+	fput(filp);
+	return ret;
+}
+
 static ssize_t (*ucma_cmd_table[])(struct ucma_file *file,
 				   const char __user *inbuf,
 				   int in_len, int out_len) = {
@@ -1012,6 +1101,7 @@ static ssize_t (*ucma_cmd_table[])(struct ucma_file *file,
 	[RDMA_USER_CM_CMD_NOTIFY]	= ucma_notify,
 	[RDMA_USER_CM_CMD_JOIN_MCAST]	= ucma_join_multicast,
 	[RDMA_USER_CM_CMD_LEAVE_MCAST]	= ucma_leave_multicast,
+	[RDMA_USER_CM_CMD_MIGRATE_ID]	= ucma_migrate_id
 };
 
 static ssize_t ucma_write(struct file *filp, const char __user *buf,
@@ -1056,6 +1146,14 @@ static unsigned int ucma_poll(struct file *filp, struct poll_table_struct *wait)
 	return mask;
 }
 
+/*
+ * ucma_open() does not need the BKL:
+ *
+ *  - no global state is referred to;
+ *  - there is no ioctl method to race against;
+ *  - no further module initialization is required for open to work
+ *    after the device is registered.
+ */
 static int ucma_open(struct inode *inode, struct file *filp)
 {
 	struct ucma_file *file;
diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c
index 997c07d..9d74fb8 100644
--- a/drivers/infiniband/core/ud_header.c
+++ b/drivers/infiniband/core/ud_header.c
@@ -29,8 +29,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ud_header.c 1349 2004-12-16 21:09:43Z roland $
  */
 
 #include <linux/errno.h>
@@ -243,6 +241,36 @@ void ib_ud_header_init(int     		    payload_bytes,
 EXPORT_SYMBOL(ib_ud_header_init);
 
 /**
+ * ib_lrh_header_pack - Pack LRH header struct into wire format
+ * @lrh:unpacked LRH header struct
+ * @buf:Buffer to pack into
+ *
+ * ib_lrh_header_pack() packs the LRH header structure @lrh into
+ * wire format in the buffer @buf.
+ */
+int ib_lrh_header_pack(struct ib_unpacked_lrh *lrh, void *buf)
+{
+	ib_pack(lrh_table, ARRAY_SIZE(lrh_table), lrh, buf);
+	return 0;
+}
+EXPORT_SYMBOL(ib_lrh_header_pack);
+
+/**
+ * ib_lrh_header_unpack - Unpack LRH structure from wire format
+ * @buf:Buffer to unpack from
+ * @lrh:unpacked LRH header struct
+ *
+ * ib_lrh_header_unpack() unpacks the LRH header structure from
+ * wire format (in buf) into @lrh.
+ */
+int ib_lrh_header_unpack(void *buf, struct ib_unpacked_lrh *lrh)
+{
+	ib_unpack(lrh_table, ARRAY_SIZE(lrh_table), buf, lrh);
+	return 0;
+}
+EXPORT_SYMBOL(ib_lrh_header_unpack);
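/*
 * A minimal round-trip sketch for the two helpers above (illustrative
 * only; field values are arbitrary).  The LRH is 8 bytes on the wire, so
 * packing into a small buffer and unpacking it again should reproduce the
 * original fields:
 *
 *	struct ib_unpacked_lrh in = { .virtual_lane = 0 }, out;
 *	u8 wire[8];
 *
 *	ib_lrh_header_pack(&in, wire);
 *	ib_lrh_header_unpack(wire, &out);
 */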
+
+/**
  * ib_ud_header_pack - Pack UD header struct into wire format
  * @header:UD header struct
  * @buf:Buffer to pack into
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index dd4b7ad..bc0245c 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -30,8 +30,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: uverbs_mem.c 2743 2005-06-28 22:27:59Z roland $
  */
 
 #include <linux/mm.h>
@@ -42,10 +40,6 @@
 
 #include "uverbs.h"
 
-static int allow_weak_ordering;
-module_param(allow_weak_ordering, bool, 0444);
-MODULE_PARM_DESC(allow_weak_ordering,  "Allow weak ordering for data registered memory");
-
 #define IB_UMEM_MAX_PAGE_CHUNK						\
 	((PAGE_SIZE - offsetof(struct ib_umem_chunk, page_list)) /	\
 	 ((void *) &((struct ib_umem_chunk *) 0)->page_list[1] -	\
@@ -101,21 +95,14 @@ static void dma_unmap_sg_ia64(struct ib_device *ibdev,
 
 #endif
 
-static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty, int dmasync)
+static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
 {
 	struct ib_umem_chunk *chunk, *tmp;
 	int i;
 
-	DEFINE_DMA_ATTRS(attrs);
-
-	if (dmasync)
-		dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs);
-	else if (allow_weak_ordering)
-		dma_set_attr(DMA_ATTR_WEAK_ORDERING, &attrs);
-
 	list_for_each_entry_safe(chunk, tmp, &umem->chunk_list, list) {
-		ib_dma_unmap_sg_attrs(dev, chunk->page_list,
-				      chunk->nents, DMA_BIDIRECTIONAL, &attrs);
+		ib_dma_unmap_sg(dev, chunk->page_list,
+				chunk->nents, DMA_BIDIRECTIONAL);
 		for (i = 0; i < chunk->nents; ++i) {
 			struct page *page = sg_page(&chunk->page_list[i]);
 
@@ -136,7 +123,7 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
  * @access: IB_ACCESS_xxx flags for memory being pinned
  * @dmasync: flush in-flight DMA when the memory region is written
  */
-static struct ib_umem *__ib_umem_get(struct ib_ucontext *context, unsigned long addr,
+struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 			    size_t size, int access, int dmasync)
 {
 	struct ib_umem *umem;
@@ -154,9 +141,6 @@ static struct ib_umem *__ib_umem_get(struct ib_ucontext *context, unsigned long
 
 	if (dmasync)
 		dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs);
-	else if (allow_weak_ordering)
-		dma_set_attr(DMA_ATTR_WEAK_ORDERING, &attrs);
-
 
 	if (!can_do_mlock())
 		return ERR_PTR(-EPERM);
@@ -214,7 +198,7 @@ static struct ib_umem *__ib_umem_get(struct ib_ucontext *context, unsigned long
 	ret = 0;
 	while (npages) {
 		ret = get_user_pages(current, current->mm, cur_base,
-				     min_t(int, npages,
+				     min_t(unsigned long, npages,
 					   PAGE_SIZE / sizeof (struct page *)),
 				     1, !umem->writable, page_list, vma_list);
 
@@ -268,7 +252,7 @@ static struct ib_umem *__ib_umem_get(struct ib_ucontext *context, unsigned long
 
 out:
 	if (ret < 0) {
-		__ib_umem_release(context->device, umem, 0, dmasync);
+		__ib_umem_release(context->device, umem, 0);
 		kfree(umem);
 	} else
 		current->mm->locked_vm = locked;
@@ -280,21 +264,8 @@ out:
 
 	return ret < 0 ? ERR_PTR(ret) : umem;
 }
-
-struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
-			    size_t size, int access)
-{
-	return __ib_umem_get(context, addr, size, access, 0);
-}
 EXPORT_SYMBOL(ib_umem_get);
 
-struct ib_umem *ib_umem_get_dmasync(struct ib_ucontext *context, unsigned long addr,
-			    size_t size, int access)
-{
-	return __ib_umem_get(context, addr, size, access, 1);
-}
-EXPORT_SYMBOL(ib_umem_get_dmasync);
-
 static void ib_umem_account(struct work_struct *work)
 {
 	struct ib_umem *umem = container_of(work, struct ib_umem, work);
@@ -310,13 +281,13 @@ static void ib_umem_account(struct work_struct *work)
  * ib_umem_release - release memory pinned with ib_umem_get
  * @umem: umem struct to release
  */
-static void ib_umem_release_attr(struct ib_umem *umem, int dmasync)
+void ib_umem_release(struct ib_umem *umem)
 {
 	struct ib_ucontext *context = umem->context;
 	struct mm_struct *mm;
 	unsigned long diff;
 
-	__ib_umem_release(umem->context->device, umem, 1, dmasync);
+	__ib_umem_release(umem->context->device, umem, 1);
 
 	mm = get_task_mm(current);
 	if (!mm) {
@@ -351,19 +322,8 @@ static void ib_umem_release_attr(struct ib_umem *umem, int dmasync)
 	mmput(mm);
 	kfree(umem);
 }
-
-void ib_umem_release(struct ib_umem *umem)
-{
-	return ib_umem_release_attr(umem, 0);
-}
 EXPORT_SYMBOL(ib_umem_release);
 
-void ib_umem_release_dmasync(struct ib_umem *umem)
-{
-	return ib_umem_release_attr(umem, 1);
-}
-EXPORT_SYMBOL(ib_umem_release_dmasync);
-
 int ib_umem_page_count(struct ib_umem *umem)
 {
 	struct ib_umem_chunk *chunk;
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index b53eac4..895b231 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -2,6 +2,7 @@
  * Copyright (c) 2004 Topspin Communications.  All rights reserved.
  * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
  * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2008 Cisco. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -30,8 +31,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: user_mad.c 5596 2006-03-03 01:00:07Z sean.hefty $
  */
 
 #include <linux/module.h>
@@ -42,12 +41,12 @@
 #include <linux/cdev.h>
 #include <linux/dma-mapping.h>
 #include <linux/poll.h>
-#include <linux/rwsem.h>
+#include <linux/mutex.h>
 #include <linux/kref.h>
 #include <linux/compat.h>
+#include <linux/semaphore.h>
 
 #include <asm/uaccess.h>
-#include <asm/semaphore.h>
 
 #include <rdma/ib_mad.h>
 #include <rdma/ib_user_mad.h>
@@ -94,7 +93,7 @@ struct ib_umad_port {
 	struct class_device   *sm_class_dev;
 	struct semaphore       sm_sem;
 
-	struct rw_semaphore    mutex;
+	struct mutex	       file_mutex;
 	struct list_head       file_list;
 
 	struct ib_device      *ib_dev;
@@ -110,11 +109,11 @@ struct ib_umad_device {
 };
 
 struct ib_umad_file {
+	struct mutex		mutex;
 	struct ib_umad_port    *port;
 	struct list_head	recv_list;
 	struct list_head	send_list;
 	struct list_head	port_list;
-	spinlock_t		recv_lock;
 	spinlock_t		send_lock;
 	wait_queue_head_t	recv_wait;
 	struct ib_mad_agent    *agent[IB_UMAD_MAX_AGENTS];
@@ -156,7 +155,7 @@ static int hdr_size(struct ib_umad_file *file)
 		sizeof (struct ib_user_mad_hdr_old);
 }
 
-/* caller must hold port->mutex at least for reading */
+/* caller must hold file->mutex */
 static struct ib_mad_agent *__get_agent(struct ib_umad_file *file, int id)
 {
 	return file->agents_dead ? NULL : file->agent[id];
@@ -168,32 +167,30 @@ static int queue_packet(struct ib_umad_file *file,
 {
 	int ret = 1;
 
-	down_read(&file->port->mutex);
+	mutex_lock(&file->mutex);
 
 	for (packet->mad.hdr.id = 0;
 	     packet->mad.hdr.id < IB_UMAD_MAX_AGENTS;
 	     packet->mad.hdr.id++)
 		if (agent == __get_agent(file, packet->mad.hdr.id)) {
-			spin_lock_irq(&file->recv_lock);
 			list_add_tail(&packet->list, &file->recv_list);
-			spin_unlock_irq(&file->recv_lock);
 			wake_up_interruptible(&file->recv_wait);
 			ret = 0;
 			break;
 		}
 
-	up_read(&file->port->mutex);
+	mutex_unlock(&file->mutex);
 
 	return ret;
 }
 
 static void dequeue_send(struct ib_umad_file *file,
 			 struct ib_umad_packet *packet)
- {
+{
 	spin_lock_irq(&file->send_lock);
 	list_del(&packet->list);
 	spin_unlock_irq(&file->send_lock);
- }
+}
 
 static void send_handler(struct ib_mad_agent *agent,
 			 struct ib_mad_send_wc *send_wc)
@@ -341,10 +338,10 @@ static ssize_t ib_umad_read(struct file *filp, char __user *buf,
 	if (count < hdr_size(file))
 		return -EINVAL;
 
-	spin_lock_irq(&file->recv_lock);
+	mutex_lock(&file->mutex);
 
 	while (list_empty(&file->recv_list)) {
-		spin_unlock_irq(&file->recv_lock);
+		mutex_unlock(&file->mutex);
 
 		if (filp->f_flags & O_NONBLOCK)
 			return -EAGAIN;
@@ -353,13 +350,13 @@ static ssize_t ib_umad_read(struct file *filp, char __user *buf,
 					     !list_empty(&file->recv_list)))
 			return -ERESTARTSYS;
 
-		spin_lock_irq(&file->recv_lock);
+		mutex_lock(&file->mutex);
 	}
 
 	packet = list_entry(file->recv_list.next, struct ib_umad_packet, list);
 	list_del(&packet->list);
 
-	spin_unlock_irq(&file->recv_lock);
+	mutex_unlock(&file->mutex);
 
 	if (packet->recv_wc)
 		ret = copy_recv_mad(file, buf, packet, count);
@@ -368,9 +365,9 @@ static ssize_t ib_umad_read(struct file *filp, char __user *buf,
 
 	if (ret < 0) {
 		/* Requeue packet */
-		spin_lock_irq(&file->recv_lock);
+		mutex_lock(&file->mutex);
 		list_add(&packet->list, &file->recv_list);
-		spin_unlock_irq(&file->recv_lock);
+		mutex_unlock(&file->mutex);
 	} else {
 		if (packet->recv_wc)
 			ib_free_recv_mad(packet->recv_wc);
@@ -481,7 +478,7 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
 		goto err;
 	}
 
-	down_read(&file->port->mutex);
+	mutex_lock(&file->mutex);
 
 	agent = __get_agent(file, packet->mad.hdr.id);
 	if (!agent) {
@@ -577,7 +574,7 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
 	if (ret)
 		goto err_send;
 
-	up_read(&file->port->mutex);
+	mutex_unlock(&file->mutex);
 	return count;
 
 err_send:
@@ -587,7 +584,7 @@ err_msg:
 err_ah:
 	ib_destroy_ah(ah);
 err_up:
-	up_read(&file->port->mutex);
+	mutex_unlock(&file->mutex);
 err:
 	kfree(packet);
 	return ret;
@@ -613,11 +610,12 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg,
 {
 	struct ib_user_mad_reg_req ureq;
 	struct ib_mad_reg_req req;
-	struct ib_mad_agent *agent;
+	struct ib_mad_agent *agent = NULL;
 	int agent_id;
 	int ret;
 
-	down_write(&file->port->mutex);
+	mutex_lock(&file->port->file_mutex);
+	mutex_lock(&file->mutex);
 
 	if (!file->port->ib_dev) {
 		ret = -EPIPE;
@@ -666,13 +664,13 @@ found:
 				      send_handler, recv_handler, file);
 	if (IS_ERR(agent)) {
 		ret = PTR_ERR(agent);
+		agent = NULL;
 		goto out;
 	}
 
 	if (put_user(agent_id,
 		     (u32 __user *) (arg + offsetof(struct ib_user_mad_reg_req, id)))) {
 		ret = -EFAULT;
-		ib_unregister_mad_agent(agent);
 		goto out;
 	}
 
@@ -690,7 +688,13 @@ found:
 	ret = 0;
 
 out:
-	up_write(&file->port->mutex);
+	mutex_unlock(&file->mutex);
+
+	if (ret && agent)
+		ib_unregister_mad_agent(agent);
+
+	mutex_unlock(&file->port->file_mutex);
+
 	return ret;
 }
 
@@ -703,7 +707,8 @@ static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg)
 	if (get_user(id, arg))
 		return -EFAULT;
 
-	down_write(&file->port->mutex);
+	mutex_lock(&file->port->file_mutex);
+	mutex_lock(&file->mutex);
 
 	if (id < 0 || id >= IB_UMAD_MAX_AGENTS || !__get_agent(file, id)) {
 		ret = -EINVAL;
@@ -714,11 +719,13 @@ static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg)
 	file->agent[id] = NULL;
 
 out:
-	up_write(&file->port->mutex);
+	mutex_unlock(&file->mutex);
 
 	if (agent)
 		ib_unregister_mad_agent(agent);
 
+	mutex_unlock(&file->port->file_mutex);
+
 	return ret;
 }
 
@@ -726,12 +733,12 @@ static long ib_umad_enable_pkey(struct ib_umad_file *file)
 {
 	int ret = 0;
 
-	down_write(&file->port->mutex);
+	mutex_lock(&file->mutex);
 	if (file->already_used)
 		ret = -EINVAL;
 	else
 		file->use_pkey_index = 1;
-	up_write(&file->port->mutex);
+	mutex_unlock(&file->mutex);
 
 	return ret;
 }
@@ -768,6 +775,19 @@ static long ib_umad_compat_ioctl(struct file *filp, unsigned int cmd,
 }
 #endif
 
+/*
+ * ib_umad_open() does not need the BKL:
+ *
+ *  - umad_port[] accesses are protected by port_lock, the
+ *    ib_umad_port structures are properly reference counted, and
+ *    everything else is purely local to the file being created, so
+ *    races against other open calls are not a problem;
+ *  - the ioctl method does not affect any global state outside of the
+ *    file structure being operated on;
+ *  - the port is added to umad_port[] as the last part of module
+ *    initialization, so the open method will either immediately return
+ *    -ENXIO, or all required initialization will be done.
+ */
 static int ib_umad_open(struct inode *inode, struct file *filp)
 {
 	struct ib_umad_port *port;
@@ -783,7 +803,7 @@ static int ib_umad_open(struct inode *inode, struct file *filp)
 	if (!port)
 		return -ENXIO;
 
-	down_write(&port->mutex);
+	mutex_lock(&port->file_mutex);
 
 	if (!port->ib_dev) {
 		ret = -ENXIO;
@@ -797,7 +817,7 @@ static int ib_umad_open(struct inode *inode, struct file *filp)
 		goto out;
 	}
 
-	spin_lock_init(&file->recv_lock);
+	mutex_init(&file->mutex);
 	spin_lock_init(&file->send_lock);
 	INIT_LIST_HEAD(&file->recv_list);
 	INIT_LIST_HEAD(&file->send_list);
@@ -809,7 +829,7 @@ static int ib_umad_open(struct inode *inode, struct file *filp)
 	list_add_tail(&file->port_list, &port->file_list);
 
 out:
-	up_write(&port->mutex);
+	mutex_unlock(&port->file_mutex);
 	return ret;
 }
 
@@ -821,7 +841,8 @@ static int ib_umad_close(struct inode *inode, struct file *filp)
 	int already_dead;
 	int i;
 
-	down_write(&file->port->mutex);
+	mutex_lock(&file->port->file_mutex);
+	mutex_lock(&file->mutex);
 
 	already_dead = file->agents_dead;
 	file->agents_dead = 1;
@@ -834,14 +855,14 @@ static int ib_umad_close(struct inode *inode, struct file *filp)
 
 	list_del(&file->port_list);
 
-	downgrade_write(&file->port->mutex);
+	mutex_unlock(&file->mutex);
 
 	if (!already_dead)
 		for (i = 0; i < IB_UMAD_MAX_AGENTS; ++i)
 			if (file->agent[i])
 				ib_unregister_mad_agent(file->agent[i]);
 
-	up_read(&file->port->mutex);
+	mutex_unlock(&file->port->file_mutex);
 
 	kfree(file);
 	kref_put(&dev->ref, ib_umad_release_dev);
@@ -914,10 +935,10 @@ static int ib_umad_sm_close(struct inode *inode, struct file *filp)
 	};
 	int ret = 0;
 
-	down_write(&port->mutex);
+	mutex_lock(&port->file_mutex);
 	if (port->ib_dev)
 		ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props);
-	up_write(&port->mutex);
+	mutex_unlock(&port->file_mutex);
 
 	up(&port->sm_sem);
 
@@ -981,7 +1002,7 @@ static int ib_umad_init_port(struct ib_device *device, int port_num,
 	port->ib_dev   = device;
 	port->port_num = port_num;
 	init_MUTEX(&port->sm_sem);
-	init_rwsem(&port->mutex);
+	mutex_init(&port->file_mutex);
 	INIT_LIST_HEAD(&port->file_list);
 
 	port->dev = cdev_alloc();
@@ -1052,6 +1073,7 @@ err_cdev:
 static void ib_umad_kill_port(struct ib_umad_port *port)
 {
 	struct ib_umad_file *file;
+	int already_dead;
 	int id;
 
 	class_set_devdata(port->class_dev,    NULL);
@@ -1067,42 +1089,22 @@ static void ib_umad_kill_port(struct ib_umad_port *port)
 	umad_port[port->dev_num] = NULL;
 	spin_unlock(&port_lock);
 
-	down_write(&port->mutex);
+	mutex_lock(&port->file_mutex);
 
 	port->ib_dev = NULL;
 
-	/*
-	 * Now go through the list of files attached to this port and
-	 * unregister all of their MAD agents.  We need to hold
-	 * port->mutex while doing this to avoid racing with
-	 * ib_umad_close(), but we can't hold the mutex for writing
-	 * while calling ib_unregister_mad_agent(), since that might
-	 * deadlock by calling back into queue_packet().  So we
-	 * downgrade our lock to a read lock, and then drop and
-	 * reacquire the write lock for the next iteration.
-	 *
-	 * We do list_del_init() on the file's list_head so that the
-	 * list_del in ib_umad_close() is still OK, even after the
-	 * file is removed from the list.
-	 */
-	while (!list_empty(&port->file_list)) {
-		file = list_entry(port->file_list.next, struct ib_umad_file,
-				  port_list);
-
+	list_for_each_entry(file, &port->file_list, port_list) {
+		mutex_lock(&file->mutex);
+		already_dead = file->agents_dead;
 		file->agents_dead = 1;
-		list_del_init(&file->port_list);
-
-		downgrade_write(&port->mutex);
+		mutex_unlock(&file->mutex);
 
 		for (id = 0; id < IB_UMAD_MAX_AGENTS; ++id)
 			if (file->agent[id])
 				ib_unregister_mad_agent(file->agent[id]);
-
-		up_read(&port->mutex);
-		down_write(&port->mutex);
 	}
 
-	up_write(&port->mutex);
+	mutex_unlock(&port->file_mutex);
 
 	clear_bit(port->dev_num, dev_map);
 }
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index c75eb6c..d62fcad 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -32,8 +32,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: uverbs.h 2559 2005-06-06 19:43:16Z roland $
  */
 
 #ifndef UVERBS_H
@@ -81,13 +79,13 @@ struct ib_uverbs_device {
 
 struct ib_uverbs_event_file {
 	struct kref				ref;
-	struct file			       *file;
 	struct ib_uverbs_file		       *uverbs_file;
 	spinlock_t				lock;
-	int					is_async;
 	wait_queue_head_t			poll_wait;
 	struct fasync_struct		       *async_queue;
 	struct list_head			event_list;
+	int					is_async;
+	int					is_closed;
 };
 
 struct ib_uverbs_file {
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 9e98cec..56feab6 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -31,8 +31,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: uverbs_cmd.c 2708 2005-06-24 17:27:21Z roland $
  */
 
 #include <linux/file.h>
@@ -919,7 +917,7 @@ ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file,
 		resp->wc[i].opcode 	   = wc[i].opcode;
 		resp->wc[i].vendor_err 	   = wc[i].vendor_err;
 		resp->wc[i].byte_len 	   = wc[i].byte_len;
-		resp->wc[i].imm_data 	   = (__u32 __force) wc[i].imm_data;
+		resp->wc[i].ex.imm_data    = (__u32 __force) wc[i].ex.imm_data;
 		resp->wc[i].qp_num 	   = wc[i].qp->qp_num;
 		resp->wc[i].src_qp 	   = wc[i].src_qp;
 		resp->wc[i].wc_flags 	   = wc[i].wc_flags;
@@ -1463,7 +1461,6 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
 		next->num_sge    = user_wr->num_sge;
 		next->opcode     = user_wr->opcode;
 		next->send_flags = user_wr->send_flags;
-		next->imm_data   = (__be32 __force) user_wr->imm_data;
 
 		if (is_ud) {
 			next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah,
@@ -1476,14 +1473,24 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
 			next->wr.ud.remote_qkey = user_wr->wr.ud.remote_qkey;
 		} else {
 			switch (next->opcode) {
-			case IB_WR_RDMA_WRITE:
 			case IB_WR_RDMA_WRITE_WITH_IMM:
+				next->ex.imm_data =
+					(__be32 __force) user_wr->ex.imm_data;
+			case IB_WR_RDMA_WRITE:
 			case IB_WR_RDMA_READ:
 				next->wr.rdma.remote_addr =
 					user_wr->wr.rdma.remote_addr;
 				next->wr.rdma.rkey        =
 					user_wr->wr.rdma.rkey;
 				break;
+			case IB_WR_SEND_WITH_IMM:
+				next->ex.imm_data =
+					(__be32 __force) user_wr->ex.imm_data;
+				break;
+			case IB_WR_SEND_WITH_INV:
+				next->ex.invalidate_rkey =
+					user_wr->ex.invalidate_rkey;
+				break;
 			case IB_WR_ATOMIC_CMP_AND_SWP:
 			case IB_WR_ATOMIC_FETCH_AND_ADD:
 				next->wr.atomic.remote_addr =
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index cfa567b..68b75e7 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -32,8 +32,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: uverbs_main.c 2733 2005-06-28 19:14:34Z roland $
  */
 
 #include <linux/module.h>
@@ -352,7 +350,7 @@ static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
 	struct ib_uverbs_event *entry, *tmp;
 
 	spin_lock_irq(&file->lock);
-	file->file = NULL;
+	file->is_closed = 1;
 	list_for_each_entry_safe(entry, tmp, &file->event_list, list) {
 		if (entry->counter)
 			list_del(&entry->obj_list);
@@ -390,7 +388,7 @@ void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context)
 		return;
 
 	spin_lock_irqsave(&file->lock, flags);
-	if (!file->file) {
+	if (file->is_closed) {
 		spin_unlock_irqrestore(&file->lock, flags);
 		return;
 	}
@@ -423,7 +421,7 @@ static void ib_uverbs_async_handler(struct ib_uverbs_file *file,
 	unsigned long flags;
 
 	spin_lock_irqsave(&file->async_file->lock, flags);
-	if (!file->async_file->file) {
+	if (file->async_file->is_closed) {
 		spin_unlock_irqrestore(&file->async_file->lock, flags);
 		return;
 	}
@@ -509,6 +507,7 @@ struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
 	ev_file->uverbs_file = uverbs_file;
 	ev_file->async_queue = NULL;
 	ev_file->is_async    = is_async;
+	ev_file->is_closed   = 0;
 
 	*fd = get_unused_fd();
 	if (*fd < 0) {
@@ -522,8 +521,6 @@ struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
 		goto err_fd;
 	}
 
-	ev_file->file      = filp;
-
 	/*
 	 * fops_get() can't fail here, because we're coming from a
 	 * system call on a uverbs file, which will already have a
@@ -616,6 +613,18 @@ static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
 		return file->device->ib_dev->mmap(file->ucontext, vma);
 }
 
+/*
+ * ib_uverbs_open() does not need the BKL:
+ *
+ *  - dev_table[] accesses are protected by map_lock, the
+ *    ib_uverbs_device structures are properly reference counted, and
+ *    everything else is purely local to the file being created, so
+ *    races against other open calls are not a problem;
+ *  - there is no ioctl method to race against;
+ *  - the device is added to dev_table[] as the last part of module
+ *    initialization, so the open method will either immediately return
+ *    -ENXIO, or all required initialization will be done.
+ */
 static int ib_uverbs_open(struct inode *inode, struct file *filp)
 {
 	struct ib_uverbs_device *dev;
@@ -657,7 +666,6 @@ err_module:
 
 err:
 	kref_put(&dev->ref, ib_uverbs_release_dev);
-
 	return ret;
 }
 
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 84709ed..a7da9be 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -34,8 +34,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: verbs.c 1349 2004-12-16 21:09:43Z roland $
  */
 
 #include <linux/errno.h>
@@ -248,7 +246,9 @@ int ib_modify_srq(struct ib_srq *srq,
 		  struct ib_srq_attr *srq_attr,
 		  enum ib_srq_attr_mask srq_attr_mask)
 {
-	return srq->device->modify_srq(srq, srq_attr, srq_attr_mask, NULL);
+	return srq->device->modify_srq ?
+		srq->device->modify_srq(srq, srq_attr, srq_attr_mask, NULL) :
+		-ENOSYS;
 }
 EXPORT_SYMBOL(ib_modify_srq);
 
@@ -315,7 +315,6 @@ static const struct {
 } qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
 	[IB_QPS_RESET] = {
 		[IB_QPS_RESET] = { .valid = 1 },
-		[IB_QPS_ERR]   = { .valid = 1 },
 		[IB_QPS_INIT]  = {
 			.valid = 1,
 			.req_param = {
@@ -679,6 +678,9 @@ struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
 {
 	struct ib_mr *mr;
 
+	if (!pd->device->reg_phys_mr)
+		return ERR_PTR(-ENOSYS);
+
 	mr = pd->device->reg_phys_mr(pd, phys_buf_array, num_phys_buf,
 				     mr_access_flags, iova_start);
 
@@ -750,6 +752,52 @@ int ib_dereg_mr(struct ib_mr *mr)
 }
 EXPORT_SYMBOL(ib_dereg_mr);
 
+struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
+{
+	struct ib_mr *mr;
+
+	if (!pd->device->alloc_fast_reg_mr)
+		return ERR_PTR(-ENOSYS);
+
+	mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len);
+
+	if (!IS_ERR(mr)) {
+		mr->device  = pd->device;
+		mr->pd      = pd;
+		mr->uobject = NULL;
+		atomic_inc(&pd->usecnt);
+		atomic_set(&mr->usecnt, 0);
+	}
+
+	return mr;
+}
+EXPORT_SYMBOL(ib_alloc_fast_reg_mr);
+
+struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(struct ib_device *device,
+							  int max_page_list_len)
+{
+	struct ib_fast_reg_page_list *page_list;
+
+	if (!device->alloc_fast_reg_page_list)
+		return ERR_PTR(-ENOSYS);
+
+	page_list = device->alloc_fast_reg_page_list(device, max_page_list_len);
+
+	if (!IS_ERR(page_list)) {
+		page_list->device = device;
+		page_list->max_page_list_len = max_page_list_len;
+	}
+
+	return page_list;
+}
+EXPORT_SYMBOL(ib_alloc_fast_reg_page_list);
+
+void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
+{
+	page_list->device->free_fast_reg_page_list(page_list);
+}
+EXPORT_SYMBOL(ib_free_fast_reg_page_list);
+
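/*
 * A minimal allocation/teardown sketch for the fast registration helpers
 * above (error handling omitted; the PD and the page count of 32 are
 * hypothetical):
 *
 *	struct ib_mr *mr = ib_alloc_fast_reg_mr(pd, 32);
 *	struct ib_fast_reg_page_list *pl =
 *		ib_alloc_fast_reg_page_list(pd->device, 32);
 *
 *	... fill pl->page_list[] with DMA addresses, then post an
 *	IB_WR_FAST_REG_MR work request that references mr and pl ...
 *
 *	ib_free_fast_reg_page_list(pl);
 *	ib_dereg_mr(mr);
 */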
 /* Memory windows */
 
 struct ib_mw *ib_alloc_mw(struct ib_pd *pd)
diff --git a/drivers/infiniband/core/writeback.c b/drivers/infiniband/core/writeback.c
new file mode 100644
index 0000000..7f8ebbe
--- /dev/null
+++ b/drivers/infiniband/core/writeback.c
@@ -0,0 +1,106 @@
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/writeback.h>
+#include <linux/mpage.h>
+#include <linux/module.h>
+
+int backport_write_cache_pages(struct address_space *mapping,
+                      struct writeback_control *wbc, backport_writepage_t writepage,
+                      void *data)
+{
+        struct backing_dev_info *bdi = mapping->backing_dev_info;
+        int ret = 0;
+        int done = 0;
+        struct pagevec pvec;
+        int nr_pages;
+        pgoff_t index;
+        pgoff_t end;            /* Inclusive */
+        int scanned = 0;
+        int range_whole = 0;
+        long nr_to_write = wbc->nr_to_write;
+
+        if (wbc->nonblocking && bdi_write_congested(bdi)) {
+                wbc->encountered_congestion = 1;
+                return 0;
+        }
+
+        pagevec_init(&pvec, 0);
+        if (wbc->range_cyclic) {
+                index = mapping->writeback_index; /* Start from prev offset */
+                end = -1;
+        } else {
+                index = wbc->range_start >> PAGE_CACHE_SHIFT;
+                end = wbc->range_end >> PAGE_CACHE_SHIFT;
+                if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+                        range_whole = 1;
+                scanned = 1;
+        }
+retry:
+        while (!done && (index <= end) &&
+               (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+                                              PAGECACHE_TAG_DIRTY,
+                                              min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+                unsigned i;
+
+                scanned = 1;
+                for (i = 0; i < nr_pages; i++) {
+                        struct page *page = pvec.pages[i];
+
+                        /*
+                         * At this point we hold neither mapping->tree_lock nor
+                         * lock on the page itself: the page may be truncated or
+                         * invalidated (changing page->mapping to NULL), or even
+                         * swizzled back from swapper_space to tmpfs file
+                         * mapping
+                         */
+                        lock_page(page);
+
+                        if (unlikely(page->mapping != mapping)) {
+                                unlock_page(page);
+                                continue;
+                        }
+
+                        if (!wbc->range_cyclic && page->index > end) {
+                                done = 1;
+                                unlock_page(page);
+                                continue;
+                        }
+
+                        if (wbc->sync_mode != WB_SYNC_NONE)
+                                wait_on_page_writeback(page);
+
+                        if (PageWriteback(page) ||
+                            !clear_page_dirty_for_io(page)) {
+                                unlock_page(page);
+                                continue;
+                        }
+
+                        ret = (*writepage)(page, wbc, data);
+
+                        if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
+                                unlock_page(page);
+                                ret = 0;
+                        }
+                        if (ret || (--nr_to_write <= 0))
+                                done = 1;
+                        if (wbc->nonblocking && bdi_write_congested(bdi)) {
+                                wbc->encountered_congestion = 1;
+                                done = 1;
+                        }
+                }
+                pagevec_release(&pvec);
+                cond_resched();
+        }
+        if (!scanned && !done) {
+                /*
+                 * We hit the last page and there is more work to be done: wrap
+                 * back to the start of the file
+                 */
+                scanned = 1;
+                index = 0;
+                goto retry;
+        }
+        return ret;
+}
+EXPORT_SYMBOL(backport_write_cache_pages);
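/*
 * A minimal caller sketch for backport_write_cache_pages() (names below
 * are assumed; a real caller would typically be an
 * address_space_operations ->writepages implementation):
 *
 *	static int my_writepage(struct page *page,
 *				struct writeback_control *wbc, void *data)
 *	{
 *		struct my_ctx *ctx = data;
 *		... write the page out, unlock it, return 0 or an error ...
 *	}
 *
 *	static int my_writepages(struct address_space *mapping,
 *				 struct writeback_control *wbc)
 *	{
 *		struct my_ctx ctx = { 0 };
 *		return backport_write_cache_pages(mapping, wbc,
 *						  my_writepage, &ctx);
 *	}
 *
 * As with upstream write_cache_pages(), the callback is handed a locked
 * page and is responsible for unlocking it.
 */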