kernel-2.6.18-194.11.1.el5.src.rpm

From: Doug Ledford <dledford@redhat.com>
Date: Tue, 14 Apr 2009 15:23:40 -0400
Subject: [openib] IPoIB: update to OFED 1.4.1-rc3
Message-id: 1239737023-31222-14-git-send-email-dledford@redhat.com
O-Subject: [Patch RHEL5.4 13/16] [IPoIB] Update to OFED 1.4.1-rc3 version
Bugzilla: 434779 466086

Signed-off-by: Doug Ledford <dledford@redhat.com>
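
In summary, the hunks below: add LRO support (the new "select INET_LRO", a struct ipoib_lro in the private data, and LRO counters exposed through ethtool); split the single CQ into separate recv_cq/send_cq (replacing the old rcq/scq); introduce three graded flush levels (light/normal/heavy) in place of the single flush task; convert the CM slow paths from the driver-private tx_lock to the network stack's netif_tx_lock; replace the ipoib_vmap ring helpers with plain vmalloc(); rename ipoib_etool.c to ipoib_ethtool.c; and add a small ipoib_helper module that acts as a container for the neighbour-cleanup hook.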

diff --git a/drivers/infiniband/ulp/ipoib/Kconfig b/drivers/infiniband/ulp/ipoib/Kconfig
index 1f76bad..9d9a9dc 100644
--- a/drivers/infiniband/ulp/ipoib/Kconfig
+++ b/drivers/infiniband/ulp/ipoib/Kconfig
@@ -1,6 +1,7 @@
 config INFINIBAND_IPOIB
 	tristate "IP-over-InfiniBand"
 	depends on NETDEVICES && INET && (IPV6 || IPV6=n)
+	select INET_LRO
 	---help---
 	  Support for the IP-over-InfiniBand protocol (IPoIB). This
 	  transports IP packets over InfiniBand so you can use your IB
@@ -10,16 +11,17 @@ config INFINIBAND_IPOIB
 
 config INFINIBAND_IPOIB_CM
 	bool "IP-over-InfiniBand Connected Mode support"
-	depends on INFINIBAND_IPOIB && EXPERIMENTAL
+	depends on INFINIBAND_IPOIB
 	default n
 	---help---
-	  This option enables experimental support for IPoIB connected mode.
-	  After enabling this option, you need to switch to connected mode through
-	  /sys/class/net/ibXXX/mode to actually create connections, and then increase
-	  the interface MTU with e.g. ifconfig ib0 mtu 65520.
+	  This option enables support for IPoIB connected mode.  After
+	  enabling this option, you need to switch to connected mode
+	  through /sys/class/net/ibXXX/mode to actually create
+	  connections, and then increase the interface MTU with
+	  e.g. ifconfig ib0 mtu 65520.
 
-	  WARNING: Enabling connected mode will trigger some
-	  packet drops for multicast and UD mode traffic from this interface,
+	  WARNING: Enabling connected mode will trigger some packet
+	  drops for multicast and UD mode traffic from this interface,
 	  unless you limit mtu for these destinations to 2044.
 
 config INFINIBAND_IPOIB_DEBUG
@@ -32,9 +34,10 @@ config INFINIBAND_IPOIB_DEBUG
 	  debug_level and mcast_debug_level module parameters (which
 	  can also be set after the driver is loaded through sysfs).
 
-	  This option also creates an "ipoib_debugfs," which can be
-	  mounted to expose debugging information about IB multicast
-	  groups used by the IPoIB driver.
+	  This option also creates a directory tree under ipoib/ in
+	  debugfs, which contains files that expose debugging
+	  information about IB multicast groups used by the IPoIB
+	  driver.
 
 config INFINIBAND_IPOIB_DEBUG_DATA
 	bool "IP-over-InfiniBand data path debugging"
diff --git a/drivers/infiniband/ulp/ipoib/Makefile b/drivers/infiniband/ulp/ipoib/Makefile
index 83488ee..9e92e1f 100644
--- a/drivers/infiniband/ulp/ipoib/Makefile
+++ b/drivers/infiniband/ulp/ipoib/Makefile
@@ -1,11 +1,11 @@
-obj-$(CONFIG_INFINIBAND_IPOIB)			+= ib_ipoib.o
+obj-$(CONFIG_INFINIBAND_IPOIB)			+= ib_ipoib.o ipoib_helper.o
 
 ib_ipoib-y					:= ipoib_main.o \
 						   ipoib_ib.o \
 						   ipoib_multicast.o \
 						   ipoib_verbs.o \
 						   ipoib_vlan.o \
-						   ipoib_etool.o
+						   ipoib_ethtool.o
 ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_CM)		+= ipoib_cm.o
 ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG)	+= ipoib_fs.o
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index e6c5857..af0abcb 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -30,8 +30,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ipoib.h 1358 2004-12-17 22:00:11Z roland $
  */
 
 #ifndef _IPOIB_H
@@ -52,56 +50,61 @@
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_pack.h>
 #include <rdma/ib_sa.h>
+#include <linux/inet_lro.h>
 
 /* constants */
 
+enum ipoib_flush_level {
+	IPOIB_FLUSH_LIGHT,
+	IPOIB_FLUSH_NORMAL,
+	IPOIB_FLUSH_HEAVY
+};
+
 enum {
-	IPOIB_ENCAP_LEN 	  = 4,
+	IPOIB_ENCAP_LEN		  = 4,
 
- 	IPOIB_UD_HEAD_SIZE	  = IB_GRH_BYTES + IPOIB_ENCAP_LEN,
- 	IPOIB_UD_RX_SG		  = 2, /* for 4K MTU */
+	IPOIB_UD_HEAD_SIZE	  = IB_GRH_BYTES + IPOIB_ENCAP_LEN,
+	IPOIB_UD_RX_SG		  = 2, /* max buffer needed for 4K mtu */
 
-	IPOIB_CM_MTU              = 0x10000 - 0x10, /* padding to align header to 16 */
-	IPOIB_CM_BUF_SIZE         = IPOIB_CM_MTU  + IPOIB_ENCAP_LEN,
-	IPOIB_CM_HEAD_SIZE 	  = IPOIB_CM_BUF_SIZE % PAGE_SIZE,
-	IPOIB_CM_RX_SG            = ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE,
-	IPOIB_RX_RING_SIZE 	  = 256,
-	IPOIB_TX_RING_SIZE 	  = 128,
+	IPOIB_CM_MTU		  = 0x10000 - 0x10, /* padding to align header to 16 */
+	IPOIB_CM_BUF_SIZE	  = IPOIB_CM_MTU  + IPOIB_ENCAP_LEN,
+	IPOIB_CM_HEAD_SIZE	  = IPOIB_CM_BUF_SIZE % PAGE_SIZE,
+	IPOIB_CM_RX_SG		  = ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE,
+	IPOIB_RX_RING_SIZE	  = 256,
+	IPOIB_TX_RING_SIZE	  = 128,
 	IPOIB_MAX_QUEUE_SIZE	  = 8192,
 	IPOIB_MIN_QUEUE_SIZE	  = 2,
 	IPOIB_CM_MAX_CONN_QP	  = 4096,
 
-	IPOIB_NUM_WC 		  = 4,
+	IPOIB_NUM_WC		  = 4,
 
 	IPOIB_MAX_PATH_REC_QUEUE  = 3,
-	IPOIB_MAX_MCAST_QUEUE     = 3,
-
-	IPOIB_FLAG_OPER_UP 	  = 0,
-	IPOIB_FLAG_INITIALIZED    = 1,
-	IPOIB_FLAG_ADMIN_UP 	  = 2,
-	IPOIB_PKEY_ASSIGNED 	  = 3,
-	IPOIB_PKEY_STOP 	  = 4,
-	IPOIB_FLAG_SUBINTERFACE   = 5,
-	IPOIB_MCAST_RUN 	  = 6,
-	IPOIB_STOP_REAPER         = 7,
-	IPOIB_MCAST_STARTED       = 8,
-	IPOIB_FLAG_ADMIN_CM 	  = 9,
+	IPOIB_MAX_MCAST_QUEUE	  = 3,
+
+	IPOIB_FLAG_OPER_UP	  = 0,
+	IPOIB_FLAG_INITIALIZED	  = 1,
+	IPOIB_FLAG_ADMIN_UP	  = 2,
+	IPOIB_PKEY_ASSIGNED	  = 3,
+	IPOIB_PKEY_STOP		  = 4,
+	IPOIB_FLAG_SUBINTERFACE	  = 5,
+	IPOIB_MCAST_RUN		  = 6,
+	IPOIB_STOP_REAPER	  = 7,
+	IPOIB_FLAG_ADMIN_CM	  = 9,
 	IPOIB_FLAG_UMCAST	  = 10,
-	IPOIB_FLAG_CSUM           = 11,
-	IPOIB_FLAG_TIME_ON	  = 12,
+	IPOIB_FLAG_CSUM		  = 11,
 
 	IPOIB_MAX_BACKOFF_SECONDS = 16,
 
-	IPOIB_MCAST_FLAG_FOUND 	  = 0,	/* used in set_multicast_list */
+	IPOIB_MCAST_FLAG_FOUND	  = 0,	/* used in set_multicast_list */
 	IPOIB_MCAST_FLAG_SENDONLY = 1,
-	IPOIB_MCAST_FLAG_BUSY 	  = 2,	/* joining or already joined */
+	IPOIB_MCAST_FLAG_BUSY	  = 2,	/* joining or already joined */
 	IPOIB_MCAST_FLAG_ATTACHED = 3,
 
-	MAX_SEND_CQE              = 16,
-	UD_POST_RCV_COUNT         = 16,
-	CM_POST_SRQ_COUNT         = 16,
+	IPOIB_MAX_LRO_DESCRIPTORS = 8,
+	IPOIB_LRO_MAX_AGGR 	  = 64,
 
-	SKB_TSHOLD		  = 256,
+	MAX_SEND_CQE		  = 16,
+	IPOIB_CM_COPYBREAK	  = 256,
 };
 
 #define	IPOIB_OP_RECV   (1ul << 31)
@@ -113,11 +116,6 @@ enum {
 
 /* structs */
 
-struct ipoib_cm_tx_buf {
-	struct sk_buff *skb;
-	u64		mapping;
-};
-
 struct ipoib_header {
 	__be16	proto;
 	u16	reserved;
@@ -131,7 +129,7 @@ struct ipoib_pseudoheader {
 struct ipoib_mcast {
 	struct ib_sa_mcmember_rec mcmember;
 	struct ib_sa_multicast	 *mc;
-	struct ipoib_ah          *ah;
+	struct ipoib_ah		 *ah;
 
 	struct rb_node    rb_node;
 	struct list_head  list;
@@ -149,7 +147,7 @@ struct ipoib_mcast {
 	struct net_device *dev;
 };
 
-struct ipoib_sg_rx_buf {
+struct ipoib_rx_buf {
 	struct sk_buff *skb;
 	u64		mapping[IPOIB_UD_RX_SG];
 };
@@ -159,71 +157,10 @@ struct ipoib_tx_buf {
 	u64		mapping[MAX_SKB_FRAGS + 1];
 };
 
-static inline int ipoib_dma_map_tx(struct ib_device *ca,
-				   struct ipoib_tx_buf *tx_req)
-{
-	struct sk_buff *skb = tx_req->skb;
-	u64 *mapping = tx_req->mapping;
-	int i;
-	int nfrags;
-	int off;
-
-	if (skb_headlen(skb)) {
-		mapping[0] = ib_dma_map_single(ca, skb->data, skb_headlen(skb),
-					       DMA_TO_DEVICE);
-		if (unlikely(ib_dma_mapping_error(ca, mapping[0])))
-			return -EIO;
-		off = 1;
-	} else
-		off = 0;
-
-	nfrags = skb_shinfo(skb)->nr_frags;
-	for (i = 0; i < nfrags; ++i) {
-		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-		mapping[i + off] = ib_dma_map_page(ca, frag->page, frag->page_offset,
-						   frag->size, DMA_TO_DEVICE);
-		if (unlikely(ib_dma_mapping_error(ca, mapping[i + off])))
-			goto partial_error;
-	}
-	return 0;
-
-partial_error:
-	if (skb_headlen(skb)) {
-		ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);
-		off = 0;
-	} else
-		off = 1;
-
-	for (; i > 0; --i) {
-		skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
-		ib_dma_unmap_page(ca, mapping[i - off], frag->size,
-				  DMA_TO_DEVICE);
-	}
-	return -EIO;
-}
-
-static inline void ipoib_dma_unmap_tx(struct ib_device *ca,
-				      struct ipoib_tx_buf *tx_req)
-{
-	struct sk_buff *skb = tx_req->skb;
-	u64 *mapping = tx_req->mapping;
-	int i;
-	int nfrags;
-	int off;
-
-	if (skb_headlen(skb)) {
-		ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);
-		off = 1;
-	} else
-		off = 0;
-
-	nfrags = skb_shinfo(skb)->nr_frags;
-	for (i = 0; i < nfrags; ++i) {
-		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-		ib_dma_unmap_page(ca, mapping[i + off], frag->size,
-				  DMA_TO_DEVICE);
-	}
-}
+struct ipoib_cm_tx_buf {
+	struct sk_buff *skb;
+	u64		mapping;
+};
 
 struct ib_cm_id;
 
@@ -266,36 +203,28 @@ enum ipoib_cm_state {
 };
 
 struct ipoib_cm_rx {
-	struct ib_cm_id     *id;
-	struct ib_qp        *qp;
+	struct ib_cm_id	       *id;
+	struct ib_qp	       *qp;
 	struct ipoib_cm_rx_buf *rx_ring;
-	struct list_head     list;
-	struct net_device   *dev;
-	unsigned long        jiffies;
-	enum ipoib_cm_state  state;
-	int		     index;
-	int		     recv_count;
-};
-
-struct ipoib_vmap {
-	void	       *ptr;
-	struct page   **page_arr;
-	int		npages;
+	struct list_head	list;
+	struct net_device      *dev;
+	unsigned long		jiffies;
+	enum ipoib_cm_state	state;
+	int			recv_count;
 };
 
 struct ipoib_cm_tx {
-	struct ib_cm_id     *id;
-	struct ib_qp        *qp;
+	struct ib_cm_id	    *id;
+	struct ib_qp	    *qp;
 	struct list_head     list;
 	struct net_device   *dev;
 	struct ipoib_neigh  *neigh;
 	struct ipoib_path   *path;
-	struct ipoib_vmap    tx_vmap_ring;
 	struct ipoib_cm_tx_buf *tx_ring;
-	unsigned             tx_head;
-	unsigned             tx_tail;
-	unsigned long        flags;
-	u32                  mtu;
+	unsigned	     tx_head;
+	unsigned	     tx_tail;
+	unsigned long	     flags;
+	u32		     mtu;
 };
 
 struct ipoib_cm_rx_buf {
@@ -303,40 +232,29 @@ struct ipoib_cm_rx_buf {
 	u64 mapping[IPOIB_CM_RX_SG];
 };
 
-struct ipoib_cm_rx_wr {
-	struct ib_recv_wr	wr;
-	struct ib_sge		rx_sge[IPOIB_CM_RX_SG];
-};
-
 struct ipoib_cm_dev_priv {
-	struct ib_srq  	       *srq;
-	struct ipoib_vmap 	rx_vmap_srq_ring;
+	struct ib_srq	       *srq;
 	struct ipoib_cm_rx_buf *srq_ring;
-	struct ib_cm_id        *id;
-	struct list_head        passive_ids;   /* state: LIVE */
-	struct list_head        rx_error_list; /* state: ERROR */
-	struct list_head        rx_flush_list; /* state: FLUSH, drain not started */
-	struct list_head        rx_drain_list; /* state: FLUSH, drain started */
-	struct list_head        rx_reap_list;  /* state: FLUSH, drain done */
+	struct ib_cm_id	       *id;
+	struct list_head	passive_ids;   /* state: LIVE */
+	struct list_head	rx_error_list; /* state: ERROR */
+	struct list_head	rx_flush_list; /* state: FLUSH, drain not started */
+	struct list_head	rx_drain_list; /* state: FLUSH, drain started */
+	struct list_head	rx_reap_list;  /* state: FLUSH, drain done */
 	struct work_struct      start_task;
 	struct work_struct      reap_task;
 	struct work_struct      skb_task;
 	struct work_struct      rx_reap_task;
 	struct delayed_work     stale_task;
 	struct sk_buff_head     skb_queue;
-	struct list_head        start_list;
-	struct list_head        reap_list;
-	struct ib_wc            ibwc[IPOIB_NUM_WC];
-	struct ib_sge           rx_sge[IPOIB_CM_RX_SG];
+	struct list_head	start_list;
+	struct list_head	reap_list;
+	struct ib_wc		ibwc[IPOIB_NUM_WC];
+	struct ib_sge		rx_sge[IPOIB_CM_RX_SG];
 	struct ib_recv_wr       rx_wr;
 	int			nonsrq_conn_qp;
 	int			max_cm_mtu;
 	int			num_frags;
-	struct ipoib_cm_rx_wr  *head;
-	struct ipoib_cm_rx_wr  *tail;
-	struct ipoib_vmap 	rx_vmap_wr_arr;
-	struct ipoib_cm_rx_wr  *rx_wr_arr;
-	int			rx_skipped;
 };
 
 struct ipoib_ethtool_st {
@@ -344,23 +262,23 @@ struct ipoib_ethtool_st {
 	u16     max_coalesced_frames;
 };
 
+struct ipoib_lro {
+	struct net_lro_mgr lro_mgr;
+	struct net_lro_desc lro_desc[IPOIB_MAX_LRO_DESCRIPTORS];
+};
+
 /*
- * Device private locking: tx_lock protects members used in TX fast
- * path (and we use LLTX so upper layers don't do extra locking).
- * lock protects everything else.  lock nests inside of tx_lock (ie
- * tx_lock must be acquired first if needed).
+ * Device private locking: network stack tx_lock protects members used
+ * in TX fast path, lock protects everything else.  lock nests inside
+ * of tx_lock (ie tx_lock must be acquired first if needed).
  */
 struct ipoib_dev_priv {
 	spinlock_t lock;
 
-	struct net_device      *dev;
-	struct ib_recv_wr	rx_wr_draft[UD_POST_RCV_COUNT];
-	struct ib_sge 		sglist_draft[UD_POST_RCV_COUNT][IPOIB_UD_RX_SG];
-	unsigned int		rx_outst;
+	struct net_device *dev;
 
 	unsigned long flags;
 
-	struct mutex mcast_mutex;
 	struct mutex vlan_mutex;
 
 	struct rb_root  path_tree;
@@ -372,43 +290,45 @@ struct ipoib_dev_priv {
 
 	struct delayed_work pkey_poll_task;
 	struct delayed_work mcast_task;
-	struct work_struct flush_task;
+	struct work_struct carrier_on_task;
+	struct work_struct flush_light;
+	struct work_struct flush_normal;
+	struct work_struct flush_heavy;
 	struct work_struct restart_task;
 	struct delayed_work ah_reap_task;
-	struct work_struct pkey_event_task;
 
 	struct ib_device *ca;
-	u8            	  port;
-	u16           	  pkey;
-	u16               pkey_index;
-	struct ib_pd  	 *pd;
-	struct ib_mr  	 *mr;
-	struct ib_cq  	 *rcq;
-	struct ib_cq  	 *scq;
-	struct ib_qp  	 *qp;
-	u32           	  qkey;
+	u8		  port;
+	u16		  pkey;
+	u16		  pkey_index;
+	struct ib_pd	 *pd;
+	struct ib_mr	 *mr;
+	struct ib_cq	 *recv_cq;
+	struct ib_cq	 *send_cq;
+	struct ib_qp	 *qp;
+	u32		  qkey;
 
 	union ib_gid local_gid;
-	u16          local_lid;
+	u16	     local_lid;
 
 	unsigned int admin_mtu;
 	unsigned int mcast_mtu;
+	unsigned int max_ib_mtu;
 
-	struct ipoib_vmap	rx_vmap_ring;
-	struct ipoib_sg_rx_buf *rx_ring;
+	struct ipoib_rx_buf *rx_ring;
 
-	spinlock_t           tx_lock;
-	struct ipoib_vmap    tx_vmap_ring;
 	struct ipoib_tx_buf *tx_ring;
-	unsigned             tx_head;
-	unsigned             tx_tail;
+	unsigned	     tx_head;
+	unsigned	     tx_tail;
 	struct ib_sge	     tx_sge[MAX_SKB_FRAGS + 1];
 	struct ib_send_wr    tx_wr;
-	unsigned             tx_outstanding;
+	unsigned	     tx_outstanding;
+	struct ib_wc	     send_wc[MAX_SEND_CQE];
 
-	struct ib_wc 	     ibwc[IPOIB_NUM_WC];
-	struct ib_wc         send_wc[MAX_SEND_CQE];
-	unsigned int	     tx_poll;
+	struct ib_recv_wr    rx_wr;
+	struct ib_sge	     rx_sge[IPOIB_UD_RX_SG];
+
+	struct ib_wc ibwc[IPOIB_NUM_WC];
 
 	struct list_head dead_ahs;
 
@@ -429,18 +349,19 @@ struct ipoib_dev_priv {
 	struct dentry *mcg_dentry;
 	struct dentry *path_dentry;
 #endif
-	struct ipoib_ethtool_st etool;
+	int	hca_caps;
+	struct ipoib_ethtool_st ethtool;
 	struct timer_list poll_timer;
-	struct ib_ah *own_ah;
- 	int max_ib_mtu;
+
+	struct ipoib_lro lro;
 };
 
 struct ipoib_ah {
 	struct net_device *dev;
-	struct ib_ah      *ah;
+	struct ib_ah	  *ah;
 	struct list_head   list;
-	struct kref        ref;
-	unsigned           last_send;
+	struct kref	   ref;
+	unsigned	   last_send;
 };
 
 struct ipoib_path {
@@ -451,12 +372,13 @@ struct ipoib_path {
 
 	struct list_head      neigh_list;
 
-	int                   query_id;
+	int		      query_id;
 	struct ib_sa_query   *query;
 	struct completion     done;
 
-	struct rb_node        rb_node;
+	struct rb_node	      rb_node;
 	struct list_head      list;
+	int  		      valid;
 };
 
 struct ipoib_neigh {
@@ -464,7 +386,7 @@ struct ipoib_neigh {
 #ifdef CONFIG_INFINIBAND_IPOIB_CM
 	struct ipoib_cm_tx *cm;
 #endif
-	union ib_gid        dgid;
+	union ib_gid	    dgid;
 	struct sk_buff_head queue;
 
 	struct neighbour   *neighbour;
@@ -475,18 +397,10 @@ struct ipoib_neigh {
 
 #define IPOIB_UD_MTU(ib_mtu)		(ib_mtu - IPOIB_ENCAP_LEN)
 #define IPOIB_UD_BUF_SIZE(ib_mtu)	(ib_mtu + IB_GRH_BYTES)
-static inline int ipoib_ud_need_sg(int ib_mtu)
-{
-	return (IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE) ? 1 : 0;
-}
-static inline void ipoib_sg_dma_unmap_rx(struct ipoib_dev_priv *priv,
-					 u64 mapping[IPOIB_UD_RX_SG])
+
+static inline int ipoib_ud_need_sg(unsigned int ib_mtu)
 {
-	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
-		ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_UD_HEAD_SIZE, DMA_FROM_DEVICE);
-		ib_dma_unmap_page(priv->ca, mapping[1], PAGE_SIZE, DMA_FROM_DEVICE);
-	} else
-		ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_UD_BUF_SIZE(priv->max_ib_mtu), DMA_FROM_DEVICE);
+	return IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE;
 }
 
 /*
@@ -510,7 +424,8 @@ extern struct workqueue_struct *ipoib_workqueue;
 /* functions */
 
 int ipoib_poll(struct net_device *dev, int *budget);
-void ipoib_ib_rx_completion(struct ib_cq *cq, void *dev_ptr);
+void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr);
+void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr);
 
 struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
 				 struct ib_pd *pd, struct ib_ah_attr *attr);
@@ -528,11 +443,14 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
 		struct ipoib_ah *address, u32 qpn);
 void ipoib_reap_ah(struct work_struct *work);
 
+void ipoib_mark_paths_invalid(struct net_device *dev);
 void ipoib_flush_paths(struct net_device *dev);
 struct ipoib_dev_priv *ipoib_intf_alloc(const char *format);
 
 int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
-void ipoib_ib_dev_flush(struct work_struct *work);
+void ipoib_ib_dev_flush_light(struct work_struct *work);
+void ipoib_ib_dev_flush_normal(struct work_struct *work);
+void ipoib_ib_dev_flush_heavy(struct work_struct *work);
 void ipoib_pkey_event(struct work_struct *work);
 void ipoib_ib_dev_cleanup(struct net_device *dev);
 
@@ -545,6 +463,7 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
 void ipoib_dev_cleanup(struct net_device *dev);
 
 void ipoib_mcast_join_task(struct work_struct *work);
+void ipoib_mcast_carrier_on_task(struct work_struct *work);
 void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb);
 
 void ipoib_mcast_restart_task(struct work_struct *work);
@@ -554,6 +473,9 @@ int ipoib_mcast_stop_thread(struct net_device *dev, int flush);
 void ipoib_mcast_dev_down(struct net_device *dev);
 void ipoib_mcast_dev_flush(struct net_device *dev);
 
+int ipoib_set_cleanup_function(void (*func)(struct neighbour *n));
+void ipoib_neigh_cleanup_container(struct neighbour *n);
+
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
 struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev);
 int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter);
@@ -571,9 +493,7 @@ void ipoib_path_iter_read(struct ipoib_path_iter *iter,
 #endif
 
 int ipoib_mcast_attach(struct net_device *dev, u16 mlid,
-		       union ib_gid *mgid);
-int ipoib_mcast_detach(struct net_device *dev, u16 mlid,
-		       union ib_gid *mgid);
+		       union ib_gid *mgid, int set_qkey);
 
 int ipoib_init_qp(struct net_device *dev);
 int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca);
@@ -590,14 +510,11 @@ int ipoib_pkey_dev_delay_open(struct net_device *dev);
 void ipoib_drain_cq(struct net_device *dev);
 
 void ipoib_set_ethtool_ops(struct net_device *dev);
-void destroy_own_ah(struct ipoib_dev_priv *priv);
-int ipoib_vmalloc(struct ipoib_vmap *buf, int size);
-void ipoib_vfree(struct ipoib_vmap *buf);
 
 #ifdef CONFIG_INFINIBAND_IPOIB_CM
 
-#define IPOIB_FLAGS_RC          0x80
-#define IPOIB_FLAGS_UC          0x40
+#define IPOIB_FLAGS_RC		0x80
+#define IPOIB_FLAGS_UC		0x40
 
 /* We don't support UC connections at the moment */
 #define IPOIB_CM_SUPPORTED(ha)   (ha[0] & (IPOIB_FLAGS_RC))
@@ -634,16 +551,16 @@ static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *t
 	neigh->cm = tx;
 }
 
-static inline unsigned int ipoib_cm_max_mtu(struct net_device *dev)
+static inline int ipoib_cm_has_srq(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	return priv->cm.max_cm_mtu;
+	return !!priv->cm.srq;
 }
 
-static inline int ipoib_cm_has_srq(struct net_device *dev)
+static inline unsigned int ipoib_cm_max_mtu(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	return !!priv->cm.srq;
+	return priv->cm.max_cm_mtu;
 }
 
 void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx);
@@ -655,7 +572,7 @@ void ipoib_cm_dev_cleanup(struct net_device *dev);
 struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
 				    struct ipoib_neigh *neigh);
 void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx);
-void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
+void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
 			   unsigned int mtu);
 void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc);
 void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc);
@@ -690,12 +607,12 @@ static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *t
 {
 }
 
-static inline unsigned int ipoib_cm_max_mtu(struct net_device *dev)
+static inline int ipoib_cm_has_srq(struct net_device *dev)
 {
 	return 0;
 }
 
-static inline int ipoib_cm_has_srq(struct net_device *dev)
+static inline unsigned int ipoib_cm_max_mtu(struct net_device *dev)
 {
 	return 0;
 }
@@ -749,7 +666,7 @@ int ipoib_cm_add_mode_attr(struct net_device *dev)
 	return 0;
 }
 
-static inline void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
+static inline void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
 					 unsigned int mtu)
 {
 	dev_kfree_skb_any(skb);
@@ -790,12 +707,12 @@ extern struct ib_sa_client ipoib_sa_client;
 extern int ipoib_debug_level;
 
 #define ipoib_dbg(priv, format, arg...)			\
-	do {					        \
+	do {						\
 		if (ipoib_debug_level > 0)			\
 			ipoib_printk(KERN_DEBUG, priv, format , ## arg); \
 	} while (0)
 #define ipoib_dbg_mcast(priv, format, arg...)		\
-	do {					        \
+	do {						\
 		if (mcast_debug_level > 0)		\
 			ipoib_printk(KERN_DEBUG, priv, format , ## arg); \
 	} while (0)
@@ -808,7 +725,7 @@ extern int ipoib_debug_level;
 
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
 #define ipoib_dbg_data(priv, format, arg...)		\
-	do {					        \
+	do {						\
 		if (data_debug_level > 0)		\
 			ipoib_printk(KERN_DEBUG, priv, format , ## arg); \
 	} while (0)
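
The rewritten locking comment in ipoib.h above is the key to most of the ipoib_cm.c changes below. Every converted slow-path site follows the same nesting; as a fragment (not a complete function, dev/priv/flags declarations assumed):

	netif_tx_lock_bh(dev);                      /* stack's TX lock first */
	spin_lock_irqsave(&priv->lock, flags);      /* priv->lock nests inside */
	/* ... touch state shared with the TX fast path ... */
	spin_unlock_irqrestore(&priv->lock, flags);
	netif_tx_unlock_bh(dev);
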
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 315a434..3092521 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -28,8 +28,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id$
  */
 
 #include <rdma/ib_cm.h>
@@ -40,6 +38,8 @@
 #include <linux/delay.h>
 #include <linux/vmalloc.h>
 
+#include "ipoib.h"
+
 int ipoib_max_conn_qp = 128;
 
 module_param_named(max_nonsrq_conn_qp, ipoib_max_conn_qp, int, 0444);
@@ -47,11 +47,6 @@ MODULE_PARM_DESC(max_nonsrq_conn_qp,
 		 "Max number of connected-mode QPs per interface "
 		 "(applied only if shared receive queue is not available)");
 
-static int ipoib_set_nonsrq = 0;
-module_param_named(set_nonsrq, ipoib_set_nonsrq, int, 0644);
-MODULE_PARM_DESC(set_nonsrq, "set to dictate working in none SRQ mode"
-		 ", otherwise act according to device capabilities");
-
 #define to_net_dev(class) container_of(class, struct net_device, class_dev)
 
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
@@ -62,8 +57,6 @@ MODULE_PARM_DESC(cm_data_debug_level,
 		 "Enable data path debug tracing for connected mode if > 0");
 #endif
 
-#include "ipoib.h"
-
 #define IPOIB_CM_IETF_ID 0x1000000000000000ULL
 
 #define IPOIB_CM_RX_UPDATE_TIME (256 * HZ)
@@ -96,49 +89,24 @@ static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags,
 		ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
 }
 
-static int ipoib_cm_post_receive_srq(struct net_device *dev, int id, int pi)
+static int ipoib_cm_post_receive_srq(struct net_device *dev, int id)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_recv_wr *bad_wr;
-	int i, ret = 0;
-	struct ipoib_cm_rx_wr *cur;
-	int post;
-
-	ipoib_dbg_data(priv, "posting to id=%d, pi=%d\n", id, pi);
-	cur = &priv->cm.rx_wr_arr[id];
-	if (!priv->cm.head)
-		priv->cm.head = cur;
-
-	if (priv->cm.tail)
-		priv->cm.tail->wr.next = &cur->wr;
+	int i, ret;
 
-	cur->wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
-	cur->wr.next = NULL;
+	priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
 
 	for (i = 0; i < priv->cm.num_frags; ++i)
-		cur->rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i];
-
-	post = pi || (priv->cm.rx_skipped >= CM_POST_SRQ_COUNT);
-	if (post) {
-		ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.head->wr, &bad_wr);
-		if (unlikely(ret)) {
-			ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
-			while (bad_wr) {
-				id = bad_wr->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV);
-				ipoib_cm_dma_unmap_rx(priv, priv->cm.num_frags - 1,
-						      priv->cm.srq_ring[id].mapping);
-				dev_kfree_skb_any(priv->cm.srq_ring[id].skb);
-				priv->cm.srq_ring[id].skb = NULL;
-				bad_wr = bad_wr->next;
-			}
-		} else {
-			priv->cm.rx_skipped = 0;
-			priv->cm.head = NULL;
-			priv->cm.tail = NULL;
-		}
-	} else {
-		++priv->cm.rx_skipped;
-		priv->cm.tail = cur;
+		priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i];
+
+	ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr);
+	if (unlikely(ret)) {
+		ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
+		ipoib_cm_dma_unmap_rx(priv, priv->cm.num_frags - 1,
+				      priv->cm.srq_ring[id].mapping);
+		dev_kfree_skb_any(priv->cm.srq_ring[id].skb);
+		priv->cm.srq_ring[id].skb = NULL;
 	}
 
 	return ret;
@@ -236,10 +204,7 @@ static void ipoib_cm_free_rx_ring(struct net_device *dev,
 			dev_kfree_skb_any(rx_ring[i].skb);
 		}
 
-	if (ipoib_cm_has_srq(dev))
-		ipoib_vfree(&priv->cm.rx_vmap_srq_ring);
-	else
-		 vfree(rx_ring);
+	vfree(rx_ring);
 }
 
 static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv)
@@ -286,8 +251,8 @@ static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_qp_init_attr attr = {
 		.event_handler = ipoib_cm_rx_event_handler,
-		.send_cq = priv->rcq, /* For drain WR */
-		.recv_cq = priv->rcq,
+		.send_cq = priv->recv_cq, /* For drain WR */
+		.recv_cq = priv->recv_cq,
 		.srq = priv->cm.srq,
 		.cap.max_send_wr = 1, /* For drain WR */
 		.cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
@@ -360,19 +325,18 @@ static int ipoib_cm_modify_rx_qp(struct net_device *dev,
 }
 
 static void ipoib_cm_init_rx_wr(struct net_device *dev,
-struct ib_recv_wr *wr,
-struct ib_sge *sge)
+				struct ib_recv_wr *wr,
+				struct ib_sge *sge)
 {
-
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	int i;
 
 	for (i = 0; i < priv->cm.num_frags; ++i)
-			sge[i].lkey = priv->mr->lkey;
+		sge[i].lkey = priv->mr->lkey;
 
 	sge[0].length = IPOIB_CM_HEAD_SIZE;
 	for (i = 1; i < priv->cm.num_frags; ++i)
-			sge[i].length = PAGE_SIZE;
+		sge[i].length = PAGE_SIZE;
 
 	wr->next    = NULL;
 	wr->sg_list = sge;
@@ -387,20 +351,22 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_i
 		struct ib_recv_wr wr;
 		struct ib_sge sge[IPOIB_CM_RX_SG];
 	} *t;
-
 	int ret;
 	int i;
 
 	rx->rx_ring = vmalloc(ipoib_recvq_size * sizeof *rx->rx_ring);
-	if (!rx->rx_ring){
-		printk(KERN_WARNING "ib: Allocation of rx_ring failed,%s\n",
-		"try using a lower value of recv_queue_size.");
+	if (!rx->rx_ring) {
+		printk(KERN_WARNING "%s: failed to allocate CM non-SRQ ring (%d entries)\n",
+		       priv->ca->name, ipoib_recvq_size);
 		return -ENOMEM;
 	}
+
+	memset(rx->rx_ring, 0, ipoib_recvq_size * sizeof *rx->rx_ring);
+
 	t = kmalloc(sizeof *t, GFP_KERNEL);
 	if (!t) {
 		ret = -ENOMEM;
-		goto err_free_ring;
+		goto err_free;
 	}
 
 	ipoib_cm_init_rx_wr(dev, &t->wr, t->sge);
@@ -446,7 +412,6 @@ err_count:
 
 err_free:
 	kfree(t);
-err_free_ring:
 	ipoib_cm_free_rx_ring(dev, rx->rx_ring);
 
 	return ret;
@@ -656,7 +621,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 		}
 	}
 
-	if (wc->byte_len < SKB_TSHOLD) {
+	if (wc->byte_len < IPOIB_CM_COPYBREAK) {
 		int dlen = wc->byte_len;
 
 		small_skb = dev_alloc_skb(dlen + 12);
@@ -711,14 +676,14 @@ copied:
 
 repost:
 	if (has_srq) {
-		if (unlikely(ipoib_cm_post_receive_srq(dev, wr_id, 0)))
+		if (unlikely(ipoib_cm_post_receive_srq(dev, wr_id)))
 			ipoib_warn(priv, "ipoib_cm_post_receive_srq failed "
 				   "for buf %d\n", wr_id);
 	} else {
 		if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p,
-							&priv->cm.rx_wr,
-							priv->cm.rx_sge,
-							wr_id))) {
+							  &priv->cm.rx_wr,
+							  priv->cm.rx_sge,
+							  wr_id))) {
 			--p->recv_count;
 			ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed "
 				   "for buf %d\n", wr_id);
@@ -736,7 +701,8 @@ static inline int post_send(struct ipoib_dev_priv *priv,
 	priv->tx_sge[0].addr          = addr;
 	priv->tx_sge[0].length        = len;
 
-	priv->tx_wr.wr_id 	      = wr_id | IPOIB_OP_CM;
+	priv->tx_wr.num_sge	= 1;
+	priv->tx_wr.wr_id	= wr_id | IPOIB_OP_CM;
 
 	return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
 }
@@ -787,9 +753,8 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
 		dev->trans_start = jiffies;
 		++tx->tx_head;
 
-		if (++priv->tx_outstanding == ipoib_sendq_size - 1) {
-			ipoib_dbg(priv, "%s: TX ring 0x%x full,"
-				  "stopping kernel net queue\n", __func__,
+		if (++priv->tx_outstanding == ipoib_sendq_size) {
+			ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
 				  tx->qp->qp_num);
 			netif_stop_queue(dev);
 		}
@@ -823,7 +788,8 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 
 	dev_kfree_skb_any(tx_req->skb);
 
-	spin_lock_irqsave(&priv->tx_lock, flags);
+	netif_tx_lock(dev);
+
 	++tx->tx_tail;
 	if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
 	    netif_queue_stopped(dev) &&
@@ -838,7 +804,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 			   "(status=%d, wrid=%d vend_err %x)\n",
 			   wc->status, wr_id, wc->vendor_err);
 
-		spin_lock(&priv->lock);
+		spin_lock_irqsave(&priv->lock, flags);
 		neigh = tx->neigh;
 
 		if (neigh) {
@@ -858,10 +824,10 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 
 		clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags);
 
-		spin_unlock(&priv->lock);
+		spin_unlock_irqrestore(&priv->lock, flags);
 	}
 
-	spin_unlock_irqrestore(&priv->tx_lock, flags);
+	netif_tx_unlock(dev);
 }
 
 int ipoib_cm_dev_open(struct net_device *dev)
@@ -924,7 +890,6 @@ void ipoib_cm_dev_stop(struct net_device *dev)
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_cm_rx *p;
 	unsigned long begin;
-	LIST_HEAD(list);
 	int ret;
 
 	if (!IPOIB_CM_SUPPORTED(dev->dev_addr) || !priv->cm.id)
@@ -957,9 +922,12 @@ void ipoib_cm_dev_stop(struct net_device *dev)
 			/*
 			 * assume the HW is wedged and just free up everything.
 			 */
-			list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_reap_list);
-			list_splice_init(&priv->cm.rx_error_list, &priv->cm.rx_reap_list);
-			list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
+			list_splice_init(&priv->cm.rx_flush_list,
+					 &priv->cm.rx_reap_list);
+			list_splice_init(&priv->cm.rx_error_list,
+					 &priv->cm.rx_reap_list);
+			list_splice_init(&priv->cm.rx_drain_list,
+					 &priv->cm.rx_reap_list);
 			break;
 		}
 		spin_unlock_irq(&priv->lock);
@@ -1047,8 +1015,8 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_qp_init_attr attr = {
-		.send_cq		= priv->rcq,
-		.recv_cq		= priv->rcq,
+		.send_cq		= priv->recv_cq,
+		.recv_cq		= priv->recv_cq,
 		.srq			= priv->cm.srq,
 		.cap.max_send_wr	= ipoib_sendq_size,
 		.cap.max_send_sge	= 1,
@@ -1072,28 +1040,28 @@ static int ipoib_cm_send_req(struct net_device *dev,
 	data.qpn = cpu_to_be32(priv->qp->qp_num);
 	data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);
 
-	req.primary_path 	      = pathrec;
-	req.alternate_path 	      = NULL;
-	req.service_id                = cpu_to_be64(IPOIB_CM_IETF_ID | qpn);
-	req.qp_num 		      = qp->qp_num;
-	req.qp_type 		      = qp->qp_type;
-	req.private_data 	      = &data;
-	req.private_data_len 	      = sizeof data;
-	req.flow_control 	      = 0;
+	req.primary_path		= pathrec;
+	req.alternate_path		= NULL;
+	req.service_id			= cpu_to_be64(IPOIB_CM_IETF_ID | qpn);
+	req.qp_num			= qp->qp_num;
+	req.qp_type			= qp->qp_type;
+	req.private_data		= &data;
+	req.private_data_len		= sizeof data;
+	req.flow_control		= 0;
 
-	req.starting_psn              = 0; /* FIXME */
+	req.starting_psn		= 0; /* FIXME */
 
 	/*
 	 * Pick some arbitrary defaults here; we could make these
 	 * module parameters if anyone cared about setting them.
 	 */
-	req.responder_resources	      = 4;
-	req.remote_cm_response_timeout = 20;
-	req.local_cm_response_timeout  = 20;
-	req.retry_count 	      = 0; /* RFC draft warns against retries */
-	req.rnr_retry_count 	      = 0; /* RFC draft warns against retries */
-	req.max_cm_retries 	      = 15;
-	req.srq 	              = ipoib_cm_has_srq(dev);
+	req.responder_resources		= 4;
+	req.remote_cm_response_timeout	= 20;
+	req.local_cm_response_timeout	= 20;
+	req.retry_count			= 0; /* RFC draft warns against retries */
+	req.rnr_retry_count		= 0; /* RFC draft warns against retries */
+	req.max_cm_retries		= 15;
+	req.srq				= ipoib_cm_has_srq(dev);
 	return ib_send_cm_req(id, &req);
 }
 
@@ -1128,11 +1096,13 @@ static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn,
 	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
 	int ret;
 
-	if (ipoib_vmalloc(&p->tx_vmap_ring, ipoib_sendq_size * sizeof *p->tx_ring)) {
+	p->tx_ring = vmalloc(ipoib_sendq_size * sizeof *p->tx_ring);
+	if (!p->tx_ring) {
+		ipoib_warn(priv, "failed to allocate tx ring\n");
 		ret = -ENOMEM;
 		goto err_tx;
 	}
-	p->tx_ring = p->tx_vmap_ring.ptr;
+	memset(p->tx_ring, 0, ipoib_sendq_size * sizeof *p->tx_ring);
 
 	p->qp = ipoib_cm_create_tx_qp(p->dev, p);
 	if (IS_ERR(p->qp)) {
@@ -1173,6 +1143,7 @@ err_id:
 	ib_destroy_qp(p->qp);
 err_qp:
 	p->qp = NULL;
+	vfree(p->tx_ring);
 err_tx:
 	return ret;
 }
@@ -1181,7 +1152,6 @@ static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
 	struct ipoib_cm_tx_buf *tx_req;
-	unsigned long flags;
 	unsigned long begin;
 
 	ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
@@ -1212,18 +1182,18 @@ timeout:
 				    DMA_TO_DEVICE);
 		dev_kfree_skb_any(tx_req->skb);
 		++p->tx_tail;
-		spin_lock_irqsave(&priv->tx_lock, flags);
+		netif_tx_lock_bh(p->dev);
 		if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
 		    netif_queue_stopped(p->dev) &&
 		    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
 			netif_wake_queue(p->dev);
-		spin_unlock_irqrestore(&priv->tx_lock, flags);
+		netif_tx_unlock_bh(p->dev);
 	}
 
 	if (p->qp)
 		ib_destroy_qp(p->qp);
 
-	ipoib_vfree(&p->tx_vmap_ring);
+	vfree(p->tx_ring);
 	kfree(p);
 }
 
@@ -1234,6 +1204,7 @@ static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
 	struct ipoib_dev_priv *priv = netdev_priv(tx->dev);
 	struct net_device *dev = priv->dev;
 	struct ipoib_neigh *neigh;
+	unsigned long flags;
 	int ret;
 
 	switch (event->event) {
@@ -1252,8 +1223,8 @@ static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
 	case IB_CM_REJ_RECEIVED:
 	case IB_CM_TIMEWAIT_EXIT:
 		ipoib_dbg(priv, "CM error %d.\n", event->event);
-		spin_lock_irq(&priv->tx_lock);
-		spin_lock(&priv->lock);
+		netif_tx_lock_bh(dev);
+		spin_lock_irqsave(&priv->lock, flags);
 		neigh = tx->neigh;
 
 		if (neigh) {
@@ -1271,8 +1242,8 @@ static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
 			queue_work(ipoib_workqueue, &priv->cm.reap_task);
 		}
 
-		spin_unlock(&priv->lock);
-		spin_unlock_irq(&priv->tx_lock);
+		spin_unlock_irqrestore(&priv->lock, flags);
+		netif_tx_unlock_bh(dev);
 		break;
 	default:
 		break;
@@ -1326,19 +1297,24 @@ static void ipoib_cm_tx_start(struct work_struct *work)
 	struct ib_sa_path_rec pathrec;
 	u32 qpn;
 
-	spin_lock_irqsave(&priv->tx_lock, flags);
-	spin_lock(&priv->lock);
+	netif_tx_lock_bh(dev);
+	spin_lock_irqsave(&priv->lock, flags);
+
 	while (!list_empty(&priv->cm.start_list)) {
 		p = list_entry(priv->cm.start_list.next, typeof(*p), list);
 		list_del_init(&p->list);
 		neigh = p->neigh;
 		qpn = IPOIB_QPN(neigh->neighbour->ha);
 		memcpy(&pathrec, &p->path->pathrec, sizeof pathrec);
-		spin_unlock(&priv->lock);
-		spin_unlock_irqrestore(&priv->tx_lock, flags);
+
+		spin_unlock_irqrestore(&priv->lock, flags);
+		netif_tx_unlock_bh(dev);
+
 		ret = ipoib_cm_tx_init(p, qpn, &pathrec);
-		spin_lock_irqsave(&priv->tx_lock, flags);
-		spin_lock(&priv->lock);
+
+		netif_tx_lock_bh(dev);
+		spin_lock_irqsave(&priv->lock, flags);
+
 		if (ret) {
 			neigh = p->neigh;
 			if (neigh) {
@@ -1352,44 +1328,52 @@ static void ipoib_cm_tx_start(struct work_struct *work)
 			kfree(p);
 		}
 	}
-	spin_unlock(&priv->lock);
-	spin_unlock_irqrestore(&priv->tx_lock, flags);
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+	netif_tx_unlock_bh(dev);
 }
 
 static void ipoib_cm_tx_reap(struct work_struct *work)
 {
 	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
 						   cm.reap_task);
+	struct net_device *dev = priv->dev;
 	struct ipoib_cm_tx *p;
+	unsigned long flags;
+
+	netif_tx_lock_bh(dev);
+	spin_lock_irqsave(&priv->lock, flags);
 
-	spin_lock_irq(&priv->tx_lock);
-	spin_lock(&priv->lock);
 	while (!list_empty(&priv->cm.reap_list)) {
 		p = list_entry(priv->cm.reap_list.next, typeof(*p), list);
 		list_del(&p->list);
-		spin_unlock(&priv->lock);
-		spin_unlock_irq(&priv->tx_lock);
+		spin_unlock_irqrestore(&priv->lock, flags);
+		netif_tx_unlock_bh(dev);
 		ipoib_cm_tx_destroy(p);
-		spin_lock_irq(&priv->tx_lock);
-		spin_lock(&priv->lock);
+		netif_tx_lock_bh(dev);
+		spin_lock_irqsave(&priv->lock, flags);
 	}
-	spin_unlock(&priv->lock);
-	spin_unlock_irq(&priv->tx_lock);
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+	netif_tx_unlock_bh(dev);
 }
 
 static void ipoib_cm_skb_reap(struct work_struct *work)
 {
 	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
 						   cm.skb_task);
+	struct net_device *dev = priv->dev;
 	struct sk_buff *skb;
-
+	unsigned long flags;
 	unsigned mtu = priv->mcast_mtu;
 
-	spin_lock_irq(&priv->tx_lock);
-	spin_lock(&priv->lock);
+	netif_tx_lock_bh(dev);
+	spin_lock_irqsave(&priv->lock, flags);
+
 	while ((skb = skb_dequeue(&priv->cm.skb_queue))) {
-		spin_unlock(&priv->lock);
-		spin_unlock_irq(&priv->tx_lock);
+		spin_unlock_irqrestore(&priv->lock, flags);
+		netif_tx_unlock_bh(dev);
+
 		if (skb->protocol == htons(ETH_P_IP))
 			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
@@ -1397,11 +1381,13 @@ static void ipoib_cm_skb_reap(struct work_struct *work)
 			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, priv->dev);
 #endif
 		dev_kfree_skb_any(skb);
-		spin_lock_irq(&priv->tx_lock);
-		spin_lock(&priv->lock);
+
+		netif_tx_lock_bh(dev);
+		spin_lock_irqsave(&priv->lock, flags);
 	}
-	spin_unlock(&priv->lock);
-	spin_unlock_irq(&priv->tx_lock);
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+	netif_tx_unlock_bh(dev);
 }
 
 void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
@@ -1474,14 +1460,15 @@ static ssize_t set_mode(struct class_device *d, const char *buf, size_t count)
 		ipoib_warn(priv, "enabling connected mode "
 			   "will cause multicast packet drops\n");
 
+		rtnl_lock();
 		dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_SG | NETIF_F_TSO);
-
 		priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
 
 		if (ipoib_cm_max_mtu(dev) > priv->mcast_mtu)
 			ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
 				   priv->mcast_mtu);
 		dev_set_mtu(dev, ipoib_cm_max_mtu(dev));
+		rtnl_unlock();
 
 		ipoib_flush_paths(dev);
 		return count;
@@ -1489,16 +1476,16 @@ static ssize_t set_mode(struct class_device *d, const char *buf, size_t count)
 
 	if (!strcmp(buf, "datagram\n")) {
 		clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
-		dev->mtu = min(priv->mcast_mtu, dev->mtu);
-		ipoib_flush_paths(dev);
 
-		if (priv->ca->flags & IB_DEVICE_IP_CSUM)
+		rtnl_lock();
+		if (test_bit(IPOIB_FLAG_CSUM, &priv->flags)) {
 			dev->features |= NETIF_F_IP_CSUM | NETIF_F_SG;
-
-
-		if (priv->dev->features & NETIF_F_SG &&
-		    priv->ca->flags & IB_DEVICE_TCP_TSO)
-			priv->dev->features |= NETIF_F_TSO;
+			if (priv->hca_caps & IB_DEVICE_UD_TSO)
+				dev->features |= NETIF_F_TSO;
+		}
+		dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu));
+		rtnl_unlock();
+		ipoib_flush_paths(dev);
 
 		return count;
 	}
@@ -1523,9 +1510,6 @@ static void ipoib_cm_create_srq(struct net_device *dev, int max_sge)
 		}
 	};
 
-	if (ipoib_set_nonsrq)
-		return;
-
 	priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr);
 	if (IS_ERR(priv->cm.srq)) {
 		if (PTR_ERR(priv->cm.srq) != -ENOSYS)
@@ -1535,35 +1519,22 @@ static void ipoib_cm_create_srq(struct net_device *dev, int max_sge)
 		return;
 	}
 
-	if (ipoib_vmalloc(&priv->cm.rx_vmap_wr_arr, ipoib_recvq_size *
-			  sizeof priv->cm.rx_wr_arr[0])) {
-		ipoib_warn(priv, "failed allocating SRQ wr array\n");
-		goto destory_srq;
-	}
-        priv->cm.rx_wr_arr = priv->cm.rx_vmap_wr_arr.ptr;
-
-	if (ipoib_vmalloc(&priv->cm.rx_vmap_srq_ring, ipoib_recvq_size *
-			  sizeof *priv->cm.srq_ring)) {
+	priv->cm.srq_ring = vmalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring);
+	if (!priv->cm.srq_ring) {
 		printk(KERN_WARNING "%s: failed to allocate CM SRQ ring (%d entries)\n",
 		       priv->ca->name, ipoib_recvq_size);
-		goto free_wr_array;
+		ib_destroy_srq(priv->cm.srq);
+		priv->cm.srq = NULL;
+		return;
 	}
-	priv->cm.srq_ring = priv->cm.rx_vmap_srq_ring.ptr;
-
-	return;
 
-free_wr_array:
-	ipoib_vfree(&priv->cm.rx_vmap_wr_arr);
-	priv->cm.rx_wr_arr = NULL;
-destory_srq:
-	ib_destroy_srq(priv->cm.srq);
-	priv->cm.srq = NULL;
+	memset(priv->cm.srq_ring, 0, ipoib_recvq_size * sizeof *priv->cm.srq_ring);
 }
 
 int ipoib_cm_dev_init(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	int i, ret, j;
+	int i, ret;
 	struct ib_device_attr attr;
 
 	INIT_LIST_HEAD(&priv->cm.passive_ids);
@@ -1589,9 +1560,10 @@ int ipoib_cm_dev_init(struct net_device *dev)
 
 	ipoib_dbg(priv, "max_srq_sge=%d\n", attr.max_srq_sge);
 
-	attr.max_srq_sge = min(IPOIB_CM_RX_SG, attr.max_srq_sge);
+	attr.max_srq_sge = min_t(int, IPOIB_CM_RX_SG, attr.max_srq_sge);
 	ipoib_cm_create_srq(dev, attr.max_srq_sge);
 	if (ipoib_cm_has_srq(dev)) {
+
 		priv->cm.max_cm_mtu = attr.max_srq_sge * PAGE_SIZE - 0x10;
 		priv->cm.num_frags  = attr.max_srq_sge;
 		ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n",
@@ -1601,25 +1573,6 @@ int ipoib_cm_dev_init(struct net_device *dev)
 		priv->cm.num_frags  = IPOIB_CM_RX_SG;
 	}
 
-	if (ipoib_cm_has_srq(dev)) {
-		for (j = 0; j < ipoib_recvq_size; ++j) {
-			for (i = 0; i < priv->cm.num_frags; ++i)
-				priv->cm.rx_wr_arr[j].rx_sge[i].lkey =
-				priv->mr->lkey;
-
-			priv->cm.rx_wr_arr[j].rx_sge[0].length =
-			IPOIB_CM_HEAD_SIZE;
-			for (i = 1; i < priv->cm.num_frags; ++i)
-				priv->cm.rx_wr_arr[j].rx_sge[i].length =
-				PAGE_SIZE;
-
-			priv->cm.rx_wr_arr[j].wr.sg_list =
-			priv->cm.rx_wr_arr[j].rx_sge;
-			priv->cm.rx_wr_arr[j].wr.num_sge = priv->cm.num_frags;
-		}
-		priv->cm.head = &priv->cm.rx_wr_arr[0];
-	}
-
 	ipoib_cm_init_rx_wr(dev, &priv->cm.rx_wr, priv->cm.rx_sge);
 
 	if (ipoib_cm_has_srq(dev)) {
@@ -1627,12 +1580,15 @@ int ipoib_cm_dev_init(struct net_device *dev)
 			if (!ipoib_cm_alloc_rx_skb(dev, priv->cm.srq_ring, i,
 						   priv->cm.num_frags - 1,
 						   priv->cm.srq_ring[i].mapping)) {
-				ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
+				ipoib_warn(priv, "failed to allocate "
+					   "receive buffer %d\n", i);
 				ipoib_cm_dev_cleanup(dev);
 				return -ENOMEM;
 			}
-			if (ipoib_cm_post_receive_srq(dev, i, 1)) {
-				ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i);
+
+			if (ipoib_cm_post_receive_srq(dev, i)) {
+				ipoib_warn(priv, "ipoib_cm_post_receive_srq "
+					   "failed for buf %d\n", i);
 				ipoib_cm_dev_cleanup(dev);
 				return -EIO;
 			}
@@ -1648,21 +1604,19 @@ void ipoib_cm_dev_cleanup(struct net_device *dev)
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	int ret;
 
-	ipoib_dbg(priv, "Cleanup ipoib connected mode.\n");
-
 	if (!priv->cm.srq)
 		return;
 
-	if (priv->cm.srq_ring) {
-		ipoib_cm_free_rx_ring(dev, priv->cm.srq_ring);
-		priv->cm.srq_ring = NULL;
-	}
+	ipoib_dbg(priv, "Cleanup ipoib connected mode.\n");
 
 	ret = ib_destroy_srq(priv->cm.srq);
 	if (ret)
 		ipoib_warn(priv, "ib_destroy_srq failed: %d\n", ret);
-	else
-		priv->cm.srq = NULL;
 
-	ipoib_vfree(&priv->cm.rx_vmap_wr_arr);
+	priv->cm.srq = NULL;
+	if (!priv->cm.srq_ring)
+		return;
+
+	ipoib_cm_free_rx_ring(dev, priv->cm.srq_ring);
+	priv->cm.srq_ring = NULL;
 }
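
One rename in this file is worth a gloss: SKB_TSHOLD becomes IPOIB_CM_COPYBREAK, the conventional name for the copybreak idiom. In outline (a sketch of the idea, not the driver's exact receive path):

	/* For completions below the threshold, copy the payload into a
	 * freshly allocated small skb so the large connected-mode receive
	 * buffer does not sit pinned under a tiny packet.
	 */
	if (wc->byte_len < IPOIB_CM_COPYBREAK) {
		struct sk_buff *small_skb = dev_alloc_skb(wc->byte_len + 12);
		/* ... copy wc->byte_len bytes and hand small_skb up ... */
	}
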
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
new file mode 100644
index 0000000..d462447
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+
+#include "ipoib.h"
+
+static void ipoib_get_drvinfo(struct net_device *netdev,
+			      struct ethtool_drvinfo *drvinfo)
+{
+	strncpy(drvinfo->driver, "ipoib", sizeof(drvinfo->driver) - 1);
+}
+
+static int ipoib_get_coalesce(struct net_device *dev,
+			      struct ethtool_coalesce *coal)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	coal->rx_coalesce_usecs = priv->ethtool.coalesce_usecs;
+	coal->tx_coalesce_usecs = priv->ethtool.coalesce_usecs;
+	coal->rx_max_coalesced_frames = priv->ethtool.max_coalesced_frames;
+	coal->tx_max_coalesced_frames = priv->ethtool.max_coalesced_frames;
+
+	return 0;
+}
+
+static int ipoib_set_coalesce(struct net_device *dev,
+			      struct ethtool_coalesce *coal)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int ret;
+
+	/*
+	 * Since IPoIB uses a single CQ for both rx and tx, we assume
+	 * that rx params dictate the configuration.  These values are
+	 * saved in the private data and returned when ipoib_get_coalesce()
+	 * is called.
+	 */
+	if (coal->rx_coalesce_usecs       > 0xffff ||
+	    coal->rx_max_coalesced_frames > 0xffff)
+		return -EINVAL;
+
+	ret = ib_modify_cq(priv->recv_cq, coal->rx_max_coalesced_frames,
+			   coal->rx_coalesce_usecs);
+	if (ret && ret != -ENOSYS) {
+		ipoib_warn(priv, "failed modifying CQ (%d)\n", ret);
+		return ret;
+	}
+
+	coal->tx_coalesce_usecs       = coal->rx_coalesce_usecs;
+	coal->tx_max_coalesced_frames = coal->rx_max_coalesced_frames;
+	priv->ethtool.coalesce_usecs       = coal->rx_coalesce_usecs;
+	priv->ethtool.max_coalesced_frames = coal->rx_max_coalesced_frames;
+
+	return 0;
+}
+
+static const char ipoib_stats_keys[][ETH_GSTRING_LEN] = {
+	"LRO aggregated", "LRO flushed",
+	"LRO avg aggr", "LRO no desc"
+};
+
+static void ipoib_get_strings(struct net_device *netdev, u32 stringset, u8 *data)
+{
+	switch (stringset) {
+	case ETH_SS_STATS:
+		memcpy(data, *ipoib_stats_keys,	sizeof(ipoib_stats_keys));
+		break;
+	}
+}
+
+static int ipoib_get_stats_count(struct net_device *dev)
+{
+	return ARRAY_SIZE(ipoib_stats_keys);
+}
+
+static void ipoib_get_ethtool_stats(struct net_device *dev,
+				struct ethtool_stats *stats, uint64_t *data)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int index = 0;
+
+	/* Get LRO statistics */
+	data[index++] = priv->lro.lro_mgr.stats.aggregated;
+	data[index++] = priv->lro.lro_mgr.stats.flushed;
+	if (priv->lro.lro_mgr.stats.flushed)
+		data[index++] = priv->lro.lro_mgr.stats.aggregated /
+				priv->lro.lro_mgr.stats.flushed;
+	else
+		data[index++] = 0;
+	data[index++] = priv->lro.lro_mgr.stats.no_desc;
+}
+
+static const struct ethtool_ops ipoib_ethtool_ops = {
+	.get_drvinfo		= ipoib_get_drvinfo,
+	.get_tx_csum		= ethtool_op_get_tx_csum,
+	.get_sg			= ethtool_op_get_sg,
+	.get_tso		= ethtool_op_get_tso,
+	.get_ufo		= ethtool_op_get_ufo,
+	.get_coalesce		= ipoib_get_coalesce,
+	.set_coalesce		= ipoib_set_coalesce,
+	.get_strings		= ipoib_get_strings,
+	.get_stats_count 	= ipoib_get_stats_count,
+	.get_ethtool_stats	= ipoib_get_ethtool_stats,
+};
+
+void ipoib_set_ethtool_ops(struct net_device *dev)
+{
+	SET_ETHTOOL_OPS(dev, &ipoib_ethtool_ops);
+}
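
With this file in place, the coalescing parameters and LRO counters are reachable through standard ethtool. A usage sketch ("ib0" and the values are examples; per the comment in ipoib_set_coalesce(), the rx parameters dictate the configuration):

	ethtool -C ib0 rx-usecs 10 rx-frames 16   # forwarded to ib_modify_cq()
	ethtool -S ib0                            # dumps the four LRO counters
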
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_etool.c b/drivers/infiniband/ulp/ipoib/ipoib_etool.c
deleted file mode 100644
index 4f9211b..0000000
--- a/drivers/infiniband/ulp/ipoib/ipoib_etool.c
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * $Id: ipoib_etool.c  $
- */
-
-#include <linux/kernel.h>
-#include <linux/ethtool.h>
-#include <linux/netdevice.h>
-
-#include "ipoib.h"
-
-static void ipoib_get_drvinfo(struct net_device *netdev,
-			      struct ethtool_drvinfo *drvinfo)
-{
-	strncpy(drvinfo->driver, "ipoib", sizeof(drvinfo->driver) - 1);
-}
-
-static int ipoib_get_coalesce(struct net_device *dev,
-			      struct ethtool_coalesce *coal)
-{
-	struct ipoib_dev_priv *priv = netdev_priv(dev);
-
-	coal->rx_coalesce_usecs = priv->etool.coalesce_usecs;
-	coal->tx_coalesce_usecs = priv->etool.coalesce_usecs;
-	coal->rx_max_coalesced_frames = priv->etool.max_coalesced_frames;
-	coal->tx_max_coalesced_frames = priv->etool.max_coalesced_frames;
-
-	return 0;
-}
-
-static int ipoib_set_coalesce(struct net_device *dev,
-                             struct ethtool_coalesce *coal)
-{
-	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	int ret;
-
-	if (coal->rx_coalesce_usecs > 0xffff            ||
-	    coal->tx_coalesce_usecs > 0xffff            ||
-	    coal->rx_max_coalesced_frames > 0xffff      ||
-	    coal->tx_max_coalesced_frames > 0xffff)
-		return -EINVAL;
-
-	ret = ib_modify_cq(priv->rcq, coal->rx_max_coalesced_frames,
-	coal->rx_coalesce_usecs);
-	if (ret) {
-			ipoib_dbg(priv, "failed modifying CQ\n");
-			return ret;
-	}
-
-	coal->tx_coalesce_usecs = coal->rx_coalesce_usecs;
-	priv->etool.coalesce_usecs = coal->rx_coalesce_usecs;
-	coal->tx_max_coalesced_frames = coal->rx_max_coalesced_frames;
-	priv->etool.max_coalesced_frames = coal->rx_max_coalesced_frames;
-
-	return 0;
-}
-
-static const struct ethtool_ops ipoib_ethtool_ops = {
-	.get_drvinfo            = ipoib_get_drvinfo,
-	.get_tso		= ethtool_op_get_tso,
-	.get_coalesce           = ipoib_get_coalesce,
-	.set_coalesce           = ipoib_set_coalesce,
-};
-
-void ipoib_set_ethtool_ops(struct net_device *dev)
-{
-	SET_ETHTOOL_OPS(dev, &ipoib_ethtool_ops);
-}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/drivers/infiniband/ulp/ipoib/ipoib_fs.c
index 44c1741..33a1590 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_fs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_fs.c
@@ -28,8 +28,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ipoib_fs.c 1389 2004-12-27 22:56:47Z roland $
  */
 
 #include <linux/err.h>
@@ -124,7 +122,7 @@ static int ipoib_mcg_seq_show(struct seq_file *file, void *iter_ptr)
 	return 0;
 }
 
-static struct seq_operations ipoib_mcg_seq_ops = {
+static const struct seq_operations ipoib_mcg_seq_ops = {
 	.start = ipoib_mcg_seq_start,
 	.next  = ipoib_mcg_seq_next,
 	.stop  = ipoib_mcg_seq_stop,
@@ -136,7 +134,7 @@ static int ipoib_mcg_open(struct inode *inode, struct file *file)
 	struct seq_file *seq;
 	int ret;
 
-	ret = seq_open(file, &ipoib_mcg_seq_ops);
+	ret = seq_open(file, (struct seq_operations *) &ipoib_mcg_seq_ops);
 	if (ret)
 		return ret;
 
@@ -230,7 +228,7 @@ static int ipoib_path_seq_show(struct seq_file *file, void *iter_ptr)
 	return 0;
 }
 
-static struct seq_operations ipoib_path_seq_ops = {
+static const struct seq_operations ipoib_path_seq_ops = {
 	.start = ipoib_path_seq_start,
 	.next  = ipoib_path_seq_next,
 	.stop  = ipoib_path_seq_stop,
@@ -242,7 +240,7 @@ static int ipoib_path_open(struct inode *inode, struct file *file)
 	struct seq_file *seq;
 	int ret;
 
-	ret = seq_open(file, &ipoib_path_seq_ops);
+	ret = seq_open(file, (struct seq_operations *) &ipoib_path_seq_ops);
 	if (ret)
 		return ret;
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_helper.c b/drivers/infiniband/ulp/ipoib/ipoib_helper.c
new file mode 100644
index 0000000..32699bb
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_helper.c
@@ -0,0 +1,63 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <net/neighbour.h>
+
+MODULE_AUTHOR("Eli Cohen");
+MODULE_DESCRIPTION("container for ipoib neighbour destructor");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static DEFINE_SPINLOCK(spl);
+static int busy;
+
+static void (*cleanup_func)(struct neighbour *n);
+
+int ipoib_set_cleanup_function(void (*func)(struct neighbour *n))
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&spl, flags);
+	if (busy) {
+		spin_unlock_irqrestore(&spl, flags);
+		return -EBUSY;
+	}
+	cleanup_func = func;
+	spin_unlock_irqrestore(&spl, flags);
+
+	return 0;
+}
+
+void ipoib_neigh_cleanup_container(struct neighbour *n)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&spl, flags);
+	busy = 1;
+	spin_unlock_irqrestore(&spl, flags);
+	if (cleanup_func)
+		cleanup_func(n);
+
+	spin_lock_irqsave(&spl, flags);
+	busy = 0;
+	spin_unlock_irqrestore(&spl, flags);
+}
+
+
+EXPORT_SYMBOL(ipoib_set_cleanup_function);
+EXPORT_SYMBOL(ipoib_neigh_cleanup_container);
+
+static int __init ipoib_helper_init(void)
+{
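+	/*
+	 * Pin ourselves permanently: the destructor pointer handed to
+	 * the neighbour layer must stay valid across ib_ipoib reloads.
+	 */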
+	if (!try_module_get(THIS_MODULE))
+		return -ENODEV;
+
+	return 0;
+}
+
+static void __exit ipoib_helper_cleanup(void)
+{
+}
+
+module_init(ipoib_helper_init);
+module_exit(ipoib_helper_cleanup);
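
ipoib_helper solves a module-lifetime problem: the destructor pointer
installed in struct neigh_parms can still be called after ib_ipoib has
been unloaded, so the pointer must live in a module that never goes
away, which is what the unconditional try_module_get() above arranges.
A minimal consumer sketch, matching the calls this patch adds to
ipoib_main.c further down (and assuming the prototypes are made
visible through ipoib.h):

#include <linux/delay.h>
#include <linux/module.h>

static int __init example_consumer_init(void)
{
	/* route neighbour destruction through the pinned helper */
	return ipoib_set_cleanup_function(ipoib_neigh_cleanup);
}

static void __exit example_consumer_exit(void)
{
	/* -EBUSY means a destructor call is in flight; retry until clear */
	while (ipoib_set_cleanup_function(NULL))
		msleep(10);
}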
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 2f760fb..2bef008 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -31,17 +31,15 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ipoib_ib.c 1386 2004-12-27 16:23:17Z roland $
  */
 
 #include <linux/delay.h>
 #include <linux/dma-mapping.h>
 #include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
 
 #include <rdma/ib_cache.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
 
 #include "ipoib.h"
 
@@ -90,82 +88,72 @@ void ipoib_free_ah(struct kref *kref)
 	spin_unlock_irqrestore(&priv->lock, flags);
 }
 
-static void clean_pending_receives(struct ipoib_dev_priv *priv)
+static void ipoib_ud_dma_unmap_rx(struct ipoib_dev_priv *priv,
+				  u64 mapping[IPOIB_UD_RX_SG])
 {
-	int i;
-	int id;
-
-	for (i = 0; i < priv->rx_outst; ++i) {
-		id = priv->rx_wr_draft[i].wr_id & ~IPOIB_OP_RECV;
-		ipoib_sg_dma_unmap_rx(priv,
-				      priv->rx_ring[id].mapping);
-		dev_kfree_skb_any(priv->rx_ring[id].skb);
-		priv->rx_ring[id].skb = NULL;
-	}
-	priv->rx_outst = 0;
+	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+		ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_UD_HEAD_SIZE,
+				    DMA_FROM_DEVICE);
+		ib_dma_unmap_page(priv->ca, mapping[1], PAGE_SIZE,
+				  DMA_FROM_DEVICE);
+	} else
+		ib_dma_unmap_single(priv->ca, mapping[0],
+				    IPOIB_UD_BUF_SIZE(priv->max_ib_mtu),
+				    DMA_FROM_DEVICE);
 }
 
-static void ipoib_ud_skb_put_frags(struct ipoib_dev_priv *priv, struct sk_buff *skb,
+static void ipoib_ud_skb_put_frags(struct ipoib_dev_priv *priv,
+				   struct sk_buff *skb,
 				   unsigned int length)
 {
 	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
-	 	unsigned int size;
- 		skb_frag_t *frag = &skb_shinfo(skb)->frags[0];
-
-	 	/* put header into skb */
- 		size = min(length, (unsigned)IPOIB_UD_HEAD_SIZE);
- 		skb->tail += size;
- 		skb->len += size;
- 		length -= size;
-
- 		size = min(length, (unsigned) PAGE_SIZE);
- 		frag->size = size;
- 		skb->data_len += size;
- 		skb->truesize += size;
- 		skb->len += size;
- 		length -= size;
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[0];
+		unsigned int size;
+		/*
+		 * Only two buffers are needed for max_payload = 4K;
+		 * the first buffer is IPOIB_UD_HEAD_SIZE bytes long.
+		 */
+		skb->tail += IPOIB_UD_HEAD_SIZE;
+		skb->len  += length;
+
+		size = length - IPOIB_UD_HEAD_SIZE;
+
+		frag->size     = size;
+		skb->data_len += size;
+		skb->truesize += size;
 	} else
 		skb_put(skb, length);
+
 }
 
 static int ipoib_ib_post_receive(struct net_device *dev, int id)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_recv_wr *bad_wr;
-	int ret = 0;
-	int i = priv->rx_outst;
-
-	priv->sglist_draft[i][0].addr = priv->rx_ring[id].mapping[0];
-	priv->sglist_draft[i][1].addr = priv->rx_ring[id].mapping[1];
-
-	priv->rx_wr_draft[i].wr_id = id | IPOIB_OP_RECV;
-
-	if (++priv->rx_outst == UD_POST_RCV_COUNT) {
-		ret = ib_post_recv(priv->qp, priv->rx_wr_draft, &bad_wr);
-
-		if (unlikely(ret)) {
-			ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
-			while (bad_wr) {
-				id = bad_wr->wr_id & ~IPOIB_OP_RECV;
-				ipoib_sg_dma_unmap_rx(priv,
-						      priv->rx_ring[id].mapping);
-				dev_kfree_skb_any(priv->rx_ring[id].skb);
-				priv->rx_ring[id].skb = NULL;
-				bad_wr = bad_wr->next;
-			}
-		}
-		priv->rx_outst = 0;
+	int ret;
+
+	priv->rx_wr.wr_id   = id | IPOIB_OP_RECV;
+	priv->rx_sge[0].addr = priv->rx_ring[id].mapping[0];
+	priv->rx_sge[1].addr = priv->rx_ring[id].mapping[1];
+
+	ret = ib_post_recv(priv->qp, &priv->rx_wr, &bad_wr);
+	if (unlikely(ret)) {
+		ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
+		ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[id].mapping);
+		dev_kfree_skb_any(priv->rx_ring[id].skb);
+		priv->rx_ring[id].skb = NULL;
 	}
 
 	return ret;
 }
 
-static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id,
-					   u64 mapping[IPOIB_UD_RX_SG])
+static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct sk_buff *skb;
 	int buf_size;
+	u64 *mapping;
 
 	if (ipoib_ud_need_sg(priv->max_ib_mtu))
 		buf_size = IPOIB_UD_HEAD_SIZE;
@@ -173,9 +161,8 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id,
 		buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
 
 	skb = dev_alloc_skb(buf_size + 4);
-
- 	if (unlikely(!skb))
- 		return NULL;
+	if (unlikely(!skb))
+		return NULL;
 
 	/*
 	 * IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte
@@ -184,32 +171,32 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id,
 	 */
 	skb_reserve(skb, 4);
 
- 	mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size,
- 				       DMA_FROM_DEVICE);
- 	if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) {
- 		dev_kfree_skb_any(skb);
- 		return NULL;
- 	}
+	mapping = priv->rx_ring[id].mapping;
+	mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size,
+				       DMA_FROM_DEVICE);
+	if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0])))
+		goto error;
 
 	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
 		struct page *page = alloc_page(GFP_ATOMIC);
-	 	if (!page)
- 			goto partial_error;
-
-	 	skb_fill_page_desc(skb, 0, page, 0, PAGE_SIZE);
- 		mapping[1] = ib_dma_map_page(priv->ca, skb_shinfo(skb)->frags[0].page,
- 					     0, PAGE_SIZE, DMA_FROM_DEVICE);
-	 	if (unlikely(ib_dma_mapping_error(priv->ca, mapping[1])))
- 			goto partial_error;
+		if (!page)
+			goto partial_error;
+		skb_fill_page_desc(skb, 0, page, 0, PAGE_SIZE);
+		mapping[1] =
+			ib_dma_map_page(priv->ca, skb_shinfo(skb)->frags[0].page,
+					0, PAGE_SIZE, DMA_FROM_DEVICE);
+		if (unlikely(ib_dma_mapping_error(priv->ca, mapping[1])))
+			goto partial_error;
 	}
 
- 	priv->rx_ring[id].skb = skb;
- 	return skb;
+	priv->rx_ring[id].skb = skb;
+	return skb;
 
 partial_error:
 	ib_dma_unmap_single(priv->ca, mapping[0], buf_size, DMA_FROM_DEVICE);
- 	dev_kfree_skb_any(skb);
- 	return NULL;
+error:
+	dev_kfree_skb_any(skb);
+	return NULL;
 }
 
 static int ipoib_ib_post_receives(struct net_device *dev)
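
Worked through, the two-buffer receive layout above exactly covers the
largest UD datagram.  Assuming the ipoib.h definitions this series
introduces (IPOIB_UD_HEAD_SIZE = IB_GRH_BYTES + IPOIB_ENCAP_LEN =
40 + 4 = 44 bytes, IPOIB_UD_BUF_SIZE(mtu) = mtu + IB_GRH_BYTES, and
ipoib_ud_need_sg() true whenever that exceeds PAGE_SIZE), a 4K IB MTU
needs 4096 + 40 = 4136 bytes of receive buffer, while the 44-byte
linear head plus one PAGE_SIZE fragment supplies 44 + 4096 = 4140.
That is why ipoib_ud_skb_put_frags() can unconditionally advance
skb->tail by IPOIB_UD_HEAD_SIZE and push the remainder into frags[0].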
@@ -218,7 +205,7 @@ static int ipoib_ib_post_receives(struct net_device *dev)
 	int i;
 
 	for (i = 0; i < ipoib_recvq_size; ++i) {
-		if (!ipoib_alloc_rx_skb(dev, i, priv->rx_ring[i].mapping)) {
+		if (!ipoib_alloc_rx_skb(dev, i)) {
 			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
 			return -ENOMEM;
 		}
@@ -249,37 +236,42 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 
 	skb  = priv->rx_ring[wr_id].skb;
 
-	/* duplicate the code here, to omit fast path if need-sg condition check */
 	if (unlikely(wc->status != IB_WC_SUCCESS)) {
 		if (wc->status != IB_WC_WR_FLUSH_ERR)
 			ipoib_warn(priv, "failed recv event "
 				   "(status=%d, wrid=%d vend_err %x)\n",
 				   wc->status, wr_id, wc->vendor_err);
-		ipoib_sg_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping);
+		ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping);
 		dev_kfree_skb_any(skb);
 		priv->rx_ring[wr_id].skb = NULL;
 		return;
 	}
+
 	/*
 	 * Drop packets that this interface sent, ie multicast packets
 	 * that the HCA has replicated.
 	 */
 	if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num)
 		goto repost;
+
+	memcpy(mapping, priv->rx_ring[wr_id].mapping,
+	       IPOIB_UD_RX_SG * sizeof *mapping);
+
 	/*
 	 * If we can't allocate a new RX buffer, dump
 	 * this packet and reuse the old buffer.
 	 */
-	if (unlikely(!ipoib_alloc_rx_skb(dev, wr_id, mapping))) {
+	if (unlikely(!ipoib_alloc_rx_skb(dev, wr_id))) {
 		++priv->stats.rx_dropped;
 		goto repost;
 	}
+
 	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
 		       wc->byte_len, wc->slid);
-	ipoib_sg_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping);
+
+	ipoib_ud_dma_unmap_rx(priv, mapping);
 	ipoib_ud_skb_put_frags(priv, skb, wc->byte_len);
-	memcpy(priv->rx_ring[wr_id].mapping, mapping,
-	       IPOIB_UD_RX_SG * sizeof *mapping);
+
 	skb_pull(skb, IB_GRH_BYTES);
 
 	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
@@ -294,19 +286,13 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 	/* XXX get correct PACKET_ type here */
 	skb->pkt_type = PACKET_HOST;
 
-	/* check rx csum */
-	if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->csum_ok)) {
-		/*
-		 * Note: this is a specific requirement for Mellanox
-		 * HW but since it is the only HW currently supporting
-		 * checksum offload I put it here
-		 */
-		skb_reset_network_header(skb);
-		if (ip_hdr(skb)->ihl == 5)
-			skb->ip_summed = CHECKSUM_UNNECESSARY;
-	}
+	if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->csum_ok))
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
 
-	netif_receive_skb(skb);
+	if (dev->features & NETIF_F_LRO)
+		lro_receive_skb(&priv->lro.lro_mgr, skb, NULL);
+	else
+		netif_receive_skb(skb);
 
 repost:
 	if (unlikely(ipoib_ib_post_receive(dev, wr_id)))
@@ -314,9 +300,71 @@ repost:
 			   "for buf %d\n", wr_id);
 }
 
-static void _ipoib_ib_handle_tx_wc(struct net_device *dev, int wr_id)
+static int ipoib_dma_map_tx(struct ib_device *ca,
+			    struct ipoib_tx_buf *tx_req)
+{
+	struct sk_buff *skb = tx_req->skb;
+	u64 *mapping = tx_req->mapping;
+	int i;
+	int off;
+
+	if (skb_headlen(skb)) {
+		mapping[0] = ib_dma_map_single(ca, skb->data, skb_headlen(skb),
+					       DMA_TO_DEVICE);
+		if (unlikely(ib_dma_mapping_error(ca, mapping[0])))
+			return -EIO;
+
+		off = 1;
+	} else
+		off = 0;
+
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+		mapping[i + off] = ib_dma_map_page(ca, frag->page,
+						 frag->page_offset, frag->size,
+						 DMA_TO_DEVICE);
+		if (unlikely(ib_dma_mapping_error(ca, mapping[i + off])))
+			goto partial_error;
+	}
+	return 0;
+
+partial_error:
+	for (; i > 0; --i) {
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
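+		/* frag i - 1 was mapped at mapping[(i - 1) + off],
+		 * i.e. at mapping[i - !off] */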
+		ib_dma_unmap_page(ca, mapping[i - !off], frag->size, DMA_TO_DEVICE);
+	}
+
+	if (off)
+		ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);
+
+	return -EIO;
+}
+
+static void ipoib_dma_unmap_tx(struct ib_device *ca,
+			       struct ipoib_tx_buf *tx_req)
+{
+	struct sk_buff *skb = tx_req->skb;
+	u64 *mapping = tx_req->mapping;
+	int i;
+	int off;
+
+	if (skb_headlen(skb)) {
+		ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);
+		off = 1;
+	} else
+		off = 0;
+
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+		ib_dma_unmap_page(ca, mapping[i + off], frag->size,
+				  DMA_TO_DEVICE);
+	}
+}
+
+static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	unsigned int wr_id = wc->wr_id;
 	struct ipoib_tx_buf *tx_req;
 
 	ipoib_dbg_data(priv, "send completion: id %d, status: %d\n",
@@ -330,52 +378,35 @@ static void _ipoib_ib_handle_tx_wc(struct net_device *dev, int wr_id)
 
 	tx_req = &priv->tx_ring[wr_id];
 
-	if (tx_req->skb) {
-		ipoib_dma_unmap_tx(priv->ca, tx_req);
-		++priv->stats.tx_packets;
-		priv->stats.tx_bytes += tx_req->skb->len;
-		dev_kfree_skb_any(tx_req->skb);
-	}
+	ipoib_dma_unmap_tx(priv->ca, tx_req);
+
+	++priv->stats.tx_packets;
+	priv->stats.tx_bytes += tx_req->skb->len;
+
+	dev_kfree_skb_any(tx_req->skb);
+
 	++priv->tx_tail;
 	if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
 	    netif_queue_stopped(dev) &&
 	    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
 		netif_wake_queue(dev);
-}
-
-static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
-{
-	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	unsigned int wr_id = wc->wr_id;
-	int i;
-
-	i = priv->tx_poll;
-	do {
-		i &= (ipoib_sendq_size - 1);
-		_ipoib_ib_handle_tx_wc(dev, i);
-	} while (i++ != wr_id);
-	priv->tx_poll = i & (ipoib_sendq_size - 1);
-
-	if (unlikely(wc->status != IB_WC_SUCCESS &&
-		     wc->status != IB_WC_WR_FLUSH_ERR))
 
+	if (wc->status != IB_WC_SUCCESS &&
+	    wc->status != IB_WC_WR_FLUSH_ERR)
 		ipoib_warn(priv, "failed send event "
 			   "(status=%d, wrid=%d vend_err %x)\n",
 			   wc->status, wr_id, wc->vendor_err);
 }
 
-void poll_tx(struct ipoib_dev_priv *priv)
+static int poll_tx(struct ipoib_dev_priv *priv)
 {
 	int n, i;
 
-	while (1) {
-		n = ib_poll_cq(priv->scq, MAX_SEND_CQE, priv->send_wc);
-		for (i = 0; i < n; ++i)
-			ipoib_ib_handle_tx_wc(priv->dev, priv->send_wc + i);
+	n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc);
+	for (i = 0; i < n; ++i)
+		ipoib_ib_handle_tx_wc(priv->dev, priv->send_wc + i);
 
-		if (n < MAX_SEND_CQE)
-			break;
-	}
+	return n == MAX_SEND_CQE;
 }
 
 int ipoib_poll(struct net_device *dev, int *budget)
@@ -393,7 +424,7 @@ poll_more:
 	while (max) {
 
 		t = min(IPOIB_NUM_WC, max);
-		n = ib_poll_cq(priv->rcq, t, priv->ibwc);
+		n = ib_poll_cq(priv->recv_cq, t, priv->ibwc);
 
 		for (i = 0; i < n; i++) {
 			struct ib_wc *wc = priv->ibwc + i;
@@ -406,7 +437,7 @@ poll_more:
 				else
 					ipoib_ib_handle_rx_wc(dev, wc);
 			} else
-                                ipoib_cm_handle_tx_wc(priv->dev, wc);
+				ipoib_cm_handle_tx_wc(priv->dev, wc);
 		}
 
 		if (n != t)
@@ -414,8 +445,11 @@ poll_more:
 	}
 
 	if (max) {
+		if (dev->features & NETIF_F_LRO)
+			lro_flush_all(&priv->lro.lro_mgr);
+
 		netif_rx_complete(dev);
-		if (unlikely(ib_req_notify_cq(priv->rcq,
+		if (unlikely(ib_req_notify_cq(priv->recv_cq,
 					      IB_CQ_NEXT_COMP |
 					      IB_CQ_REPORT_MISSED_EVENTS)) &&
 					      netif_rx_reschedule(dev, 0))
@@ -430,76 +464,30 @@ poll_more:
 	return ret;
 }
 
-void ipoib_ib_rx_completion(struct ib_cq *cq, void *dev_ptr)
+void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
 {
 	netif_rx_schedule(dev_ptr);
 }
 
-static inline int post_zlen_send_wr(struct ipoib_dev_priv *priv, unsigned wrid)
+static void drain_tx_cq(struct net_device *dev)
 {
-	struct ib_send_wr wr = {
-		.opcode = IB_WR_SEND,
-		.send_flags = IB_SEND_SIGNALED,
-		.wr_id = wrid,
-	};
-	struct ib_send_wr *bad_wr;
-
-	if (!priv->own_ah)
-		return -EBUSY;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
 
-	wr.wr.ud.ah = priv->own_ah;
-	wr.wr.ud.remote_qpn = priv->qp->qp_num;
-	wr.wr.ud.remote_qkey = priv->qkey;
-	return ib_post_send(priv->qp, &wr, &bad_wr);
-}
+	netif_tx_lock(dev);
+	while (poll_tx(priv))
+		; /* nothing */
 
-static void ipoib_ib_tx_timer_func(unsigned long dev_ptr)
-{
-	struct net_device *dev = (struct net_device *)dev_ptr;
-	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	unsigned long flags;
-	unsigned int wrid;
-
-	spin_lock_irqsave(&priv->tx_lock, flags);
-	if (((int)priv->tx_tail - (int)priv->tx_head < 0) &&
-		time_after(jiffies, dev->trans_start + 10) &&
-		priv->tx_outstanding < ipoib_sendq_size &&
-		priv->own_ah) {
-		wrid = priv->tx_head & (ipoib_sendq_size - 1);
-		priv->tx_ring[wrid].skb = NULL;
-		if (post_zlen_send_wr(priv, wrid))
-			ipoib_warn(priv, "failed to post zlen send\n");
-		else {
-			++priv->tx_head;
-			if (++priv->tx_outstanding == ipoib_sendq_size - 1) {
-				ipoib_dbg(priv, "%s: TX ring full, "
-					  "stopping kernel net queue\n", __func__);
-				netif_stop_queue(dev);
-			}
-		}
-	}
-	poll_tx(priv);
-	spin_unlock_irqrestore(&priv->tx_lock, flags);
+	if (netif_queue_stopped(dev))
+		mod_timer(&priv->poll_timer, jiffies + 1);
 
-	mod_timer(&priv->poll_timer, jiffies + HZ / 2);
+	netif_tx_unlock(dev);
 }
 
-static void flush_tx_queue(struct ipoib_dev_priv *priv)
+void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr)
 {
-	unsigned long flags;
-	unsigned int wrid;
+	struct ipoib_dev_priv *priv = netdev_priv(dev_ptr);
 
-	spin_lock_irqsave(&priv->tx_lock, flags);
-	wrid = priv->tx_head & (ipoib_sendq_size - 1);
-	priv->tx_ring[wrid].skb = NULL;
-	if (!post_zlen_send_wr(priv, wrid)) {
-		++priv->tx_head;
-		++priv->tx_outstanding;
-	} else
-		ipoib_warn(priv, "post_zlen failed\n");
-
-	poll_tx(priv);
-	spin_unlock_irqrestore(&priv->tx_lock, flags);
+	mod_timer(&priv->poll_timer, jiffies);
 }
 
 static inline int post_send(struct ipoib_dev_priv *priv,
@@ -526,23 +514,18 @@ static inline int post_send(struct ipoib_dev_priv *priv,
 		priv->tx_sge[i + off].addr = mapping[i + off];
 		priv->tx_sge[i + off].length = frags[i].size;
 	}
-	priv->tx_wr.num_sge          = nr_frags + off;
+	priv->tx_wr.num_sge	     = nr_frags + off;
 	priv->tx_wr.wr_id 	     = wr_id;
 	priv->tx_wr.wr.ud.remote_qpn = qpn;
 	priv->tx_wr.wr.ud.ah 	     = address;
 
 	if (head) {
-		priv->tx_wr.wr.ud.mss = skb_shinfo(skb)->gso_size;
+		priv->tx_wr.wr.ud.mss	 = skb_shinfo(skb)->gso_size;
 		priv->tx_wr.wr.ud.header = head;
-		priv->tx_wr.wr.ud.hlen = hlen;
-		priv->tx_wr.opcode      = IB_WR_LSO;
+		priv->tx_wr.wr.ud.hlen	 = hlen;
+		priv->tx_wr.opcode	 = IB_WR_LSO;
 	} else
-		priv->tx_wr.opcode      = IB_WR_SEND;
-
-	if (unlikely((priv->tx_head & (MAX_SEND_CQE - 1)) == MAX_SEND_CQE - 1))
-		priv->tx_wr.send_flags |= IB_SEND_SIGNALED;
-	else
-		priv->tx_wr.send_flags &= ~IB_SEND_SIGNALED;
+		priv->tx_wr.opcode	 = IB_WR_SEND;
 
 	return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr);
 }
@@ -555,7 +538,17 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
 	int hlen;
 	void *phead;
 
-	if (!skb_is_gso(skb)) {
+	if (skb_is_gso(skb)) {
+		hlen = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		phead = skb->data;
+		if (unlikely(!skb_pull(skb, hlen))) {
+			ipoib_warn(priv, "linear data too small\n");
+			++priv->stats.tx_dropped;
+			++priv->stats.tx_errors;
+			dev_kfree_skb_any(skb);
+			return;
+		}
+	} else {
 		if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) {
 			ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
 				   skb->len, priv->mcast_mtu + IPOIB_ENCAP_LEN);
@@ -564,24 +557,8 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
 			ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu);
 			return;
 		}
-		phead = 0;
-		hlen = 0;
-	} else {
-		/*
-		 * LSO header is limited to max 60 bytes
-		 */
-		if (unlikely((ip_hdr(skb)->ihl + tcp_hdr(skb)->doff) > 15)) {
-			ipoib_warn(priv, "ip(%d) and tcp(%d) headers too long, dropping skb\n",
-				   ip_hdr(skb)->ihl << 2, tcp_hdr(skb)->doff << 2);
-			goto drop;
-		}
-
-		hlen = ((ip_hdr(skb)->ihl + tcp_hdr(skb)->doff) << 2) + IPOIB_ENCAP_LEN;
-		phead = skb->data;
-		if (unlikely(!skb_pull(skb, hlen))) {
-			ipoib_warn(priv, "linear data too small\n");
-			goto drop;
-		}
+		phead = NULL;
+		hlen  = 0;
 	}
 
 	ipoib_dbg_data(priv, "sending packet, length=%d address=%p qpn=0x%06x\n",
@@ -602,19 +579,27 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
 		return;
 	}
 
-	if (priv->ca->flags & IB_DEVICE_IP_CSUM &&
-	    skb->ip_summed == CHECKSUM_PARTIAL)
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
 		priv->tx_wr.send_flags |= IB_SEND_IP_CSUM;
 	else
 		priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
 
+	if (++priv->tx_outstanding == ipoib_sendq_size) {
+		ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
+		if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP))
+			ipoib_warn(priv, "request notify on send CQ failed\n");
+		netif_stop_queue(dev);
+	}
+
 	if (unlikely(post_send(priv, priv->tx_head & (ipoib_sendq_size - 1),
-			       address->ah, qpn,
-			       tx_req, phead, hlen))) {
+			       address->ah, qpn, tx_req, phead, hlen))) {
 		ipoib_warn(priv, "post_send failed\n");
 		++priv->stats.tx_errors;
+		--priv->tx_outstanding;
 		ipoib_dma_unmap_tx(priv->ca, tx_req);
 		dev_kfree_skb_any(skb);
+		if (netif_queue_stopped(dev))
+			netif_wake_queue(dev);
 	} else {
 		dev->trans_start = jiffies;
 
@@ -622,22 +607,11 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
 		++priv->tx_head;
 		skb_orphan(skb);
 
-		if (++priv->tx_outstanding == (ipoib_sendq_size - 1)) {
-			ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
-			netif_stop_queue(dev);
-		}
 	}
 
-	if (unlikely(priv->tx_outstanding > MAX_SEND_CQE + 1))
-		poll_tx(priv);
-
-
-	return;
-
-drop:
-	++priv->stats.tx_errors;
-	dev_kfree_skb_any(skb);
-	return;
+	if (unlikely(priv->tx_outstanding > MAX_SEND_CQE))
+		while (poll_tx(priv))
+			; /* nothing */
 }
 
 static void __ipoib_reap_ah(struct net_device *dev)
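
The send path now flow-controls itself in two halves.  The producer,
ipoib_send(), stops the queue when tx_outstanding reaches
ipoib_sendq_size, arming a send-CQ notification before calling
netif_stop_queue(): if the final completions race in between the
increment and the stop, the notification still fires,
ipoib_send_comp_handler() kicks the poll timer, and drain_tx_cq()
polls under netif_tx_lock, re-arming the timer for one jiffy later as
long as the queue stays stopped.  The consumer,
ipoib_ib_handle_tx_wc(), wakes the queue once the ring drains back to
half full, e.g. at 32 outstanding sends for a 64-entry send queue.  In
addition, ipoib_send() polls the send CQ inline whenever more than
MAX_SEND_CQE sends are outstanding, so completions keep flowing even
without interrupts.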
@@ -645,17 +619,20 @@ static void __ipoib_reap_ah(struct net_device *dev)
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_ah *ah, *tah;
 	LIST_HEAD(remove_list);
+	unsigned long flags;
+
+	netif_tx_lock_bh(dev);
+	spin_lock_irqsave(&priv->lock, flags);
 
-	spin_lock_irq(&priv->tx_lock);
-	spin_lock(&priv->lock);
 	list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list)
 		if ((int) priv->tx_tail - (int) ah->last_send >= 0) {
 			list_del(&ah->list);
 			ib_destroy_ah(ah->ah);
 			kfree(ah);
 		}
-	spin_unlock(&priv->lock);
-	spin_unlock_irq(&priv->tx_lock);
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+	netif_tx_unlock_bh(dev);
 }
 
 void ipoib_reap_ah(struct work_struct *work)
@@ -671,6 +648,30 @@ void ipoib_reap_ah(struct work_struct *work)
 				   round_jiffies_relative(HZ));
 }
 
+static void ipoib_ah_dev_cleanup(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	unsigned long begin;
+
+	begin = jiffies;
+
+	while (!list_empty(&priv->dead_ahs)) {
+		__ipoib_reap_ah(dev);
+
+		if (time_after(jiffies, begin + HZ)) {
+			ipoib_warn(priv, "timing out; will leak address handles\n");
+			break;
+		}
+
+		msleep(1);
+	}
+}
+
+static void ipoib_ib_tx_timer_func(unsigned long ctx)
+{
+	drain_tx_cq((struct net_device *)ctx);
+}
+
 int ipoib_ib_dev_open(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -710,8 +711,6 @@ int ipoib_ib_dev_open(struct net_device *dev)
 	init_timer(&priv->poll_timer);
 	priv->poll_timer.function = ipoib_ib_tx_timer_func;
 	priv->poll_timer.data = (unsigned long)dev;
-        mod_timer(&priv->poll_timer, jiffies + HZ / 2);
-	set_bit(IPOIB_FLAG_TIME_ON, &priv->flags);
 
 	set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
 
@@ -789,8 +788,16 @@ void ipoib_drain_cq(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	int i, n;
+
+	/*
+	 * We call completion handling routines that expect to be
+	 * called from the BH-disabled NAPI poll context, so disable
+	 * BHs here too.
+	 */
+	local_bh_disable();
+
 	do {
-		n = ib_poll_cq(priv->rcq, IPOIB_NUM_WC, priv->ibwc);
+		n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc);
 		for (i = 0; i < n; ++i) {
 			/*
 			 * Convert any successful completions to flush
@@ -805,14 +812,15 @@ void ipoib_drain_cq(struct net_device *dev)
 					ipoib_cm_handle_rx_wc(dev, priv->ibwc + i);
 				else
 					ipoib_ib_handle_rx_wc(dev, priv->ibwc + i);
-			} else {
-				if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
-					ipoib_cm_handle_tx_wc(dev, priv->ibwc + i);
-				else
-					ipoib_ib_handle_tx_wc(dev, priv->ibwc + i);
-			}
+			} else
+				ipoib_cm_handle_tx_wc(dev, priv->ibwc + i);
 		}
 	} while (n == IPOIB_NUM_WC);
+
+	while (poll_tx(priv))
+		; /* nothing */
+
+	local_bh_enable();
 }
 
 int ipoib_ib_dev_stop(struct net_device *dev, int flush)
@@ -820,20 +828,13 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush)
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_qp_attr qp_attr;
 	unsigned long begin;
+	struct ipoib_tx_buf *tx_req;
 	int i;
-	unsigned long flags;
-	int timer_works;
-
-	timer_works = test_and_clear_bit(IPOIB_FLAG_TIME_ON, &priv->flags);
-	if (timer_works)
-		del_timer_sync(&priv->poll_timer);
 
 	clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
 	netif_poll_disable(dev);
 
 	ipoib_cm_dev_stop(dev);
-	if (timer_works)
-		flush_tx_queue(priv);
 
 	/*
 	 * Move our QP to the error state and then reinitialize in
@@ -843,7 +844,6 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush)
 	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
 		ipoib_warn(priv, "Failed to modify QP to ERROR state\n");
 
-	clean_pending_receives(priv);
 	/* Wait for all sends and receives to complete */
 	begin = jiffies;
 
@@ -856,13 +856,22 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush)
 			 * assume the HW is wedged and just free up
 			 * all our pending work requests.
 			 */
+			while ((int) priv->tx_tail - (int) priv->tx_head < 0) {
+				tx_req = &priv->tx_ring[priv->tx_tail &
+							(ipoib_sendq_size - 1)];
+				ipoib_dma_unmap_tx(priv->ca, tx_req);
+				dev_kfree_skb_any(tx_req->skb);
+				++priv->tx_tail;
+				--priv->tx_outstanding;
+			}
+
 			for (i = 0; i < ipoib_recvq_size; ++i) {
-				struct ipoib_sg_rx_buf *rx_req;
+				struct ipoib_rx_buf *rx_req;
 
 				rx_req = &priv->rx_ring[i];
 				if (!rx_req->skb)
 					continue;
-				ipoib_sg_dma_unmap_rx(priv,
+				ipoib_ud_dma_unmap_rx(priv,
 						      priv->rx_ring[i].mapping);
 				dev_kfree_skb_any(rx_req->skb);
 				rx_req->skb = NULL;
@@ -871,12 +880,6 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush)
 			goto timeout;
 		}
 
-		if ((int) priv->tx_tail - (int) priv->tx_head < 0) {
-			spin_lock_irqsave(&priv->tx_lock, flags);
-			poll_tx(priv);
-			spin_unlock_irqrestore(&priv->tx_lock, flags);
-		}
-
 		ipoib_drain_cq(dev);
 
 		msleep(1);
@@ -885,7 +888,12 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush)
 	ipoib_dbg(priv, "All sends and receives done.\n");
 
 timeout:
-	destroy_own_ah(priv);
+	/* Make sure the timer was initialized */
+	if (priv->poll_timer.function) {
+		del_timer_sync(&priv->poll_timer);
+		memset(&priv->poll_timer, 0, sizeof priv->poll_timer);
+	}
+
 	qp_attr.qp_state = IB_QPS_RESET;
 	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
 		ipoib_warn(priv, "Failed to modify QP to RESET state\n");
@@ -896,21 +904,10 @@ timeout:
 	if (flush)
 		flush_workqueue(ipoib_workqueue);
 
-	begin = jiffies;
-
-	while (!list_empty(&priv->dead_ahs)) {
-		__ipoib_reap_ah(dev);
-
-		if (time_after(jiffies, begin + HZ)) {
-			ipoib_warn(priv, "timing out; will leak address handles\n");
-			break;
-		}
-
-		msleep(1);
-	}
+	ipoib_ah_dev_cleanup(dev);
 
 	netif_poll_enable(dev);
-	ib_req_notify_cq(priv->rcq, IB_CQ_NEXT_COMP);
+	ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP);
 
 	return 0;
 }
@@ -938,7 +935,8 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
 	return 0;
 }
 
-static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
+static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
+				enum ipoib_flush_level level)
 {
 	struct ipoib_dev_priv *cpriv;
 	struct net_device *dev = priv->dev;
@@ -951,7 +949,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 	 * the parent is down.
 	 */
 	list_for_each_entry(cpriv, &priv->child_intfs, list)
-		__ipoib_ib_dev_flush(cpriv, pkey_event);
+		__ipoib_ib_dev_flush(cpriv, level);
 
 	mutex_unlock(&priv->vlan_mutex);
 
@@ -965,7 +963,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 		return;
 	}
 
-	if (pkey_event) {
+	if (level == IPOIB_FLUSH_HEAVY) {
 		if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) {
 			clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
 			ipoib_ib_dev_down(dev, 0);
@@ -983,43 +981,52 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 		priv->pkey_index = new_index;
 	}
 
-	ipoib_dbg(priv, "flushing\n");
+	if (level == IPOIB_FLUSH_LIGHT) {
+		ipoib_mark_paths_invalid(dev);
+		ipoib_mcast_dev_flush(dev);
+	}
 
-	ipoib_ib_dev_down(dev, 0);
+	if (level >= IPOIB_FLUSH_NORMAL)
+		ipoib_ib_dev_down(dev, 0);
 
-	if (pkey_event) {
+	if (level == IPOIB_FLUSH_HEAVY) {
 		ipoib_ib_dev_stop(dev, 0);
 		ipoib_ib_dev_open(dev);
 	}
 
-	destroy_own_ah(priv);
-
 	/*
 	 * The device could have been brought down between the start and when
 	 * we get here, don't bring it back up if it's not configured up
 	 */
 	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
-		ipoib_ib_dev_up(dev);
+		if (level >= IPOIB_FLUSH_NORMAL)
+			ipoib_ib_dev_up(dev);
 		ipoib_mcast_restart_task(&priv->restart_task);
 	}
 }
 
-void ipoib_ib_dev_flush(struct work_struct *work)
+void ipoib_ib_dev_flush_light(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv =
+		container_of(work, struct ipoib_dev_priv, flush_light);
+
+	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT);
+}
+
+void ipoib_ib_dev_flush_normal(struct work_struct *work)
 {
 	struct ipoib_dev_priv *priv =
-		container_of(work, struct ipoib_dev_priv, flush_task);
+		container_of(work, struct ipoib_dev_priv, flush_normal);
 
-	ipoib_dbg(priv, "Flushing %s\n", priv->dev->name);
-	__ipoib_ib_dev_flush(priv, 0);
+	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL);
 }
 
-void ipoib_pkey_event(struct work_struct *work)
+void ipoib_ib_dev_flush_heavy(struct work_struct *work)
 {
 	struct ipoib_dev_priv *priv =
-		container_of(work, struct ipoib_dev_priv, pkey_event_task);
+		container_of(work, struct ipoib_dev_priv, flush_heavy);
 
-	ipoib_dbg(priv, "Flushing %s and restarting its QP\n", priv->dev->name);
-	__ipoib_ib_dev_flush(priv, 1);
+	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY);
 }
 
 void ipoib_ib_dev_cleanup(struct net_device *dev)
@@ -1031,6 +1038,7 @@ void ipoib_ib_dev_cleanup(struct net_device *dev)
 	ipoib_mcast_stop_thread(dev, 1);
 	ipoib_mcast_dev_flush(dev);
 
+	ipoib_ah_dev_cleanup(dev);
 	ipoib_transport_dev_cleanup(dev);
 }
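
The old pkey_event flag gives way to a three-level flush.
IPOIB_FLUSH_LIGHT only marks cached path records invalid and flushes
the multicast list (ipoib_change_mtu() below queues it when the MTU
changes); IPOIB_FLUSH_NORMAL additionally takes the IB side of the
interface down and, if the device is administratively up, back up;
IPOIB_FLUSH_HEAVY also re-looks-up the P_Key index and stops and
reopens the QP so the new index takes effect.  The
level >= IPOIB_FLUSH_NORMAL comparisons rely on ipoib.h declaring the
enum values in exactly that order.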
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index f7028ff..53409b6 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -30,8 +30,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ipoib_main.c 1377 2004-12-23 19:57:12Z roland $
  */
 
 #include "ipoib.h"
@@ -41,6 +39,8 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/kernel.h>
+#include <linux/vmalloc.h>
+#include <linux/delay.h>
 
 #include <linux/if_arp.h>	/* For ARPHRD_xxx */
 
@@ -48,7 +48,6 @@
 #include <linux/in.h>
 
 #include <net/dst.h>
-#include <linux/vmalloc.h>
 
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
@@ -62,6 +61,15 @@ MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
 module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
 MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");
 
+static int lro;
+module_param(lro, bool, 0444);
+MODULE_PARM_DESC(lro, "Enable LRO (Large Receive Offload)");
+
+static int lro_max_aggr = IPOIB_LRO_MAX_AGGR;
+module_param(lro_max_aggr, int, 0644);
+MODULE_PARM_DESC(lro_max_aggr, "LRO: Max packets to be aggregated "
+		"(default = 64)");
+
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
 int ipoib_debug_level;
 
@@ -107,7 +115,7 @@ int ipoib_open(struct net_device *dev)
 		return 0;
 
 	if (ipoib_ib_dev_open(dev))
 		return -EINVAL;
 
 	if (ipoib_ib_dev_up(dev)) {
 		ipoib_ib_dev_stop(dev, 1);
@@ -146,14 +154,8 @@ static int ipoib_stop(struct net_device *dev)
 
 	netif_stop_queue(dev);
 
-	/*
-	 * Now flush workqueue to make sure a scheduled task doesn't
-	 * bring our internal state back up.
-	 */
-	flush_workqueue(ipoib_workqueue);
-
-	ipoib_ib_dev_down(dev, 1);
-	ipoib_ib_dev_stop(dev, 1);
+	ipoib_ib_dev_down(dev, 0);
+	ipoib_ib_dev_stop(dev, 0);
 
 	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
 		struct ipoib_dev_priv *cpriv;
@@ -187,6 +189,7 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
 		if (new_mtu > priv->mcast_mtu)
 			ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
 				   priv->mcast_mtu);
+
 		dev->mtu = new_mtu;
 		return 0;
 	}
@@ -198,7 +201,7 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
 
 	dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
 
-	queue_work(ipoib_workqueue, &priv->flush_task);
+	queue_work(ipoib_workqueue, &priv->flush_light);
 
 	return 0;
 }
@@ -348,17 +351,34 @@ void ipoib_path_iter_read(struct ipoib_path_iter *iter,
 
 #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
 
+void ipoib_mark_paths_invalid(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_path *path, *tp;
+
+	spin_lock_irq(&priv->lock);
+
+	list_for_each_entry_safe(path, tp, &priv->path_list, list) {
+		ipoib_dbg(priv, "mark path LID 0x%04x GID " IPOIB_GID_FMT " invalid\n",
+			be16_to_cpu(path->pathrec.dlid),
+			IPOIB_GID_ARG(path->pathrec.dgid));
+		path->valid = 0;
+	}
+
+	spin_unlock_irq(&priv->lock);
+}
+
 void ipoib_flush_paths(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_path *path, *tp;
 	LIST_HEAD(remove_list);
+	unsigned long flags;
 
-	spin_lock_irq(&priv->tx_lock);
-	spin_lock(&priv->lock);
+	netif_tx_lock_bh(dev);
+	spin_lock_irqsave(&priv->lock, flags);
 
-	list_splice(&priv->path_list, &remove_list);
-	INIT_LIST_HEAD(&priv->path_list);
+	list_splice_init(&priv->path_list, &remove_list);
 
 	list_for_each_entry(path, &remove_list, list)
 		rb_erase(&path->rb_node, &priv->path_tree);
@@ -366,15 +386,16 @@ void ipoib_flush_paths(struct net_device *dev)
 	list_for_each_entry_safe(path, tp, &remove_list, list) {
 		if (path->query)
 			ib_sa_cancel_query(path->query_id, path->query);
-		spin_unlock(&priv->lock);
-		spin_unlock_irq(&priv->tx_lock);
+		spin_unlock_irqrestore(&priv->lock, flags);
+		netif_tx_unlock_bh(dev);
 		wait_for_completion(&path->done);
 		path_free(dev, path);
-		spin_lock_irq(&priv->tx_lock);
-		spin_lock(&priv->lock);
+		netif_tx_lock_bh(dev);
+		spin_lock_irqsave(&priv->lock, flags);
 	}
-	spin_unlock(&priv->lock);
-	spin_unlock_irq(&priv->tx_lock);
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+	netif_tx_unlock_bh(dev);
 }
 
 static void path_rec_completion(int status,
@@ -412,7 +433,7 @@ static void path_rec_completion(int status,
 	if (ah) {
 		path->pathrec = *pathrec;
 
-		old_ah = path->ah;
+		old_ah   = path->ah;
 		path->ah = ah;
 
 		ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
@@ -455,6 +476,7 @@ static void path_rec_completion(int status,
 			while ((skb = __skb_dequeue(&neigh->queue)))
 				__skb_queue_tail(&skqueue, skb);
 		}
+		path->valid = 1;
 	}
 
 	path->query = NULL;
@@ -492,8 +514,8 @@ static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid)
 	INIT_LIST_HEAD(&path->neigh_list);
 
 	memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid));
-	path->pathrec.sgid          = priv->local_gid;
-	path->pathrec.pkey          = cpu_to_be16(priv->pkey);
+	path->pathrec.sgid	    = priv->local_gid;
+	path->pathrec.pkey	    = cpu_to_be16(priv->pkey);
 	path->pathrec.numb_path     = 1;
 	path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;
 
@@ -505,38 +527,40 @@ static int path_rec_start(struct net_device *dev,
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	ib_sa_comp_mask comp_mask = IB_SA_PATH_REC_MTU_SELECTOR | IB_SA_PATH_REC_MTU;
+	struct ib_sa_path_rec p_rec;
 
-	path->pathrec.mtu_selector = IB_SA_GT;
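+	/*
+	 * Query on a stack copy so the MTU-selector tweaks below never
+	 * dirty the cached path record.
+	 */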
+	p_rec = path->pathrec;
+	p_rec.mtu_selector = IB_SA_GT;
 
 	switch (roundup_pow_of_two(dev->mtu + IPOIB_ENCAP_LEN)) {
 	case 512:
-		path->pathrec.mtu = IB_MTU_256;
+		p_rec.mtu = IB_MTU_256;
 		break;
 	case 1024:
-		path->pathrec.mtu = IB_MTU_512;
+		p_rec.mtu = IB_MTU_512;
 		break;
 	case 2048:
-		path->pathrec.mtu = IB_MTU_1024;
+		p_rec.mtu = IB_MTU_1024;
 		break;
 	case 4096:
-		path->pathrec.mtu = IB_MTU_2048;
+		p_rec.mtu = IB_MTU_2048;
 		break;
 	default:
 		/* Wildcard everything */
 		comp_mask = 0;
-		path->pathrec.mtu = 0;
-		path->pathrec.mtu_selector = 0;
+		p_rec.mtu = 0;
+		p_rec.mtu_selector = 0;
 	}
 
 	ipoib_dbg(priv, "Start path record lookup for " IPOIB_GID_FMT " MTU > %d\n",
-		  IPOIB_GID_ARG(path->pathrec.dgid),
-		  comp_mask ? ib_mtu_enum_to_int(path->pathrec.mtu) : 0);
+		  IPOIB_GID_ARG(p_rec.dgid),
+		  comp_mask ? ib_mtu_enum_to_int(p_rec.mtu) : 0);
 
 	init_completion(&path->done);
 
 	path->query_id =
 		ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,
-				   &path->pathrec, comp_mask    |
+				   &p_rec, comp_mask		|
 				   IB_SA_PATH_REC_DGID		|
 				   IB_SA_PATH_REC_SGID		|
 				   IB_SA_PATH_REC_NUMB_PATH	|
@@ -546,8 +570,9 @@ static int path_rec_start(struct net_device *dev,
 				   path_rec_completion,
 				   path, &path->query);
 	if (path->query_id < 0) {
-		ipoib_warn(priv, "ib_sa_path_rec_get failed\n");
+		ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id);
 		path->query = NULL;
+		complete(&path->done);
 		return path->query_id;
 	}
 
@@ -559,6 +584,7 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev)
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_path *path;
 	struct ipoib_neigh *neigh;
+	unsigned long flags;
 
 	neigh = ipoib_neigh_alloc(skb->dst->neighbour, skb->dev);
 	if (!neigh) {
@@ -567,11 +593,7 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev)
 		return;
 	}
 
-	/*
-	 * We can only be called from ipoib_start_xmit, so we're
-	 * inside tx_lock -- no need to save/restore flags.
-	 */
-	spin_lock(&priv->lock);
+	spin_lock_irqsave(&priv->lock, flags);
 
 	path = __path_find(dev, skb->dst->neighbour->ha + 4);
 	if (!path) {
@@ -618,7 +640,7 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev)
 		__skb_queue_tail(&neigh->queue, skb);
 	}
 
-	spin_unlock(&priv->lock);
+	spin_unlock_irqrestore(&priv->lock, flags);
 	return;
 
 err_list:
@@ -630,7 +652,7 @@ err_drop:
 	++priv->stats.tx_dropped;
 	dev_kfree_skb_any(skb);
 
-	spin_unlock(&priv->lock);
+	spin_unlock_irqrestore(&priv->lock, flags);
 }
 
 static void ipoib_path_lookup(struct sk_buff *skb, struct net_device *dev)
@@ -654,24 +676,26 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_path *path;
+	unsigned long flags;
 
-	/*
-	 * We can only be called from ipoib_start_xmit, so we're
-	 * inside tx_lock -- no need to save/restore flags.
-	 */
-	spin_lock(&priv->lock);
+	spin_lock_irqsave(&priv->lock, flags);
 
 	path = __path_find(dev, phdr->hwaddr + 4);
-	if (!path) {
-		path = path_rec_create(dev, phdr->hwaddr + 4);
+	if (!path || !path->valid) {
+		int new_path = 0;
+		if (!path) {
+			path = path_rec_create(dev, phdr->hwaddr + 4);
+			new_path = 1;
+		}
 		if (path) {
 			/* put pseudoheader back on for next time */
 			skb_push(skb, sizeof *phdr);
 			__skb_queue_tail(&path->queue, skb);
 
 			if (!path->query && path_rec_start(dev, path)) {
-				spin_unlock(&priv->lock);
-				path_free(dev, path);
+				spin_unlock_irqrestore(&priv->lock, flags);
+				if (new_path)
+					path_free(dev, path);
 				return;
 			} else
 				__path_add(dev, path);
@@ -680,7 +704,7 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
 			dev_kfree_skb_any(skb);
 		}
 
-		spin_unlock(&priv->lock);
+		spin_unlock_irqrestore(&priv->lock, flags);
 		return;
 	}
 
@@ -699,7 +723,7 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
 		dev_kfree_skb_any(skb);
 	}
 
-	spin_unlock(&priv->lock);
+	spin_unlock_irqrestore(&priv->lock, flags);
 }
 
 static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
@@ -708,63 +732,49 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct ipoib_neigh *neigh;
 	unsigned long flags;
 
-	if (unlikely(!spin_trylock_irqsave(&priv->tx_lock, flags)))
-		return NETDEV_TX_LOCKED;
-
-	/*
-	 * Check if our queue is stopped.  Since we have the LLTX bit
-	 * set, we can't rely on netif_stop_queue() preventing our
-	 * xmit function from being called with a full queue.
-	 */
-	if (unlikely(netif_queue_stopped(dev))) {
-		spin_unlock_irqrestore(&priv->tx_lock, flags);
-		return NETDEV_TX_BUSY;
-	}
-
 	if (likely(skb->dst && skb->dst->neighbour)) {
 		if (unlikely(!*to_ipoib_neigh(skb->dst->neighbour))) {
 			ipoib_path_lookup(skb, dev);
-			goto out;
+			return NETDEV_TX_OK;
 		}
 
 		neigh = *to_ipoib_neigh(skb->dst->neighbour);
 
-		if (neigh->ah)
-			if (unlikely((memcmp(&neigh->dgid.raw,
-					    skb->dst->neighbour->ha + 4,
-					    sizeof(union ib_gid))) ||
-					 (neigh->dev != dev))) {
-				spin_lock(&priv->lock);
-				/*
-				 * It's safe to call ipoib_put_ah() inside
-				 * priv->lock here, because we know that
-				 * path->ah will always hold one more reference,
-				 * so ipoib_put_ah() will never do more than
-				 * decrement the ref count.
-				 */
+		if (unlikely((memcmp(&neigh->dgid.raw,
+				skb->dst->neighbour->ha + 4,
+				sizeof(union ib_gid))) ||
+				(neigh->dev != dev))) {
+			spin_lock_irqsave(&priv->lock, flags);
+			/*
+			 * It's safe to call ipoib_put_ah() inside
+			 * priv->lock here, because we know that
+			 * path->ah will always hold one more reference,
+			 * so ipoib_put_ah() will never do more than
+			 * decrement the ref count.
+			 */
+			if (neigh->ah)
 				ipoib_put_ah(neigh->ah);
-				list_del(&neigh->list);
-				ipoib_neigh_free(dev, neigh);
-				spin_unlock(&priv->lock);
-				ipoib_path_lookup(skb, dev);
-				goto out;
-			}
+			list_del(&neigh->list);
+			ipoib_neigh_free(dev, neigh);
+			spin_unlock_irqrestore(&priv->lock, flags);
+			ipoib_path_lookup(skb, dev);
+			return NETDEV_TX_OK;
+		}
 
 		if (ipoib_cm_get(neigh)) {
 			if (ipoib_cm_up(neigh)) {
 				ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
-				goto out;
+				return NETDEV_TX_OK;
 			}
 		} else if (neigh->ah) {
-			ipoib_send(dev, skb, neigh->ah,
-				   IPOIB_QPN(skb->dst->neighbour->ha));
-			goto out;
+			ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(skb->dst->neighbour->ha));
+			return NETDEV_TX_OK;
 		}
 
 		if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
-			spin_lock(&priv->lock);
+			spin_lock_irqsave(&priv->lock, flags);
 			__skb_queue_tail(&neigh->queue, skb);
-			spin_unlock(&priv->lock);
+			spin_unlock_irqrestore(&priv->lock, flags);
 		} else {
 			++priv->stats.tx_dropped;
 			dev_kfree_skb_any(skb);
@@ -793,16 +803,13 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 					   IPOIB_GID_RAW_ARG(phdr->hwaddr + 4));
 				dev_kfree_skb_any(skb);
 				++priv->stats.tx_dropped;
-				goto out;
+				return NETDEV_TX_OK;
 			}
 
 			unicast_arp_send(skb, dev, phdr);
 		}
 	}
 
-out:
-	spin_unlock_irqrestore(&priv->tx_lock, flags);
-
 	return NETDEV_TX_OK;
 }
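
With the driver-private tx_lock gone, ipoib_start_xmit() no longer
opens with the spin_trylock_irqsave()/NETDEV_TX_LOCKED dance:
NETIF_F_LLTX is dropped from dev->features further down, so the core
once again serializes xmit calls under netif_tx_lock.  The remaining
short critical sections take priv->lock with irqsave because they can
also be entered from completion and SA-callback context.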
 
@@ -874,11 +881,9 @@ static void ipoib_neigh_cleanup(struct neighbour *n)
 		return;
 
 	neigh = *to_ipoib_neigh(n);
-	if (neigh) {
+	if (neigh)
 		priv = netdev_priv(neigh->dev);
-		ipoib_dbg(priv, "neigh_destructor for bonding device: %s\n",
-			  n->dev->name);
-	} else
+	else
 		return;
 	ipoib_dbg(priv,
 		  "neigh_cleanup for %06x " IPOIB_GID_FMT "\n",
@@ -932,75 +937,31 @@ void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh)
 
 static int ipoib_neigh_setup_dev(struct net_device *dev, struct neigh_parms *parms)
 {
-	parms->neigh_cleanup = ipoib_neigh_cleanup;
+	parms->neigh_cleanup = ipoib_neigh_cleanup_container;
 
 	return 0;
 }
 
-int ipoib_vmalloc(struct ipoib_vmap *buf, int size)
-{
-	int	i;
-	int	npages = ALIGN(size, PAGE_SIZE) / PAGE_SIZE;
-	int	ret = -ENOMEM;
-
-	buf->page_arr = kmalloc(npages * sizeof buf->page_arr[0], GFP_KERNEL);
-	if (!buf->page_arr)
-		goto out;
-
-	for (i = 0; i < npages; ++i) {
-		buf->page_arr[i] = alloc_page(GFP_KERNEL);
-		if (!buf->page_arr[i])
-			goto page_fail;
-	}
-
-	buf->npages = npages;
-	buf->ptr = vmap(buf->page_arr, buf->npages, VM_MAP, PAGE_KERNEL);
-	if (!buf->ptr)
-		goto page_fail;
-
-	memset(buf->ptr, 0, size);
-	return 0;
-
-page_fail:
-	for (; i > 0; --i)
-		__free_page(buf->page_arr[i - 1]);
-
-	kfree(buf->page_arr);
-out:
-	return ret;
-}
-
-void ipoib_vfree(struct ipoib_vmap *buf)
-{
-	int	i;
-
-	vunmap(buf->ptr);
-	for (i = 0; i < buf->npages; ++i)
-		__free_page(buf->page_arr[i]);
-
-	kfree(buf->page_arr);
-}
-
 int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 
 	/* Allocate RX/TX "rings" to hold queued skbs */
-	if (ipoib_vmalloc(&priv->rx_vmap_ring, ipoib_recvq_size *
-			  sizeof *priv->rx_ring)) {
+	priv->rx_ring =	kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
+				GFP_KERNEL);
+	if (!priv->rx_ring) {
 		printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
 		       ca->name, ipoib_recvq_size);
 		goto out;
 	}
-        priv->rx_ring = priv->rx_vmap_ring.ptr;
 
-	if (ipoib_vmalloc(&priv->tx_vmap_ring, ipoib_sendq_size *
-			  sizeof *priv->tx_ring)) {
+	priv->tx_ring = vmalloc(ipoib_sendq_size * sizeof *priv->tx_ring);
+	if (!priv->tx_ring) {
 		printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
 		       ca->name, ipoib_sendq_size);
 		goto out_rx_ring_cleanup;
 	}
-	priv->tx_ring = priv->tx_vmap_ring.ptr;
+	memset(priv->tx_ring, 0, ipoib_sendq_size * sizeof *priv->tx_ring);
 
 	/* priv->tx_head, tx_tail & tx_outstanding are already 0 */
 
@@ -1010,10 +971,10 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
 	return 0;
 
 out_tx_ring_cleanup:
-	ipoib_vfree(&priv->tx_vmap_ring);
+	vfree(priv->tx_ring);
 
 out_rx_ring_cleanup:
-	ipoib_vfree(&priv->rx_vmap_ring);
+	kfree(priv->rx_ring);
 
 out:
 	return -ENOMEM;
@@ -1034,44 +995,93 @@ void ipoib_dev_cleanup(struct net_device *dev)
 
 	ipoib_ib_dev_cleanup(dev);
 
-	ipoib_vfree(&priv->rx_vmap_ring);
-	ipoib_vfree(&priv->tx_vmap_ring);
+	kfree(priv->rx_ring);
+	vfree(priv->tx_ring);
 
 	priv->rx_ring = NULL;
 	priv->tx_ring = NULL;
 }
 
+static int get_skb_hdr(struct sk_buff *skb, void **iphdr,
+		       void **tcph, u64 *hdr_flags, void *priv)
+{
+	unsigned int ip_len;
+	struct iphdr *iph;
+
+	if (unlikely(skb->protocol != htons(ETH_P_IP)))
+		return -1;
+
+	/*
+	 * In the future we may add an else clause that verifies the
+	 * checksum and allows devices which do not calculate checksum
+	 * to use LRO.
+	 */
+	if (unlikely(skb->ip_summed != CHECKSUM_UNNECESSARY))
+		return -1;
+
+	/* Check for non-TCP packet */
+	skb_reset_network_header(skb);
+	iph = ip_hdr(skb);
+	if (iph->protocol != IPPROTO_TCP)
+		return -1;
+
+	ip_len = ip_hdrlen(skb);
+	skb_set_transport_header(skb, ip_len);
+	*tcph = tcp_hdr(skb);
+
+	/* check if IP header and TCP header are complete */
+	if (ntohs(iph->tot_len) < ip_len + tcp_hdrlen(skb))
+		return -1;
+
+	*hdr_flags = LRO_IPV4 | LRO_TCP;
+	*iphdr = iph;
+
+	return 0;
+}
+
+static void ipoib_lro_setup(struct ipoib_dev_priv *priv)
+{
+	priv->lro.lro_mgr.max_aggr	 = lro_max_aggr;
+	priv->lro.lro_mgr.max_desc	 = IPOIB_MAX_LRO_DESCRIPTORS;
+	priv->lro.lro_mgr.lro_arr	 = priv->lro.lro_desc;
+	priv->lro.lro_mgr.get_skb_header = get_skb_hdr;
+	priv->lro.lro_mgr.features	 = LRO_F_NAPI;
+	priv->lro.lro_mgr.dev		 = priv->dev;
+	priv->lro.lro_mgr.ip_summed_aggr = CHECKSUM_UNNECESSARY;
+}
+
 static void ipoib_setup(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 
-	dev->open 		 = ipoib_open;
-	dev->stop 		 = ipoib_stop;
-	dev->change_mtu 	 = ipoib_change_mtu;
-	dev->hard_start_xmit 	 = ipoib_start_xmit;
+	dev->open		 = ipoib_open;
+	dev->stop		 = ipoib_stop;
+	dev->change_mtu		 = ipoib_change_mtu;
+	dev->hard_start_xmit	 = ipoib_start_xmit;
 	dev->get_stats 		 = ipoib_get_stats;
-	dev->tx_timeout 	 = ipoib_timeout;
+	dev->tx_timeout		 = ipoib_timeout;
 	dev->hard_header         = ipoib_hard_header;
-	dev->set_multicast_list  = ipoib_set_mcast_list;
-	dev->neigh_setup         = ipoib_neigh_setup_dev;
+	dev->set_multicast_list	 = ipoib_set_mcast_list;
+	dev->neigh_setup	 = ipoib_neigh_setup_dev;
 	ipoib_set_ethtool_ops(dev);
 	dev->poll                = ipoib_poll;
 	dev->weight              = 100;
 
 
-	dev->watchdog_timeo 	 = HZ;
+	dev->watchdog_timeo	 = HZ;
 
-	dev->flags              |= IFF_BROADCAST | IFF_MULTICAST;
+	dev->flags		|= IFF_BROADCAST | IFF_MULTICAST;
 
 	/*
 	 * We add in INFINIBAND_ALEN to allow for the destination
 	 * address "pseudoheader" for skbs without neighbour struct.
 	 */
-	dev->hard_header_len 	 = IPOIB_ENCAP_LEN + INFINIBAND_ALEN;
-	dev->addr_len 		 = INFINIBAND_ALEN;
-	dev->type 		 = ARPHRD_INFINIBAND;
-	dev->tx_queue_len 	 = ipoib_sendq_size * 2;
-	dev->features            = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX;
+	dev->hard_header_len	 = IPOIB_ENCAP_LEN + INFINIBAND_ALEN;
+	dev->addr_len		 = INFINIBAND_ALEN;
+	dev->type		 = ARPHRD_INFINIBAND;
+	dev->tx_queue_len	 = ipoib_sendq_size * 2;
+	dev->features		 = (NETIF_F_VLAN_CHALLENGED	|
+				    NETIF_F_HIGHDMA);
 
 	memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
 
@@ -1079,10 +1089,10 @@ static void ipoib_setup(struct net_device *dev)
 
 	priv->dev = dev;
 
+	ipoib_lro_setup(priv);
+
 	spin_lock_init(&priv->lock);
-	spin_lock_init(&priv->tx_lock);
 
-	mutex_init(&priv->mcast_mutex);
 	mutex_init(&priv->vlan_mutex);
 
 	INIT_LIST_HEAD(&priv->path_list);
@@ -1091,9 +1101,11 @@ static void ipoib_setup(struct net_device *dev)
 	INIT_LIST_HEAD(&priv->multicast_list);
 
 	INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
-	INIT_WORK(&priv->pkey_event_task, ipoib_pkey_event);
 	INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
-	INIT_WORK(&priv->flush_task,   ipoib_ib_dev_flush);
+	INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
+	INIT_WORK(&priv->flush_light,   ipoib_ib_dev_flush_light);
+	INIT_WORK(&priv->flush_normal,   ipoib_ib_dev_flush_normal);
+	INIT_WORK(&priv->flush_heavy,   ipoib_ib_dev_flush_heavy);
 	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
 	INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
 }
@@ -1197,24 +1209,11 @@ int ipoib_add_pkey_attr(struct net_device *dev)
 					&class_device_attr_pkey);
 }
 
-static void set_csum(struct net_device *dev, struct ib_device *hca)
-{
-	struct ipoib_dev_priv *priv = netdev_priv(dev);
-
-	if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
-		return;
-
-	if (!(hca->flags & IB_DEVICE_IP_CSUM))
-		return;
-
-	dev->features |= NETIF_F_SG | NETIF_F_IP_CSUM;
-	set_bit(IPOIB_FLAG_CSUM, &priv->flags);
-}
-
 static struct net_device *ipoib_add_port(const char *format,
 					 struct ib_device *hca, u8 port)
 {
 	struct ipoib_dev_priv *priv;
+	struct ib_device_attr *device_attr;
 	struct ib_port_attr attr;
 	int result = -ENOMEM;
 
@@ -1224,8 +1223,6 @@ static struct net_device *ipoib_add_port(const char *format,
 
 	SET_NETDEV_DEV(priv->dev, hca->dma_device);
 
-	priv->dev->features |= NETIF_F_HIGHDMA;
-
 	if (!ib_query_port(hca, port, &attr))
 		priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
 	else {
@@ -1245,6 +1242,32 @@ static struct net_device *ipoib_add_port(const char *format,
 		goto device_init_failed;
 	}
 
+	device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL);
+	if (!device_attr) {
+		printk(KERN_WARNING "%s: allocation of %zu bytes failed\n",
+		       hca->name, sizeof *device_attr);
+		goto device_init_failed;
+	}
+
+	result = ib_query_device(hca, device_attr);
+	if (result) {
+		printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n",
+		       hca->name, result);
+		kfree(device_attr);
+		goto device_init_failed;
+	}
+	priv->hca_caps = device_attr->device_cap_flags;
+
+	kfree(device_attr);
+
+	if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
+		set_bit(IPOIB_FLAG_CSUM, &priv->flags);
+		priv->dev->features |= NETIF_F_SG | NETIF_F_IP_CSUM;
+	}
+
+	if (lro)
+		priv->dev->features |= NETIF_F_LRO;
+
 	/*
 	 * Set the full membership bit, so that we join the right
 	 * broadcast group, etc.
@@ -1262,8 +1285,6 @@ static struct net_device *ipoib_add_port(const char *format,
 	} else
 		memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
 
-	set_csum(priv->dev, hca);
-
 	result = ipoib_dev_init(priv->dev, hca, port);
 	if (result < 0) {
 		printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
@@ -1281,10 +1302,9 @@ static struct net_device *ipoib_add_port(const char *format,
 		goto event_failed;
 	}
 
-	if (priv->dev->features & NETIF_F_SG && priv->ca->flags & IB_DEVICE_TCP_TSO)
+	if (priv->dev->features & NETIF_F_SG && priv->hca_caps & IB_DEVICE_UD_TSO)
 		priv->dev->features |= NETIF_F_TSO;
 
-
 	result = register_netdev(priv->dev);
 	if (result) {
 		printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n",
@@ -1315,7 +1335,7 @@ sysfs_failed:
 
 register_failed:
 	ib_unregister_event_handler(&priv->event_handler);
-	flush_scheduled_work();
+	flush_workqueue(ipoib_workqueue);
 
 event_failed:
 	ipoib_dev_cleanup(priv->dev);
@@ -1374,7 +1394,12 @@ static void ipoib_remove_one(struct ib_device *device)
 
 	list_for_each_entry_safe(priv, tmp, dev_list, list) {
 		ib_unregister_event_handler(&priv->event_handler);
-		flush_scheduled_work();
+
+		rtnl_lock();
+		dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP);
+		rtnl_unlock();
+
+		flush_workqueue(ipoib_workqueue);
 
 		unregister_netdev(priv->dev);
 		ipoib_dev_cleanup(priv->dev);
@@ -1394,14 +1419,25 @@ static int __init ipoib_init_module(void)
 
 	ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
 	ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
-	ipoib_sendq_size = max(ipoib_sendq_size, IPOIB_MIN_QUEUE_SIZE);
+	ipoib_sendq_size = max(ipoib_sendq_size, max(2 * MAX_SEND_CQE,
+						     IPOIB_MIN_QUEUE_SIZE));
 #ifdef CONFIG_INFINIBAND_IPOIB_CM
 	ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
 #endif
 
+	/*
+	 * When copying small received packets, we only copy from the
+	 * linear data part of the SKB, so we rely on this condition.
+	 */
+	BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE);
+
+	ipoib_set_cleanup_function(ipoib_neigh_cleanup);
 	ret = ipoib_register_debugfs();
-	if (ret)
+	if (ret) {
+		ipoib_set_cleanup_function(NULL);
 		return ret;
+	}
 
 	/*
 	 * We create our own workqueue mainly because we want to be
@@ -1413,6 +1449,7 @@ static int __init ipoib_init_module(void)
 	 */
 	ipoib_workqueue = create_singlethread_workqueue("ipoib");
 	if (!ipoib_workqueue) {
+		ipoib_set_cleanup_function(NULL);
 		ret = -ENOMEM;
 		goto err_fs;
 	}
@@ -1420,8 +1457,10 @@ static int __init ipoib_init_module(void)
 	ib_sa_register_client(&ipoib_sa_client);
 
 	ret = ib_register_client(&ipoib_client);
-	if (ret)
+	if (ret) {
+		ipoib_set_cleanup_function(NULL);
 		goto err_sa;
+	}
 
 	return 0;
 
@@ -1437,7 +1476,16 @@ err_fs:
 
 static void __exit ipoib_cleanup_module(void)
 {
+	int ret;
+
 	ib_unregister_client(&ipoib_client);
+
+	do {
+		ret = ipoib_set_cleanup_function(NULL);
+		if (ret)
+			msleep(10);
+	} while (ret);
+
 	ib_sa_unregister_client(&ipoib_sa_client);
 	ipoib_unregister_debugfs();
 	destroy_workqueue(ipoib_workqueue);
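
The LRO wiring is opt-in: loading with modprobe ib_ipoib lro=1 sets
NETIF_F_LRO, after which the receive path hands IPv4 TCP segments
whose checksum the HCA already verified (skb->ip_summed ==
CHECKSUM_UNNECESSARY) to inet_lro and flushes any aggregated
super-packets once per NAPI poll.  Anything get_skb_hdr() rejects by
returning -1 is passed up unaggregated by the LRO manager itself.
lro_max_aggr bounds how many segments may be merged into one
super-packet; registered with mode 0644, it can also be tuned at
runtime through /sys/module/ib_ipoib/parameters/lro_max_aggr.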
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index b95e504..dc5f61d 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -30,8 +30,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ipoib_multicast.c 1362 2004-12-18 15:56:29Z roland $
  */
 
 #include <linux/skbuff.h>
@@ -71,14 +69,13 @@ static void ipoib_mcast_free(struct ipoib_mcast *mcast)
 	struct net_device *dev = mcast->dev;
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_neigh *neigh, *tmp;
-	unsigned long flags;
 	int tx_dropped = 0;
 
 	ipoib_dbg_mcast(netdev_priv(dev),
 			"deleting multicast group " IPOIB_GID_FMT "\n",
 			IPOIB_GID_ARG(mcast->mcmember.mgid));
 
-	spin_lock_irqsave(&priv->lock, flags);
+	spin_lock_irq(&priv->lock);
 
 	list_for_each_entry_safe(neigh, tmp, &mcast->neigh_list, list) {
 		/*
@@ -92,7 +89,7 @@ static void ipoib_mcast_free(struct ipoib_mcast *mcast)
 		ipoib_neigh_free(dev, neigh);
 	}
 
-	spin_unlock_irqrestore(&priv->lock, flags);
+	spin_unlock_irq(&priv->lock);
 
 	if (mcast->ah)
 		ipoib_put_ah(mcast->ah);
@@ -102,9 +99,9 @@ static void ipoib_mcast_free(struct ipoib_mcast *mcast)
 		dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue));
 	}
 
-	spin_lock_irqsave(&priv->tx_lock, flags);
+	netif_tx_lock_bh(dev);
 	priv->stats.tx_dropped += tx_dropped;
-	spin_unlock_irqrestore(&priv->tx_lock, flags);
+	netif_tx_unlock_bh(dev);
 
 	kfree(mcast);
 }
@@ -188,6 +185,7 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_ah *ah;
 	int ret;
+	int set_qkey = 0;
 
 	mcast->mcmember = *mcmember;
 
@@ -202,6 +200,7 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
 		priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey);
 		spin_unlock_irq(&priv->lock);
 		priv->tx_wr.wr.ud.remote_qkey = priv->qkey;
+		set_qkey = 1;
 	}
 
 	if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
@@ -214,7 +213,7 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
 		}
 
 		ret = ipoib_mcast_attach(dev, be16_to_cpu(mcast->mcmember.mlid),
-					 &mcast->mcmember.mgid);
+					 &mcast->mcmember.mgid, set_qkey);
 		if (ret < 0) {
 			ipoib_warn(priv, "couldn't attach QP to multicast group "
 				   IPOIB_GID_FMT "\n",
@@ -259,10 +258,10 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
 	}
 
 	/* actually send any queued packets */
-	spin_lock_irq(&priv->tx_lock);
+	netif_tx_lock_bh(dev);
 	while (!skb_queue_empty(&mcast->pkt_queue)) {
 		struct sk_buff *skb = skb_dequeue(&mcast->pkt_queue);
-		spin_unlock_irq(&priv->tx_lock);
+		netif_tx_unlock_bh(dev);
 
 		skb->dev = dev;
 
@@ -273,9 +272,9 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
 
 		if (dev_queue_xmit(skb))
 			ipoib_warn(priv, "dev_queue_xmit failed to requeue packet\n");
-		spin_lock_irq(&priv->tx_lock);
+		netif_tx_lock_bh(dev);
 	}
-	spin_unlock_irq(&priv->tx_lock);
+	netif_tx_unlock_bh(dev);
 
 	return 0;
 }
@@ -302,12 +301,12 @@ ipoib_mcast_sendonly_join_complete(int status,
 					IPOIB_GID_ARG(mcast->mcmember.mgid), status);
 
 		/* Flush out any queued packets */
-		spin_lock_irq(&priv->tx_lock);
+		netif_tx_lock_bh(dev);
 		while (!skb_queue_empty(&mcast->pkt_queue)) {
 			++priv->stats.tx_dropped;
 			dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue));
 		}
-		spin_unlock_irq(&priv->tx_lock);
+		netif_tx_unlock_bh(dev);
 
 		/* Clear the busy flag so we try again */
 		status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY,
@@ -366,6 +365,21 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
 	return ret;
 }
 
+void ipoib_mcast_carrier_on_task(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+						   carrier_on_task);
+
+	/*
+	 * Take rtnl_lock to avoid racing with ipoib_stop() and
+	 * turning the carrier back on while a device is being
+	 * removed.
+	 */
+	rtnl_lock();
+	netif_carrier_on(priv->dev);
+	rtnl_unlock();
+}
+
 static int ipoib_mcast_join_complete(int status,
 				     struct ib_sa_multicast *multicast)
 {
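For the queue_work() in the hunk below to be safe, priv->carrier_on_task
must have been initialized on the device setup path.  That hunk is not
visible in this part of the patch, but presumably it amounts to the
usual one-liner next to the driver's other INIT_WORK() calls:

	/* Assumed companion change in the netdev setup code, not shown
	 * in the hunks here.
	 */
	INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);

Taking rtnl_lock in the work handler, rather than calling
netif_carrier_on() directly from the SA join callback, is what lets the
carrier change serialize against ipoib_stop() without deadlocking the
callback context.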
@@ -392,14 +406,18 @@ static int ipoib_mcast_join_complete(int status,
 					   &priv->mcast_task, 0);
 		mutex_unlock(&mcast_mutex);
 
+		/*
+		 * Defer carrier on work to ipoib_workqueue to avoid a
+		 * deadlock on rtnl_lock here.
+		 */
 		if (mcast == priv->broadcast)
-			netif_carrier_on(dev);
+			queue_work(ipoib_workqueue, &priv->carrier_on_task);
 
 		return 0;
 	}
 
 	if (mcast->logcount++ < 20) {
-		if (status == -ETIMEDOUT) {
+		if (status == -ETIMEDOUT || status == -EAGAIN) {
 			ipoib_dbg_mcast(priv, "multicast join failed for " IPOIB_GID_FMT
 					", status %d\n",
 					IPOIB_GID_ARG(mcast->mcmember.mgid),
@@ -498,42 +516,6 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
 	}
 }
 
-static int create_own_ah(struct ipoib_dev_priv *priv)
-{
-	struct ib_ah_attr attr = {
-		.dlid = priv->local_lid,
-		.port_num = priv->port,
-	};
-        struct ib_ah *ah;
-
-	if (priv->own_ah)
-		return 0;
-
-	ah = ib_create_ah(priv->pd, &attr);
-	if (!IS_ERR(ah)) {
-		ipoib_dbg(priv, "created own ah\n");
-		priv->own_ah = ah;
-	}
-
-	return IS_ERR(priv->own_ah);
-}
-
-void destroy_own_ah(struct ipoib_dev_priv *priv)
-{
-	unsigned long flags;
-
-	if (!priv->own_ah) {
-		ipoib_dbg(priv, "own ah already destroyed\n");
-		return;
-	} else
-		ipoib_dbg(priv, "destroying own ah\n");
-
-	spin_lock_irqsave(&priv->tx_lock, flags);
-	ib_destroy_ah(priv->own_ah);
-	priv->own_ah = NULL;
-	spin_unlock_irqrestore(&priv->tx_lock, flags);
-}
-
 void ipoib_mcast_join_task(struct work_struct *work)
 {
 	struct ipoib_dev_priv *priv =
@@ -551,11 +533,8 @@ void ipoib_mcast_join_task(struct work_struct *work)
 	{
 		struct ib_port_attr attr;
 
-		if (!ib_query_port(priv->ca, priv->port, &attr)) {
+		if (!ib_query_port(priv->ca, priv->port, &attr))
 			priv->local_lid = attr.lid;
-			if (create_own_ah(priv))
-				ipoib_warn(priv, "create own_ah failed\n");
-		}
 		else
 			ipoib_warn(priv, "ib_query_port failed\n");
 	}
@@ -563,6 +542,9 @@ void ipoib_mcast_join_task(struct work_struct *work)
 	if (!priv->broadcast) {
 		struct ipoib_mcast *broadcast;
 
+		if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+			return;
+
 		broadcast = ipoib_mcast_alloc(dev, 1);
 		if (!broadcast) {
 			ipoib_warn(priv, "failed to allocate broadcast group\n");
@@ -583,8 +565,10 @@ void ipoib_mcast_join_task(struct work_struct *work)
 		spin_unlock_irq(&priv->lock);
 	}
 
-	if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
-		if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags))
+	if (priv->broadcast &&
+	    !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
+		if (!test_bit(IPOIB_MCAST_FLAG_BUSY,
+			      &priv->broadcast->flags))
 			ipoib_mcast_join(dev, priv->broadcast, 0);
 		return;
 	}
@@ -612,10 +596,18 @@ void ipoib_mcast_join_task(struct work_struct *work)
 		return;
 	}
 
-	priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
+	spin_lock_irq(&priv->lock);
+	if (priv->broadcast)
+		priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
+	else
+		priv->mcast_mtu = priv->admin_mtu;
+	spin_unlock_irq(&priv->lock);
 
-	if (!ipoib_cm_admin_enabled(dev))
-		dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
+	if (!ipoib_cm_admin_enabled(dev)) {
+		rtnl_lock();
+		dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu));
+		rtnl_unlock();
+	}
 
 	ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n");
 
@@ -633,10 +625,6 @@ int ipoib_mcast_start_thread(struct net_device *dev)
 		queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0);
 	mutex_unlock(&mcast_mutex);
 
-	spin_lock_irq(&priv->lock);
-	set_bit(IPOIB_MCAST_STARTED, &priv->flags);
-	spin_unlock_irq(&priv->lock);
-
 	return 0;
 }
 
@@ -646,10 +634,6 @@ int ipoib_mcast_stop_thread(struct net_device *dev, int flush)
 
 	ipoib_dbg_mcast(priv, "stopping multicast thread\n");
 
-	spin_lock_irq(&priv->lock);
-	clear_bit(IPOIB_MCAST_STARTED, &priv->flags);
-	spin_unlock_irq(&priv->lock);
-
 	mutex_lock(&mcast_mutex);
 	clear_bit(IPOIB_MCAST_RUN, &priv->flags);
 	cancel_delayed_work(&priv->mcast_task);
@@ -674,10 +658,10 @@ static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
 				IPOIB_GID_ARG(mcast->mcmember.mgid));
 
 		/* Remove ourselves from the multicast group */
-		ret = ipoib_mcast_detach(dev, be16_to_cpu(mcast->mcmember.mlid),
-					 &mcast->mcmember.mgid);
+		ret = ib_detach_mcast(priv->qp, &mcast->mcmember.mgid,
+				      be16_to_cpu(mcast->mcmember.mlid));
 		if (ret)
-			ipoib_warn(priv, "ipoib_mcast_detach failed (result = %d)\n", ret);
+			ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret);
 	}
 
 	return 0;
@@ -687,12 +671,9 @@ void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_mcast *mcast;
+	unsigned long flags;
 
-	/*
-	 * We can only be called from ipoib_start_xmit, so we're
-	 * inside tx_lock -- no need to save/restore flags.
-	 */
-	spin_lock(&priv->lock);
+	spin_lock_irqsave(&priv->lock, flags);
 
 	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)		||
 	    !priv->broadcast					||
@@ -746,7 +727,7 @@ void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb)
 
 out:
 	if (mcast && mcast->ah) {
-		if (skb->dst            &&
+		if (skb->dst		&&
 		    skb->dst->neighbour &&
 		    !*to_ipoib_neigh(skb->dst->neighbour)) {
 			struct ipoib_neigh *neigh = ipoib_neigh_alloc(skb->dst->neighbour,
@@ -754,7 +735,7 @@ out:
 
 			if (neigh) {
 				kref_get(&mcast->ah->ref);
-				neigh->ah  	= mcast->ah;
+				neigh->ah	= mcast->ah;
 				list_add_tail(&neigh->list, &mcast->neigh_list);
 			}
 		}
@@ -763,7 +744,7 @@ out:
 	}
 
 unlock:
-	spin_unlock(&priv->lock);
+	spin_unlock_irqrestore(&priv->lock, flags);
 }
 
 void ipoib_mcast_dev_flush(struct net_device *dev)
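Two locking changes run through this file.  First, the driver-private
priv->tx_lock is replaced by the stack's own TX lock
(netif_tx_lock_bh()), so the dropped-packet accounting is serialized
against the transmit path by the same lock the core already takes.
Second, ipoib_mcast_send() switches to spin_lock_irqsave(), because it
can no longer assume it is entered from ipoib_start_xmit() with the old
tx_lock held and IRQs disabled.  Note how the queued-packet drain in
ipoib_mcast_join_finish() must drop the TX lock around each
dev_queue_xmit() call, since dev_queue_xmit() acquires the device TX
lock itself; the post-patch loop reads:

	netif_tx_lock_bh(dev);
	while (!skb_queue_empty(&mcast->pkt_queue)) {
		struct sk_buff *skb = skb_dequeue(&mcast->pkt_queue);

		/* dev_queue_xmit() takes the TX lock internally, so it
		 * must not be called with netif_tx_lock_bh() held.
		 */
		netif_tx_unlock_bh(dev);
		skb->dev = dev;
		if (dev_queue_xmit(skb))
			ipoib_warn(priv, "dev_queue_xmit failed to requeue packet\n");
		netif_tx_lock_bh(dev);
	}
	netif_tx_unlock_bh(dev);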
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index e5ab3f9..f56797e 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -29,25 +29,18 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ipoib_verbs.c 1349 2004-12-16 21:09:43Z roland $
  */
 
 #include "ipoib.h"
 #include <linux/ethtool.h>
 
-int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid)
+int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid, int set_qkey)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	struct ib_qp_attr *qp_attr;
+	struct ib_qp_attr *qp_attr = NULL;
 	int ret;
 	u16 pkey_index;
 
-	ret = -ENOMEM;
-	qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL);
-	if (!qp_attr)
-		goto out;
-
 	if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) {
 		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
 		ret = -ENXIO;
@@ -55,18 +48,23 @@ int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid)
 	}
 	set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
 
-	/* set correct QKey for QP */
-	qp_attr->qkey = priv->qkey;
-	ret = ib_modify_qp(priv->qp, qp_attr, IB_QP_QKEY);
-	if (ret) {
-		ipoib_warn(priv, "failed to modify QP, ret = %d\n", ret);
-		goto out;
+	if (set_qkey) {
+		ret = -ENOMEM;
+		qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL);
+		if (!qp_attr)
+			goto out;
+
+		/* set correct QKey for QP */
+		qp_attr->qkey = priv->qkey;
+		ret = ib_modify_qp(priv->qp, qp_attr, IB_QP_QKEY);
+		if (ret) {
+			ipoib_warn(priv, "failed to modify QP, ret = %d\n", ret);
+			goto out;
+		}
 	}
 
 	/* attach QP to multicast group */
-	mutex_lock(&priv->mcast_mutex);
 	ret = ib_attach_mcast(priv->qp, mgid, mlid);
-	mutex_unlock(&priv->mcast_mutex);
 	if (ret)
 		ipoib_warn(priv, "failed to attach to multicast group, ret = %d\n", ret);
 
@@ -75,20 +73,6 @@ out:
 	return ret;
 }
 
-int ipoib_mcast_detach(struct net_device *dev, u16 mlid, union ib_gid *mgid)
-{
-	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	int ret;
-
-	mutex_lock(&priv->mcast_mutex);
-	ret = ib_detach_mcast(priv->qp, mgid, mlid);
-	mutex_unlock(&priv->mcast_mutex);
-	if (ret)
-		ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret);
-
-	return ret;
-}
-
 int ipoib_init_qp(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
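With the set_qkey parameter above, only the join that actually learned
the QKey (the broadcast group join in ipoib_mcast_join_finish()) pays
for an ib_modify_qp(); every subsequent attach skips the QP modification
entirely.  The caller side, condensed from the hunks above (the
broadcast-group test lives outside the visible context, so a
hypothetical flag stands in for it here):

	int set_qkey = 0;

	if (joined_broadcast_group) {	/* hypothetical stand-in for
					 * the test outside this hunk */
		priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey);
		priv->tx_wr.wr.ud.remote_qkey = priv->qkey;
		set_qkey = 1;
	}

	ret = ipoib_mcast_attach(dev, be16_to_cpu(mcast->mcmember.mlid),
				 &mcast->mcmember.mgid, set_qkey);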
@@ -150,15 +134,15 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
 		.cap = {
 			.max_send_wr  = ipoib_sendq_size,
 			.max_recv_wr  = ipoib_recvq_size,
-			.max_send_sge = dev->features & NETIF_F_SG ? MAX_SKB_FRAGS + 1 : 1,
+			.max_send_sge = 1,
 			.max_recv_sge = IPOIB_UD_RX_SG
 		},
-		.sq_sig_type = IB_SIGNAL_REQ_WR,
-		.qp_type     = IB_QPT_UD,
-		.create_flags = QP_CREATE_LSO,
+		.sq_sig_type = IB_SIGNAL_ALL_WR,
+		.qp_type     = IB_QPT_UD
 	};
 
-	int i, ret, size;
+	int ret, size;
+	int i;
 	struct ethtool_coalesce *coal;
 
 	priv->pd = ib_alloc_pd(priv->ca);
@@ -173,7 +157,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
 		goto out_free_pd;
 	}
 
-	size = ipoib_recvq_size;
+	size = ipoib_recvq_size + 1;
 	ret = ipoib_cm_dev_init(dev);
 	if (!ret) {
 		size += ipoib_sendq_size;
@@ -183,37 +167,48 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
 			size += ipoib_recvq_size * ipoib_max_conn_qp;
 	}
 
-	priv->rcq = ib_create_cq(priv->ca, ipoib_ib_rx_completion, NULL, dev, size, 0);
-	if (IS_ERR(priv->rcq)) {
+	priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, 0);
+	if (IS_ERR(priv->recv_cq)) {
 		printk(KERN_WARNING "%s: failed to create receive CQ\n", ca->name);
 		goto out_free_mr;
 	}
 
-	priv->scq = ib_create_cq(priv->ca, NULL, NULL, dev, ipoib_sendq_size, 0);
-	if (IS_ERR(priv->scq)) {
+	priv->send_cq = ib_create_cq(priv->ca, ipoib_send_comp_handler, NULL,
+				     dev, ipoib_sendq_size, 0);
+	if (IS_ERR(priv->send_cq)) {
 		printk(KERN_WARNING "%s: failed to create send CQ\n", ca->name);
-		goto out_free_rcq;
+		goto out_free_recv_cq;
 	}
 
+	if (ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP))
+		goto out_free_send_cq;
 
 	coal = kzalloc(sizeof *coal, GFP_KERNEL);
 	if (coal) {
 		coal->rx_coalesce_usecs = 10;
+		coal->tx_coalesce_usecs = 10;
 		coal->rx_max_coalesced_frames = 16;
+		coal->tx_max_coalesced_frames = 16;
 		dev->ethtool_ops->set_coalesce(dev, coal);
 		kfree(coal);
 	}
 
-	if (ib_req_notify_cq(priv->rcq, IB_CQ_NEXT_COMP))
-		goto out_free_scq;
+	init_attr.send_cq = priv->send_cq;
+	init_attr.recv_cq = priv->recv_cq;
 
-	init_attr.send_cq = priv->scq;
-	init_attr.recv_cq = priv->rcq;
+	if (priv->hca_caps & IB_DEVICE_UD_TSO)
+		init_attr.create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;
+
+	if (priv->hca_caps & IB_DEVICE_BLOCK_MULTICAST_LOOPBACK)
+		init_attr.create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
+
+	if (dev->features & NETIF_F_SG)
+		init_attr.cap.max_send_sge = MAX_SKB_FRAGS + 1;
 
 	priv->qp = ib_create_qp(priv->pd, &init_attr);
 	if (IS_ERR(priv->qp)) {
 		printk(KERN_WARNING "%s: failed to create QP\n", ca->name);
-		goto out_free_rcq;
+		goto out_free_send_cq;
 	}
 
 	priv->dev->dev_addr[1] = (priv->qp->qp_num >> 16) & 0xff;
@@ -221,42 +216,32 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
 	priv->dev->dev_addr[3] = (priv->qp->qp_num      ) & 0xff;
 
 	for (i = 0; i < MAX_SKB_FRAGS + 1; ++i)
-		priv->tx_sge[i].lkey    = priv->mr->lkey;
-
-	priv->tx_wr.opcode 	= IB_WR_SEND;
-	priv->tx_wr.sg_list 	= priv->tx_sge;
-	priv->tx_wr.send_flags 	= IB_SEND_SIGNALED;
-
-	for (i = 0; i < UD_POST_RCV_COUNT; ++i) {
-		priv->sglist_draft[i][0].lkey = priv->mr->lkey;
-		priv->sglist_draft[i][1].lkey = priv->mr->lkey;
-		priv->rx_wr_draft[i].sg_list = &priv->sglist_draft[i][0];
-		if (i < UD_POST_RCV_COUNT - 1)
-			priv->rx_wr_draft[i].next = &priv->rx_wr_draft[i + 1];
-		else
-			priv->rx_wr_draft[i].next = NULL;
-	}
+		priv->tx_sge[i].lkey = priv->mr->lkey;
+
+	priv->tx_wr.opcode	= IB_WR_SEND;
+	priv->tx_wr.sg_list	= priv->tx_sge;
+	priv->tx_wr.send_flags	= IB_SEND_SIGNALED;
 
+	priv->rx_sge[0].lkey = priv->mr->lkey;
 	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
-		for (i = 0; i < UD_POST_RCV_COUNT; ++i) {
-			priv->sglist_draft[i][0].length = IPOIB_UD_HEAD_SIZE;
-			priv->sglist_draft[i][1].length = PAGE_SIZE;
-			priv->rx_wr_draft[i].num_sge = IPOIB_UD_RX_SG;
-		}
+		priv->rx_sge[0].length = IPOIB_UD_HEAD_SIZE;
+		priv->rx_sge[1].length = PAGE_SIZE;
+		priv->rx_sge[1].lkey = priv->mr->lkey;
+		priv->rx_wr.num_sge = IPOIB_UD_RX_SG;
 	} else {
-		for (i = 0; i < UD_POST_RCV_COUNT; ++i) {
-			priv->sglist_draft[i][0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
-			priv->rx_wr_draft[i].num_sge = 1;
-		}
+		priv->rx_sge[0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
+		priv->rx_wr.num_sge = 1;
 	}
+	priv->rx_wr.next = NULL;
+	priv->rx_wr.sg_list = priv->rx_sge;
 
 	return 0;
 
-out_free_scq:
-	ib_destroy_cq(priv->scq);
+out_free_send_cq:
+	ib_destroy_cq(priv->send_cq);
 
-out_free_rcq:
-	ib_destroy_cq(priv->rcq);
+out_free_recv_cq:
+	ib_destroy_cq(priv->recv_cq);
 
 out_free_mr:
 	ib_dereg_mr(priv->mr);
@@ -279,11 +264,11 @@ void ipoib_transport_dev_cleanup(struct net_device *dev)
 		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
 	}
 
-	if (ib_destroy_cq(priv->scq))
-		ipoib_warn(priv, "ib_cq_destroy failed\n");
+	if (ib_destroy_cq(priv->send_cq))
+		ipoib_warn(priv, "ib_cq_destroy (send) failed\n");
 
-	if (ib_destroy_cq(priv->rcq))
-		ipoib_warn(priv, "ib_cq_destroy failed\n");
+	if (ib_destroy_cq(priv->recv_cq))
+		ipoib_warn(priv, "ib_cq_destroy (recv) failed\n");
 
 	ipoib_cm_dev_cleanup(dev);
 
@@ -303,15 +288,17 @@ void ipoib_event(struct ib_event_handler *handler,
 	if (record->element.port_num != priv->port)
 		return;
 
-	if (record->event == IB_EVENT_PORT_ERR    ||
-	    record->event == IB_EVENT_PORT_ACTIVE ||
-	    record->event == IB_EVENT_LID_CHANGE  ||
-	    record->event == IB_EVENT_SM_CHANGE   ||
+	ipoib_dbg(priv, "Event %d on device %s port %d\n", record->event,
+		  record->device->name, record->element.port_num);
+
+	if (record->event == IB_EVENT_SM_CHANGE ||
 	    record->event == IB_EVENT_CLIENT_REREGISTER) {
-		ipoib_dbg(priv, "Port state change event\n");
-		queue_work(ipoib_workqueue, &priv->flush_task);
+		queue_work(ipoib_workqueue, &priv->flush_light);
+	} else if (record->event == IB_EVENT_PORT_ERR ||
+		   record->event == IB_EVENT_PORT_ACTIVE ||
+		   record->event == IB_EVENT_LID_CHANGE) {
+		queue_work(ipoib_workqueue, &priv->flush_normal);
 	} else if (record->event == IB_EVENT_PKEY_CHANGE) {
-		ipoib_dbg(priv, "P_Key change event on port:%d\n", priv->port);
-		queue_work(ipoib_workqueue, &priv->pkey_event_task);
+		queue_work(ipoib_workqueue, &priv->flush_heavy);
 	}
 }
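The rewritten ipoib_event() fans port events out to three flush levels
instead of a single flush task: SM-level events take the lightweight
path, port state and LID changes take the normal one, and a P_Key change
takes the heavy path.  The same mapping written as a switch, purely as a
reading aid (not part of the patch):

	switch (record->event) {
	case IB_EVENT_SM_CHANGE:
	case IB_EVENT_CLIENT_REREGISTER:
		queue_work(ipoib_workqueue, &priv->flush_light);
		break;
	case IB_EVENT_PORT_ERR:
	case IB_EVENT_PORT_ACTIVE:
	case IB_EVENT_LID_CHANGE:
		queue_work(ipoib_workqueue, &priv->flush_normal);
		break;
	case IB_EVENT_PKEY_CHANGE:
		queue_work(ipoib_workqueue, &priv->flush_heavy);
		break;
	default:
		break;
	}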
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
index fa288e2..25033b1 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
@@ -28,8 +28,6 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- *
- * $Id: ipoib_vlan.c 1349 2004-12-16 21:09:43Z roland $
  */
 
 #include <linux/module.h>
@@ -92,7 +90,7 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey)
 
 	priv->max_ib_mtu = ppriv->max_ib_mtu;
 	/* MTU will be reset when mcast join happens */
-	priv->dev->mtu  = IPOIB_UD_MTU(priv->max_ib_mtu);
+	priv->dev->mtu   = IPOIB_UD_MTU(priv->max_ib_mtu);
 	priv->mcast_mtu  = priv->admin_mtu = priv->dev->mtu;
 	set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);