kernel-2.6.18-128.1.10.el5.src.rpm

From: Doug Ledford <dledford@redhat.com>
Date: Mon, 24 Mar 2008 14:24:28 -0400
Subject: [openib] IPoIB updates
Message-id: 1206383072-7299-8-git-send-email-dledford@redhat.com
O-Subject: [Patch RHEL5 07/10] Infiniband: IPoIB updates
Bugzilla: 253023

OFED 1.3 final removed a patch from IPoIB that was in rc1; that accounts
for most of the removals in this patch.  They also fixed some UDP
performance issues, and enabled support for 4K IPoIB MTUs and
for non-SRQ connected mode operation (required for ehca hardware),
which makes up the remainder of the patch.
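
For context on the 4K MTU change: in UD mode each receive buffer must hold
the 40-byte GRH plus the 4-byte IPoIB encapsulation header ahead of the
payload, so once the IB MTU reaches 4K the whole buffer no longer fits in a
single page and the receive path switches to a two-element scatter list (a
small linear head plus one full page).  A minimal user-space sketch of that
size calculation, assuming a 4096-byte page and the constant values used in
this patch, looks like:

    #include <stdio.h>

    /* Constants mirror the patch; PAGE_SIZE assumed 4096 for illustration. */
    #define PAGE_SIZE          4096
    #define IB_GRH_BYTES       40            /* global routing header */
    #define IPOIB_ENCAP_LEN    4             /* IPoIB encapsulation header */
    #define IPOIB_UD_HEAD_SIZE (IB_GRH_BYTES + IPOIB_ENCAP_LEN)

    #define IPOIB_UD_BUF_SIZE(ib_mtu) ((ib_mtu) + IB_GRH_BYTES)

    /* Mirrors ipoib_ud_need_sg(): use a 2-element S/G list when the full
     * UD buffer exceeds one page, e.g. with a 4K IB MTU. */
    static int ud_need_sg(int ib_mtu)
    {
            return IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE;
    }

    int main(void)
    {
            int mtus[] = { 2048, 4096 };
            int i;

            for (i = 0; i < 2; i++) {
                    int mtu = mtus[i];

                    if (ud_need_sg(mtu))
                            printf("IB MTU %d: %d-byte head + one %d-byte page\n",
                                   mtu, IPOIB_UD_HEAD_SIZE, PAGE_SIZE);
                    else
                            printf("IB MTU %d: single %d-byte buffer\n",
                                   mtu, IPOIB_UD_BUF_SIZE(mtu));
            }
            return 0;
    }

The kernel-side equivalents are the IPOIB_UD_HEAD_SIZE constant and the
IPOIB_UD_BUF_SIZE()/ipoib_ud_need_sg() helpers added to ipoib.h below.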

Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 2ff59a1..e6c5857 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -56,19 +56,20 @@
 /* constants */
 
 enum {
-	IPOIB_PACKET_SIZE         = 2048,
-	IPOIB_BUF_SIZE 		  = IPOIB_PACKET_SIZE + IB_GRH_BYTES,
-
 	IPOIB_ENCAP_LEN 	  = 4,
 
+ 	IPOIB_UD_HEAD_SIZE	  = IB_GRH_BYTES + IPOIB_ENCAP_LEN,
+ 	IPOIB_UD_RX_SG		  = 2, /* for 4K MTU */
+
 	IPOIB_CM_MTU              = 0x10000 - 0x10, /* padding to align header to 16 */
 	IPOIB_CM_BUF_SIZE         = IPOIB_CM_MTU  + IPOIB_ENCAP_LEN,
 	IPOIB_CM_HEAD_SIZE 	  = IPOIB_CM_BUF_SIZE % PAGE_SIZE,
 	IPOIB_CM_RX_SG            = ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE,
-	IPOIB_RX_RING_SIZE 	  = 128,
-	IPOIB_TX_RING_SIZE 	  = 64,
+	IPOIB_RX_RING_SIZE 	  = 256,
+	IPOIB_TX_RING_SIZE 	  = 128,
 	IPOIB_MAX_QUEUE_SIZE	  = 8192,
 	IPOIB_MIN_QUEUE_SIZE	  = 2,
+	IPOIB_CM_MAX_CONN_QP	  = 4096,
 
 	IPOIB_NUM_WC 		  = 4,
 
@@ -86,8 +87,8 @@ enum {
 	IPOIB_MCAST_STARTED       = 8,
 	IPOIB_FLAG_ADMIN_CM 	  = 9,
 	IPOIB_FLAG_UMCAST	  = 10,
-	IPOIB_FLAG_HW_CSUM        = 11,
-	IPOIB_FLAG_RX_CSUM        = 12,
+	IPOIB_FLAG_CSUM           = 11,
+	IPOIB_FLAG_TIME_ON	  = 12,
 
 	IPOIB_MAX_BACKOFF_SECONDS = 16,
 
@@ -95,6 +96,12 @@ enum {
 	IPOIB_MCAST_FLAG_SENDONLY = 1,
 	IPOIB_MCAST_FLAG_BUSY 	  = 2,	/* joining or already joined */
 	IPOIB_MCAST_FLAG_ATTACHED = 3,
+
+	MAX_SEND_CQE              = 16,
+	UD_POST_RCV_COUNT         = 16,
+	CM_POST_SRQ_COUNT         = 16,
+
+	SKB_TSHOLD		  = 256,
 };
 
 #define	IPOIB_OP_RECV   (1ul << 31)
@@ -106,11 +113,14 @@ enum {
 
 /* structs */
 
-#define IPOIB_HEADER_F_HWCSUM 0x1
+struct ipoib_cm_tx_buf {
+	struct sk_buff *skb;
+	u64		mapping;
+};
 
 struct ipoib_header {
 	__be16	proto;
-	__be16	flags;
+	u16	reserved;
 };
 
 struct ipoib_pseudoheader {
@@ -139,9 +149,9 @@ struct ipoib_mcast {
 	struct net_device *dev;
 };
 
-struct ipoib_rx_buf {
+struct ipoib_sg_rx_buf {
 	struct sk_buff *skb;
-	u64		mapping;
+	u64		mapping[IPOIB_UD_RX_SG];
 };
 
 struct ipoib_tx_buf {
@@ -155,21 +165,20 @@ static inline int ipoib_dma_map_tx(struct ib_device *ca,
 	struct sk_buff *skb = tx_req->skb;
 	u64 *mapping = tx_req->mapping;
 	int i;
-	int frags;
+	int nfrags;
 	int off;
 
-	if (!skb_is_gso(skb)) {
+	if (skb_headlen(skb)) {
 		mapping[0] = ib_dma_map_single(ca, skb->data, skb_headlen(skb),
 					       DMA_TO_DEVICE);
 		if (unlikely(ib_dma_mapping_error(ca, mapping[0])))
 			return -EIO;
 		off = 1;
-	}
-	else
+	} else
 		off = 0;
 
-	frags = skb_shinfo(skb)->nr_frags;
-	for (i = 0; i < frags; ++i) {
+	nfrags = skb_shinfo(skb)->nr_frags;
+	for (i = 0; i < nfrags; ++i) {
 		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 		mapping[i + off] = ib_dma_map_page(ca, frag->page, frag->page_offset,
 						   frag->size, DMA_TO_DEVICE);
@@ -179,11 +188,10 @@ static inline int ipoib_dma_map_tx(struct ib_device *ca,
 	return 0;
 
 partial_error:
-	if (!skb_is_gso(skb)) {
+	if (skb_headlen(skb)) {
 		ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);
 		off = 0;
-	}
-	else
+	} else
 		off = 1;
 
 	for (; i > 0; --i) {
@@ -200,17 +208,17 @@ static inline void ipoib_dma_unmap_tx(struct ib_device *ca,
 	struct sk_buff *skb = tx_req->skb;
 	u64 *mapping = tx_req->mapping;
 	int i;
-	int frags;
+	int nfrags;
 	int off;
 
-	if (!skb_is_gso(skb)) {
+	if (skb_headlen(skb)) {
 		ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);
 		off = 1;
 	} else
 		off = 0;
 
-	frags = skb_shinfo(skb)->nr_frags;
-	for (i = 0; i < frags; ++i) {
+	nfrags = skb_shinfo(skb)->nr_frags;
+	for (i = 0; i < nfrags; ++i) {
 		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 		ib_dma_unmap_page(ca, mapping[i + off], frag->size,
 				  DMA_TO_DEVICE);
@@ -260,10 +268,19 @@ enum ipoib_cm_state {
 struct ipoib_cm_rx {
 	struct ib_cm_id     *id;
 	struct ib_qp        *qp;
+	struct ipoib_cm_rx_buf *rx_ring;
 	struct list_head     list;
 	struct net_device   *dev;
 	unsigned long        jiffies;
 	enum ipoib_cm_state  state;
+	int		     index;
+	int		     recv_count;
+};
+
+struct ipoib_vmap {
+	void	       *ptr;
+	struct page   **page_arr;
+	int		npages;
 };
 
 struct ipoib_cm_tx {
@@ -273,12 +290,12 @@ struct ipoib_cm_tx {
 	struct net_device   *dev;
 	struct ipoib_neigh  *neigh;
 	struct ipoib_path   *path;
-	struct ipoib_tx_buf *tx_ring;
+	struct ipoib_vmap    tx_vmap_ring;
+	struct ipoib_cm_tx_buf *tx_ring;
 	unsigned             tx_head;
 	unsigned             tx_tail;
 	unsigned long        flags;
 	u32                  mtu;
-	struct ib_wc         ibwc[IPOIB_NUM_WC];
 };
 
 struct ipoib_cm_rx_buf {
@@ -286,8 +303,14 @@ struct ipoib_cm_rx_buf {
 	u64 mapping[IPOIB_CM_RX_SG];
 };
 
+struct ipoib_cm_rx_wr {
+	struct ib_recv_wr	wr;
+	struct ib_sge		rx_sge[IPOIB_CM_RX_SG];
+};
+
 struct ipoib_cm_dev_priv {
 	struct ib_srq  	       *srq;
+	struct ipoib_vmap 	rx_vmap_srq_ring;
 	struct ipoib_cm_rx_buf *srq_ring;
 	struct ib_cm_id        *id;
 	struct list_head        passive_ids;   /* state: LIVE */
@@ -306,6 +329,14 @@ struct ipoib_cm_dev_priv {
 	struct ib_wc            ibwc[IPOIB_NUM_WC];
 	struct ib_sge           rx_sge[IPOIB_CM_RX_SG];
 	struct ib_recv_wr       rx_wr;
+	int			nonsrq_conn_qp;
+	int			max_cm_mtu;
+	int			num_frags;
+	struct ipoib_cm_rx_wr  *head;
+	struct ipoib_cm_rx_wr  *tail;
+	struct ipoib_vmap 	rx_vmap_wr_arr;
+	struct ipoib_cm_rx_wr  *rx_wr_arr;
+	int			rx_skipped;
 };
 
 struct ipoib_ethtool_st {
@@ -322,7 +353,10 @@ struct ipoib_ethtool_st {
 struct ipoib_dev_priv {
 	spinlock_t lock;
 
-	struct net_device *dev;
+	struct net_device      *dev;
+	struct ib_recv_wr	rx_wr_draft[UD_POST_RCV_COUNT];
+	struct ib_sge 		sglist_draft[UD_POST_RCV_COUNT][IPOIB_UD_RX_SG];
+	unsigned int		rx_outst;
 
 	unsigned long flags;
 
@@ -349,7 +383,8 @@ struct ipoib_dev_priv {
 	u16               pkey_index;
 	struct ib_pd  	 *pd;
 	struct ib_mr  	 *mr;
-	struct ib_cq  	 *cq;
+	struct ib_cq  	 *rcq;
+	struct ib_cq  	 *scq;
 	struct ib_qp  	 *qp;
 	u32           	  qkey;
 
@@ -359,9 +394,11 @@ struct ipoib_dev_priv {
 	unsigned int admin_mtu;
 	unsigned int mcast_mtu;
 
-	struct ipoib_rx_buf *rx_ring;
+	struct ipoib_vmap	rx_vmap_ring;
+	struct ipoib_sg_rx_buf *rx_ring;
 
 	spinlock_t           tx_lock;
+	struct ipoib_vmap    tx_vmap_ring;
 	struct ipoib_tx_buf *tx_ring;
 	unsigned             tx_head;
 	unsigned             tx_tail;
@@ -369,7 +406,9 @@ struct ipoib_dev_priv {
 	struct ib_send_wr    tx_wr;
 	unsigned             tx_outstanding;
 
-	struct ib_wc ibwc[IPOIB_NUM_WC];
+	struct ib_wc 	     ibwc[IPOIB_NUM_WC];
+	struct ib_wc         send_wc[MAX_SEND_CQE];
+	unsigned int	     tx_poll;
 
 	struct list_head dead_ahs;
 
@@ -391,6 +430,9 @@ struct ipoib_dev_priv {
 	struct dentry *path_dentry;
 #endif
 	struct ipoib_ethtool_st etool;
+	struct timer_list poll_timer;
+	struct ib_ah *own_ah;
+ 	int max_ib_mtu;
 };
 
 struct ipoib_ah {
@@ -431,6 +473,22 @@ struct ipoib_neigh {
 	struct list_head    list;
 };
 
+#define IPOIB_UD_MTU(ib_mtu)		(ib_mtu - IPOIB_ENCAP_LEN)
+#define IPOIB_UD_BUF_SIZE(ib_mtu)	(ib_mtu + IB_GRH_BYTES)
+static inline int ipoib_ud_need_sg(int ib_mtu)
+{
+	return (IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE) ? 1 : 0;
+}
+static inline void ipoib_sg_dma_unmap_rx(struct ipoib_dev_priv *priv,
+					 u64 mapping[IPOIB_UD_RX_SG])
+{
+	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+		ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_UD_HEAD_SIZE, DMA_FROM_DEVICE);
+		ib_dma_unmap_page(priv->ca, mapping[1], PAGE_SIZE, DMA_FROM_DEVICE);
+	} else
+		ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_UD_BUF_SIZE(priv->max_ib_mtu), DMA_FROM_DEVICE);
+}
+
 /*
  * We stash a pointer to our private neighbour information after our
  * hardware address in neigh->ha.  The ALIGN() expression here makes
@@ -452,7 +510,7 @@ extern struct workqueue_struct *ipoib_workqueue;
 /* functions */
 
 int ipoib_poll(struct net_device *dev, int *budget);
-void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr);
+void ipoib_ib_rx_completion(struct ib_cq *cq, void *dev_ptr);
 
 struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
 				 struct ib_pd *pd, struct ib_ah_attr *attr);
@@ -531,9 +589,10 @@ void ipoib_pkey_poll(struct work_struct *work);
 int ipoib_pkey_dev_delay_open(struct net_device *dev);
 void ipoib_drain_cq(struct net_device *dev);
 
-#define IPOIB_FLAGS_HWCSUM      0x01
-
 void ipoib_set_ethtool_ops(struct net_device *dev);
+void destroy_own_ah(struct ipoib_dev_priv *priv);
+int ipoib_vmalloc(struct ipoib_vmap *buf, int size);
+void ipoib_vfree(struct ipoib_vmap *buf);
 
 #ifdef CONFIG_INFINIBAND_IPOIB_CM
 
@@ -543,6 +602,8 @@ void ipoib_set_ethtool_ops(struct net_device *dev);
 /* We don't support UC connections at the moment */
 #define IPOIB_CM_SUPPORTED(ha)   (ha[0] & (IPOIB_FLAGS_RC))
 
+extern int ipoib_max_conn_qp;
+
 static inline int ipoib_cm_admin_enabled(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -573,6 +634,18 @@ static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *t
 	neigh->cm = tx;
 }
 
+static inline unsigned int ipoib_cm_max_mtu(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	return priv->cm.max_cm_mtu;
+}
+
+static inline int ipoib_cm_has_srq(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	return !!priv->cm.srq;
+}
+
 void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx);
 int ipoib_cm_dev_open(struct net_device *dev);
 void ipoib_cm_dev_stop(struct net_device *dev);
@@ -590,6 +663,8 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc);
 
 struct ipoib_cm_tx;
 
+#define ipoib_max_conn_qp 0
+
 static inline int ipoib_cm_admin_enabled(struct net_device *dev)
 {
 	return 0;
@@ -615,6 +690,16 @@ static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *t
 {
 }
 
+static inline unsigned int ipoib_cm_max_mtu(struct net_device *dev)
+{
+	return 0;
+}
+
+static inline int ipoib_cm_has_srq(struct net_device *dev)
+{
+	return 0;
+}
+
 static inline
 void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
 {
@@ -691,7 +776,6 @@ static inline int ipoib_register_debugfs(void) { return 0; }
 static inline void ipoib_unregister_debugfs(void) { }
 #endif
 
-
 #define ipoib_printk(level, priv, format, arg...)	\
 	printk(level "%s: " format, ((struct ipoib_dev_priv *) priv)->dev->name , ## arg)
 #define ipoib_warn(priv, format, arg...)		\
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 7ec9b65..7c34ac0 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -39,6 +39,18 @@
 #include <linux/icmpv6.h>
 #include <linux/delay.h>
 
+int ipoib_max_conn_qp = 128;
+
+module_param_named(max_nonsrq_conn_qp, ipoib_max_conn_qp, int, 0444);
+MODULE_PARM_DESC(max_nonsrq_conn_qp,
+		 "Max number of connected-mode QPs per interface "
+		 "(applied only if shared receive queue is not available)");
+
+static int ipoib_set_nonsrq = 0;
+module_param_named(set_nonsrq, ipoib_set_nonsrq, int, 0644);
+MODULE_PARM_DESC(set_nonsrq, "set to dictate working in none SRQ mode"
+		 ", otherwise act according to device capabilities");
+
 #define to_net_dev(class) container_of(class, struct net_device, class_dev)
 
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
@@ -83,7 +95,56 @@ static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags,
 		ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
 }
 
-static int ipoib_cm_post_receive(struct net_device *dev, int id)
+static int ipoib_cm_post_receive_srq(struct net_device *dev, int id, int pi)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_recv_wr *bad_wr;
+	int i, ret = 0;
+	struct ipoib_cm_rx_wr *cur;
+	int post;
+
+	ipoib_dbg_data(priv, "posting to id=%d, pi=%d\n", id, pi);
+	cur = &priv->cm.rx_wr_arr[id];
+	if (!priv->cm.head)
+		priv->cm.head = cur;
+
+	if (priv->cm.tail)
+		priv->cm.tail->wr.next = &cur->wr;
+
+	cur->wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
+	cur->wr.next = NULL;
+
+	for (i = 0; i < priv->cm.num_frags; ++i)
+		cur->rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i];
+
+	post = pi || (priv->cm.rx_skipped >= CM_POST_SRQ_COUNT);
+	if (post) {
+		ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.head->wr, &bad_wr);
+		if (unlikely(ret)) {
+			ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
+			while (bad_wr) {
+				id = bad_wr->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV);
+				ipoib_cm_dma_unmap_rx(priv, priv->cm.num_frags - 1,
+						      priv->cm.srq_ring[id].mapping);
+				dev_kfree_skb_any(priv->cm.srq_ring[id].skb);
+				priv->cm.srq_ring[id].skb = NULL;
+				bad_wr = bad_wr->next;
+			}
+		} else {
+			priv->cm.rx_skipped = 0;
+			priv->cm.head = NULL;
+			priv->cm.tail = NULL;
+		}
+	} else {
+		++priv->cm.rx_skipped;
+		priv->cm.tail = cur;
+	}
+
+	return ret;
+}
+
+static int ipoib_cm_post_receive_nonsrq(struct net_device *dev,
+					struct ipoib_cm_rx *rx, int id)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_recv_wr *bad_wr;
@@ -92,21 +153,23 @@ static int ipoib_cm_post_receive(struct net_device *dev, int id)
 	priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
 
 	for (i = 0; i < IPOIB_CM_RX_SG; ++i)
-		priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i];
+		priv->cm.rx_sge[i].addr = rx->rx_ring[id].mapping[i];
 
-	ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr);
+	ret = ib_post_recv(rx->qp, &priv->cm.rx_wr, &bad_wr);
 	if (unlikely(ret)) {
-		ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
+		ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret);
 		ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
-				      priv->cm.srq_ring[id].mapping);
-		dev_kfree_skb_any(priv->cm.srq_ring[id].skb);
-		priv->cm.srq_ring[id].skb = NULL;
+				      rx->rx_ring[id].mapping);
+		dev_kfree_skb_any(rx->rx_ring[id].skb);
+		rx->rx_ring[id].skb = NULL;
 	}
 
 	return ret;
 }
 
-static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, int id, int frags,
+static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev,
+					     struct ipoib_cm_rx_buf *rx_ring,
+					     int id, int frags,
 					     u64 mapping[IPOIB_CM_RX_SG])
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -143,7 +206,7 @@ static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, int id, int
 			goto partial_error;
 	}
 
-	priv->cm.srq_ring[id].skb = skb;
+	rx_ring[id].skb = skb;
 	return skb;
 
 partial_error:
@@ -157,7 +220,23 @@ partial_error:
 	return NULL;
 }
 
-static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv* priv)
+static void ipoib_cm_free_rx_ring(struct net_device *dev,
+				  struct ipoib_cm_rx_buf *rx_ring)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int i;
+
+	for (i = 0; i < ipoib_recvq_size; ++i)
+		if (rx_ring[i].skb) {
+			ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
+					      rx_ring[i].mapping);
+			dev_kfree_skb_any(rx_ring[i].skb);
+		}
+
+	ipoib_vfree(&priv->cm.rx_vmap_srq_ring);
+}
+
+static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv)
 {
 	struct ib_send_wr *bad_wr;
 	struct ipoib_cm_rx *p;
@@ -201,8 +280,8 @@ static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_qp_init_attr attr = {
 		.event_handler = ipoib_cm_rx_event_handler,
-		.send_cq = priv->cq, /* For drain WR */
-		.recv_cq = priv->cq,
+		.send_cq = priv->rcq, /* For drain WR */
+		.recv_cq = priv->rcq,
 		.srq = priv->cm.srq,
 		.cap.max_send_wr = 1, /* For drain WR */
 		.cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
@@ -210,12 +289,18 @@ static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
 		.qp_type = IB_QPT_RC,
 		.qp_context = p,
 	};
+
+	if (!ipoib_cm_has_srq(dev)) {
+		attr.cap.max_recv_wr  = ipoib_recvq_size;
+		attr.cap.max_recv_sge = IPOIB_CM_RX_SG;
+	}
+
 	return ib_create_qp(priv->pd, &attr);
 }
 
 static int ipoib_cm_modify_rx_qp(struct net_device *dev,
-				  struct ib_cm_id *cm_id, struct ib_qp *qp,
-				  unsigned psn)
+				 struct ib_cm_id *cm_id, struct ib_qp *qp,
+				 unsigned psn)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_qp_attr qp_attr;
@@ -268,6 +353,60 @@ static int ipoib_cm_modify_rx_qp(struct net_device *dev,
 	return 0;
 }
 
+static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_id,
+				   struct ipoib_cm_rx *rx)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int ret;
+	int i;
+
+	rx->rx_ring = kcalloc(ipoib_recvq_size, sizeof *rx->rx_ring, GFP_KERNEL);
+	if (!rx->rx_ring)
+		return -ENOMEM;
+
+	spin_lock_irq(&priv->lock);
+
+	if (priv->cm.nonsrq_conn_qp >= ipoib_max_conn_qp) {
+		spin_unlock_irq(&priv->lock);
+		ib_send_cm_rej(cm_id, IB_CM_REJ_NO_QP, NULL, 0, NULL, 0);
+		ret = -EINVAL;
+		goto err_free;
+	} else
+		++priv->cm.nonsrq_conn_qp;
+
+	spin_unlock_irq(&priv->lock);
+
+	for (i = 0; i < ipoib_recvq_size; ++i) {
+		if (!ipoib_cm_alloc_rx_skb(dev, rx->rx_ring, i, IPOIB_CM_RX_SG - 1,
+					   rx->rx_ring[i].mapping)) {
+			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
+				ret = -ENOMEM;
+				goto err_count;
+			}
+		ret = ipoib_cm_post_receive_nonsrq(dev, rx, i);
+		if (ret) {
+			ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq "
+				   "failed for buf %d\n", i);
+			ret = -EIO;
+			goto err_count;
+		}
+	}
+
+	rx->recv_count = ipoib_recvq_size;
+
+	return 0;
+
+err_count:
+	spin_lock_irq(&priv->lock);
+	--priv->cm.nonsrq_conn_qp;
+	spin_unlock_irq(&priv->lock);
+
+err_free:
+	ipoib_cm_free_rx_ring(dev, rx->rx_ring);
+
+	return ret;
+}
+
 static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id,
 			     struct ib_qp *qp, struct ib_cm_req_event_param *req,
 			     unsigned psn)
@@ -283,7 +422,7 @@ static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id,
 	rep.private_data_len = sizeof data;
 	rep.flow_control = 0;
 	rep.rnr_retry_count = req->rnr_retry_count;
-	rep.srq = 1;
+	rep.srq = ipoib_cm_has_srq(dev);
 	rep.qp_num = qp->qp_num;
 	rep.starting_psn = psn;
 	return ib_send_cm_rep(cm_id, &rep);
@@ -319,6 +458,12 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even
 	if (ret)
 		goto err_modify;
 
+	if (!ipoib_cm_has_srq(dev)) {
+		ret = ipoib_cm_nonsrq_init_rx(dev, cm_id, p);
+		if (ret)
+			goto err_modify;
+	}
+
 	spin_lock_irq(&priv->lock);
 	queue_delayed_work(ipoib_workqueue,
 			   &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
@@ -403,13 +548,15 @@ static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,
 void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_cm_rx_buf *rx_ring;
 	unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV);
 	struct sk_buff *skb, *newskb;
 	struct ipoib_cm_rx *p;
 	unsigned long flags;
 	u64 mapping[IPOIB_CM_RX_SG];
 	int frags;
-	struct ipoib_header *header;
+	int has_srq;
+	struct sk_buff *small_skb;
 
 	ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n",
 		       wr_id, wc->status);
@@ -427,18 +574,32 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 		return;
 	}
 
-	skb  = priv->cm.srq_ring[wr_id].skb;
+	p = wc->qp->qp_context;
+
+	has_srq = ipoib_cm_has_srq(dev);
+	rx_ring = has_srq ? priv->cm.srq_ring : p->rx_ring;
+
+	skb = rx_ring[wr_id].skb;
 
 	if (unlikely(wc->status != IB_WC_SUCCESS)) {
 		ipoib_dbg(priv, "cm recv error "
 			   "(status=%d, wrid=%d vend_err %x)\n",
 			   wc->status, wr_id, wc->vendor_err);
 		++priv->stats.rx_dropped;
-		goto repost;
+		if (has_srq)
+			goto repost;
+		else {
+			if (!--p->recv_count) {
+				spin_lock_irqsave(&priv->lock, flags);
+				list_move(&p->list, &priv->cm.rx_reap_list);
+				spin_unlock_irqrestore(&priv->lock, flags);
+				queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
+			}
+			return;
+		}
 	}
 
 	if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) {
-		p = wc->qp->qp_context;
 		if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
 			spin_lock_irqsave(&priv->lock, flags);
 			p->jiffies = jiffies;
@@ -450,10 +611,24 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 		}
 	}
 
+	if (wc->byte_len < SKB_TSHOLD) {
+		int dlen = wc->byte_len - IPOIB_ENCAP_LEN;
+
+		small_skb = dev_alloc_skb(dlen);
+		if (small_skb) {
+			small_skb->protocol = ((struct ipoib_header *)skb->data)->proto;
+			skb_copy_from_linear_data_offset(skb, IPOIB_ENCAP_LEN,
+							 small_skb->data, dlen);
+			skb_put(small_skb, dlen);
+			skb = small_skb;
+			goto copied;
+		}
+	}
+
 	frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len,
 					      (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE;
 
-	newskb = ipoib_cm_alloc_rx_skb(dev, wr_id, frags, mapping);
+	newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, mapping);
 	if (unlikely(!newskb)) {
 		/*
 		 * If we can't allocate a new RX buffer, dump
@@ -464,21 +639,19 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 		goto repost;
 	}
 
-	ipoib_cm_dma_unmap_rx(priv, frags, priv->cm.srq_ring[wr_id].mapping);
-	memcpy(priv->cm.srq_ring[wr_id].mapping, mapping, (frags + 1) * sizeof *mapping);
+	ipoib_cm_dma_unmap_rx(priv, frags, rx_ring[wr_id].mapping);
+	memcpy(rx_ring[wr_id].mapping, mapping, (frags + 1) * sizeof *mapping);
 
 	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
 		       wc->byte_len, wc->slid);
 
 	skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb);
 
-	header = (struct ipoib_header *)skb->data;
-	skb->protocol = header->proto;
-	if (header->flags & cpu_to_be16(IPOIB_HEADER_F_HWCSUM))
-		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
 	skb_reset_mac_header(skb);
 	skb_pull(skb, IPOIB_ENCAP_LEN);
 
+copied:
 	dev->last_rx = jiffies;
 	++priv->stats.rx_packets;
 	priv->stats.rx_bytes += skb->len;
@@ -489,29 +662,30 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 	netif_receive_skb(skb);
 
 repost:
-	if (unlikely(ipoib_cm_post_receive(dev, wr_id)))
-		ipoib_warn(priv, "ipoib_cm_post_receive failed "
-			   "for buf %d\n", wr_id);
+	if (has_srq) {
+		if (unlikely(ipoib_cm_post_receive_srq(dev, wr_id, 0)))
+			ipoib_warn(priv, "ipoib_cm_post_receive_srq failed "
+				   "for buf %d\n", wr_id);
+	} else {
+		if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p, wr_id))) {
+			--p->recv_count;
+			ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed "
+				   "for buf %d\n", wr_id);
+		}
+	}
 }
 
 static inline int post_send(struct ipoib_dev_priv *priv,
 			    struct ipoib_cm_tx *tx,
 			    unsigned int wr_id,
-			    u64 *mapping, int headlen,
-			    skb_frag_t *frags,
-			    int nr_frags)
+			    u64 addr, int len)
 {
 	struct ib_send_wr *bad_wr;
-	int i;
 
-	priv->tx_sge[0].addr   = mapping[0];
-	priv->tx_sge[0].length = headlen;
-	for (i = 0; i < nr_frags; ++i) {
-		priv->tx_sge[i + 1].addr = mapping[i + 1];
-		priv->tx_sge[i + 1].length = frags[i].size;
-	}
-	priv->tx_wr.num_sge    = nr_frags + 1;
-	priv->tx_wr.wr_id      = wr_id | IPOIB_OP_CM;
+	priv->tx_sge[0].addr          = addr;
+	priv->tx_sge[0].length        = len;
+
+	priv->tx_wr.wr_id 	      = wr_id | IPOIB_OP_CM;
 
 	return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
 }
@@ -519,7 +693,8 @@ static inline int post_send(struct ipoib_dev_priv *priv,
 void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	struct ipoib_tx_buf *tx_req;
+	struct ipoib_cm_tx_buf *tx_req;
+	u64 addr;
 
 	if (unlikely(skb->len > tx->mtu)) {
 		ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
@@ -542,20 +717,20 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
 	 */
 	tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
 	tx_req->skb = skb;
-	if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
+	addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE);
+	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
 		++priv->stats.tx_errors;
 		dev_kfree_skb_any(skb);
 		return;
 	}
 
-	if (unlikely(post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
-			       tx_req->mapping, skb_headlen(skb),
-			       skb_shinfo(skb)->frags,
-			       skb_shinfo(skb)->nr_frags))) {
+	tx_req->mapping = addr;
 
+	if (unlikely(post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
+			       addr, skb->len))) {
 		ipoib_warn(priv, "post_send failed\n");
 		++priv->stats.tx_errors;
-		ipoib_dma_unmap_tx(priv->ca, tx_req);
+		ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
 		dev_kfree_skb_any(skb);
 	} else {
 		dev->trans_start = jiffies;
@@ -574,7 +749,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_cm_tx *tx = wc->qp->qp_context;
 	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
-	struct ipoib_tx_buf *tx_req;
+	struct ipoib_cm_tx_buf *tx_req;
 	unsigned long flags;
 
 	ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
@@ -588,7 +763,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 
 	tx_req = &tx->tx_ring[wr_id];
 
-	ipoib_dma_unmap_tx(priv->ca, tx_req);
+	ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE);
 
 	/* FIXME: is this right? Shouldn't we only increment on success? */
 	++priv->stats.tx_packets;
@@ -669,10 +844,33 @@ err_cm:
 	return ret;
 }
 
+static void ipoib_cm_free_rx_reap_list(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_cm_rx *rx, *n;
+	LIST_HEAD(list);
+
+	spin_lock_irq(&priv->lock);
+	list_splice_init(&priv->cm.rx_reap_list, &list);
+	spin_unlock_irq(&priv->lock);
+
+	list_for_each_entry_safe(rx, n, &list, list) {
+		ib_destroy_cm_id(rx->id);
+		ib_destroy_qp(rx->qp);
+		if (!ipoib_cm_has_srq(dev)) {
+			ipoib_cm_free_rx_ring(priv->dev, rx->rx_ring);
+			spin_lock_irq(&priv->lock);
+			--priv->cm.nonsrq_conn_qp;
+			spin_unlock_irq(&priv->lock);
+		}
+		kfree(rx);
+	}
+}
+
 void ipoib_cm_dev_stop(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	struct ipoib_cm_rx *p, *n;
+	struct ipoib_cm_rx *p;
 	unsigned long begin;
 	LIST_HEAD(list);
 	int ret;
@@ -707,9 +905,9 @@ void ipoib_cm_dev_stop(struct net_device *dev)
 			/*
 			 * assume the HW is wedged and just free up everything.
 			 */
-			list_splice_init(&priv->cm.rx_flush_list, &list);
-			list_splice_init(&priv->cm.rx_error_list, &list);
-			list_splice_init(&priv->cm.rx_drain_list, &list);
+			list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_reap_list);
+			list_splice_init(&priv->cm.rx_error_list, &priv->cm.rx_reap_list);
+			list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
 			break;
 		}
 		spin_unlock_irq(&priv->lock);
@@ -718,15 +916,9 @@ void ipoib_cm_dev_stop(struct net_device *dev)
 		spin_lock_irq(&priv->lock);
 	}
 
-	list_splice_init(&priv->cm.rx_reap_list, &list);
-
 	spin_unlock_irq(&priv->lock);
 
-	list_for_each_entry_safe(p, n, &list, list) {
-		ib_destroy_cm_id(p->id);
-		ib_destroy_qp(p->qp);
-		kfree(p);
-	}
+	ipoib_cm_free_rx_reap_list(dev);
 
 	cancel_delayed_work(&priv->cm.stale_task);
 }
@@ -803,16 +995,15 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_qp_init_attr attr = {
-		.send_cq		= priv->cq,
-		.recv_cq		= priv->cq,
+		.send_cq		= priv->rcq,
+		.recv_cq		= priv->rcq,
 		.srq			= priv->cm.srq,
 		.cap.max_send_wr	= ipoib_sendq_size,
-		.cap.max_send_sge	= dev->features & NETIF_F_SG ?
-							MAX_SKB_FRAGS + 1 : 1,
+		.cap.max_send_sge	= 1,
 		.sq_sig_type		= IB_SIGNAL_ALL_WR,
 		.qp_type		= IB_QPT_RC,
 		.qp_context		= tx
-        };
+	};
 
 	return ib_create_qp(priv->pd, &attr);
 }
@@ -850,7 +1041,7 @@ static int ipoib_cm_send_req(struct net_device *dev,
 	req.retry_count 	      = 0; /* RFC draft warns against retries */
 	req.rnr_retry_count 	      = 0; /* RFC draft warns against retries */
 	req.max_cm_retries 	      = 15;
-	req.srq 	              = 1;
+	req.srq 	              = ipoib_cm_has_srq(dev);
 	return ib_send_cm_req(id, &req);
 }
 
@@ -885,13 +1076,11 @@ static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn,
 	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
 	int ret;
 
-	p->tx_ring = kzalloc(ipoib_sendq_size * sizeof *p->tx_ring,
-				GFP_KERNEL);
-	if (!p->tx_ring) {
-		ipoib_warn(priv, "failed to allocate tx ring\n");
+	if (ipoib_vmalloc(&p->tx_vmap_ring, ipoib_sendq_size * sizeof *p->tx_ring)) {
 		ret = -ENOMEM;
 		goto err_tx;
 	}
+	p->tx_ring = p->tx_vmap_ring.ptr;
 
 	p->qp = ipoib_cm_create_tx_qp(p->dev, p);
 	if (IS_ERR(p->qp)) {
@@ -939,7 +1128,7 @@ err_tx:
 static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
-	struct ipoib_tx_buf *tx_req;
+	struct ipoib_cm_tx_buf *tx_req;
 	unsigned long flags;
 	unsigned long begin;
 
@@ -967,7 +1156,8 @@ timeout:
 
 	while ((int) p->tx_tail - (int) p->tx_head < 0) {
 		tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
-		ipoib_dma_unmap_tx(priv->ca, tx_req);
+		ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
+				    DMA_TO_DEVICE);
 		dev_kfree_skb_any(tx_req->skb);
 		++p->tx_tail;
 		spin_lock_irqsave(&priv->tx_lock, flags);
@@ -981,7 +1171,7 @@ timeout:
 	if (p->qp)
 		ib_destroy_qp(p->qp);
 
-	kfree(p->tx_ring);
+	ipoib_vfree(&p->tx_vmap_ring);
 	kfree(p);
 }
 
@@ -1162,7 +1352,7 @@ static void ipoib_cm_skb_reap(struct work_struct *work)
 	spin_unlock_irq(&priv->tx_lock);
 }
 
-void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
+void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
 			   unsigned int mtu)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -1178,20 +1368,8 @@ void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
 
 static void ipoib_cm_rx_reap(struct work_struct *work)
 {
-	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
-						   cm.rx_reap_task);
-	struct ipoib_cm_rx *p, *n;
-	LIST_HEAD(list);
-
-	spin_lock_irq(&priv->lock);
-	list_splice_init(&priv->cm.rx_reap_list, &list);
-	spin_unlock_irq(&priv->lock);
-
-	list_for_each_entry_safe(p, n, &list, list) {
-		ib_destroy_cm_id(p->id);
-		ib_destroy_qp(p->qp);
-		kfree(p);
-	}
+	ipoib_cm_free_rx_reap_list(container_of(work, struct ipoib_dev_priv,
+						cm.rx_reap_task)->dev);
 }
 
 static void ipoib_cm_stale_task(struct work_struct *work)
@@ -1244,11 +1422,9 @@ static ssize_t set_mode(struct class_device *d, const char *buf, size_t count)
 		ipoib_warn(priv, "enabling connected mode "
 			   "will cause multicast packet drops\n");
 
-		/* clear ipv6 flag too */
-		dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
+		dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_SG | NETIF_F_TSO);
 
-		priv->tx_wr.send_flags &=
-			~(IB_SEND_UDP_TCP_CSUM | IB_SEND_IP_CSUM);
+		priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
 
 		ipoib_flush_paths(dev);
 		return count;
@@ -1260,11 +1436,10 @@ static ssize_t set_mode(struct class_device *d, const char *buf, size_t count)
 		ipoib_flush_paths(dev);
 
 		if (priv->ca->flags & IB_DEVICE_IP_CSUM)
-			dev->features |= NETIF_F_IP_CSUM; /* ipv6 too */
+			dev->features |= NETIF_F_IP_CSUM | NETIF_F_SG;
 
 
-		if (!test_bit(IPOIB_FLAG_HW_CSUM, &priv->flags) &&
-		    priv->dev->features & NETIF_F_SG &&
+		if (priv->dev->features & NETIF_F_SG &&
 		    priv->ca->flags & IB_DEVICE_TCP_TSO)
 			priv->dev->features |= NETIF_F_TSO;
 
@@ -1281,16 +1456,58 @@ int ipoib_cm_add_mode_attr(struct net_device *dev)
 	return class_device_create_file(&dev->class_dev, &class_device_attr_mode);
 }
 
-int ipoib_cm_dev_init(struct net_device *dev)
+static void ipoib_cm_create_srq(struct net_device *dev, int max_sge)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_srq_init_attr srq_init_attr = {
 		.attr = {
 			.max_wr  = ipoib_recvq_size,
-			.max_sge = IPOIB_CM_RX_SG
+			.max_sge = max_sge
 		}
 	};
-	int ret, i;
+
+	if (ipoib_set_nonsrq)
+		return;
+
+	priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr);
+	if (IS_ERR(priv->cm.srq)) {
+		if (PTR_ERR(priv->cm.srq) != -ENOSYS)
+			printk(KERN_WARNING "%s: failed to allocate SRQ, error %ld\n",
+			       priv->ca->name, PTR_ERR(priv->cm.srq));
+		priv->cm.srq = NULL;
+		return;
+	}
+
+	if (ipoib_vmalloc(&priv->cm.rx_vmap_wr_arr, ipoib_recvq_size *
+			  sizeof priv->cm.rx_wr_arr[0])) {
+		ipoib_warn(priv, "failed allocating SRQ wr array\n");
+		goto destory_srq;
+	}
+        priv->cm.rx_wr_arr = priv->cm.rx_vmap_wr_arr.ptr;
+
+	if (ipoib_vmalloc(&priv->cm.rx_vmap_srq_ring, ipoib_recvq_size *
+			  sizeof *priv->cm.srq_ring)) {
+		printk(KERN_WARNING "%s: failed to allocate CM SRQ ring (%d entries)\n",
+		       priv->ca->name, ipoib_recvq_size);
+		goto free_wr_array;
+	}
+	priv->cm.srq_ring = priv->cm.rx_vmap_srq_ring.ptr;
+
+	return;
+
+free_wr_array:
+	ipoib_vfree(&priv->cm.rx_vmap_wr_arr);
+	priv->cm.rx_wr_arr = NULL;
+destory_srq:
+	ib_destroy_srq(priv->cm.srq);
+	priv->cm.srq = NULL;
+}
+
+int ipoib_cm_dev_init(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int i, ret, j;
+	struct ib_device_attr attr;
 
 	INIT_LIST_HEAD(&priv->cm.passive_ids);
 	INIT_LIST_HEAD(&priv->cm.reap_list);
@@ -1307,43 +1524,65 @@ int ipoib_cm_dev_init(struct net_device *dev)
 
 	skb_queue_head_init(&priv->cm.skb_queue);
 
-	priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr);
-	if (IS_ERR(priv->cm.srq)) {
-		ret = PTR_ERR(priv->cm.srq);
-		priv->cm.srq = NULL;
+	ret = ib_query_device(priv->ca, &attr);
+	if (ret) {
+		printk(KERN_WARNING "ib_query_device() failed with %d\n", ret);
 		return ret;
 	}
 
-	priv->cm.srq_ring = kzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring,
-				    GFP_KERNEL);
-	if (!priv->cm.srq_ring) {
-		printk(KERN_WARNING "%s: failed to allocate CM ring (%d entries)\n",
-		       priv->ca->name, ipoib_recvq_size);
-		ipoib_cm_dev_cleanup(dev);
-		return -ENOMEM;
+	ipoib_dbg(priv, "max_srq_sge=%d\n", attr.max_srq_sge);
+
+	attr.max_srq_sge = min(IPOIB_CM_RX_SG, attr.max_srq_sge);
+	ipoib_cm_create_srq(dev, attr.max_srq_sge);
+	if (ipoib_cm_has_srq(dev)) {
+		priv->cm.max_cm_mtu = attr.max_srq_sge * PAGE_SIZE - 0x10;
+		priv->cm.num_frags  = attr.max_srq_sge;
+		ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n",
+			  priv->cm.max_cm_mtu, priv->cm.num_frags);
+	} else {
+		priv->cm.max_cm_mtu = IPOIB_CM_MTU;
+		priv->cm.num_frags  = IPOIB_CM_RX_SG;
 	}
 
-	for (i = 0; i < IPOIB_CM_RX_SG; ++i)
+	for (i = 0; i < priv->cm.num_frags; ++i)
 		priv->cm.rx_sge[i].lkey	= priv->mr->lkey;
 
+	if (ipoib_cm_has_srq(dev)) {
+		for (j = 0; j < ipoib_recvq_size; ++j) {
+			for (i = 0; i < priv->cm.num_frags; ++i)
+				priv->cm.rx_wr_arr[j].rx_sge[i].lkey = priv->mr->lkey;
+
+			priv->cm.rx_wr_arr[j].rx_sge[0].length = IPOIB_CM_HEAD_SIZE;
+			for (i = 1; i < priv->cm.num_frags; ++i)
+				priv->cm.rx_wr_arr[j].rx_sge[i].length = PAGE_SIZE;
+
+			priv->cm.rx_wr_arr[j].wr.sg_list = priv->cm.rx_wr_arr[j].rx_sge;
+			priv->cm.rx_wr_arr[j].wr.num_sge = priv->cm.num_frags;
+		}
+        	priv->cm.head = &priv->cm.rx_wr_arr[0];
+	}
+
 	priv->cm.rx_sge[0].length = IPOIB_CM_HEAD_SIZE;
-	for (i = 1; i < IPOIB_CM_RX_SG; ++i)
+	for (i = 1; i < priv->cm.num_frags; ++i)
 		priv->cm.rx_sge[i].length = PAGE_SIZE;
 	priv->cm.rx_wr.next = NULL;
 	priv->cm.rx_wr.sg_list = priv->cm.rx_sge;
-	priv->cm.rx_wr.num_sge = IPOIB_CM_RX_SG;
-
-	for (i = 0; i < ipoib_recvq_size; ++i) {
-		if (!ipoib_cm_alloc_rx_skb(dev, i, IPOIB_CM_RX_SG - 1,
-					   priv->cm.srq_ring[i].mapping)) {
-			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
-			ipoib_cm_dev_cleanup(dev);
-			return -ENOMEM;
-		}
-		if (ipoib_cm_post_receive(dev, i)) {
-			ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i);
-			ipoib_cm_dev_cleanup(dev);
-			return -EIO;
+	priv->cm.rx_wr.num_sge = priv->cm.num_frags;
+
+	if (ipoib_cm_has_srq(dev)) {
+		for (i = 0; i < ipoib_recvq_size; ++i) {
+			if (!ipoib_cm_alloc_rx_skb(dev, priv->cm.srq_ring, i,
+						   priv->cm.num_frags - 1,
+						   priv->cm.srq_ring[i].mapping)) {
+				ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
+				ipoib_cm_dev_cleanup(dev);
+				return -ENOMEM;
+			}
+			if (ipoib_cm_post_receive_srq(dev, i, 1)) {
+				ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i);
+				ipoib_cm_dev_cleanup(dev);
+				return -EIO;
+			}
 		}
 	}
 
@@ -1354,7 +1593,7 @@ int ipoib_cm_dev_init(struct net_device *dev)
 void ipoib_cm_dev_cleanup(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	int i, ret;
+	int ret;
 
 	if (!priv->cm.srq)
 		return;
@@ -1368,13 +1607,9 @@ void ipoib_cm_dev_cleanup(struct net_device *dev)
 	priv->cm.srq = NULL;
 	if (!priv->cm.srq_ring)
 		return;
-	for (i = 0; i < ipoib_recvq_size; ++i)
-		if (priv->cm.srq_ring[i].skb) {
-			ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
-					      priv->cm.srq_ring[i].mapping);
-			dev_kfree_skb_any(priv->cm.srq_ring[i].skb);
-			priv->cm.srq_ring[i].skb = NULL;
-		}
-	kfree(priv->cm.srq_ring);
+
+	ipoib_cm_free_rx_ring(dev, priv->cm.srq_ring);
 	priv->cm.srq_ring = NULL;
+
+	ipoib_vfree(&priv->cm.rx_vmap_wr_arr);
 }
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_etool.c b/drivers/infiniband/ulp/ipoib/ipoib_etool.c
index 6fbfee1..4f9211b 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_etool.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_etool.c
@@ -52,7 +52,7 @@ static int ipoib_get_coalesce(struct net_device *dev,
 	coal->rx_coalesce_usecs = priv->etool.coalesce_usecs;
 	coal->tx_coalesce_usecs = priv->etool.coalesce_usecs;
 	coal->rx_max_coalesced_frames = priv->etool.max_coalesced_frames;
-	coal->rx_max_coalesced_frames = priv->etool.max_coalesced_frames;
+	coal->tx_max_coalesced_frames = priv->etool.max_coalesced_frames;
 
 	return 0;
 }
@@ -69,7 +69,7 @@ static int ipoib_set_coalesce(struct net_device *dev,
 	    coal->tx_max_coalesced_frames > 0xffff)
 		return -EINVAL;
 
-	ret = ib_modify_cq(priv->cq, coal->rx_max_coalesced_frames,
+	ret = ib_modify_cq(priv->rcq, coal->rx_max_coalesced_frames,
 	coal->rx_coalesce_usecs);
 	if (ret) {
 			ipoib_dbg(priv, "failed modifying CQ\n");
@@ -78,7 +78,7 @@ static int ipoib_set_coalesce(struct net_device *dev,
 
 	coal->tx_coalesce_usecs = coal->rx_coalesce_usecs;
 	priv->etool.coalesce_usecs = coal->rx_coalesce_usecs;
-	coal->rx_max_coalesced_frames = coal->rx_max_coalesced_frames;
+	coal->tx_max_coalesced_frames = coal->rx_max_coalesced_frames;
 	priv->etool.max_coalesced_frames = coal->rx_max_coalesced_frames;
 
 	return 0;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index b78188a..cba3dfb 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -90,44 +90,92 @@ void ipoib_free_ah(struct kref *kref)
 	spin_unlock_irqrestore(&priv->lock, flags);
 }
 
-static int ipoib_ib_post_receive(struct net_device *dev, int id)
+static void clean_pending_receives(struct ipoib_dev_priv *priv)
 {
-	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	struct ib_sge list;
-	struct ib_recv_wr param;
-	struct ib_recv_wr *bad_wr;
-	int ret;
-
-	list.addr     = priv->rx_ring[id].mapping;
-	list.length   = IPOIB_BUF_SIZE;
-	list.lkey     = priv->mr->lkey;
-
-	param.next    = NULL;
-	param.wr_id   = id | IPOIB_OP_RECV;
-	param.sg_list = &list;
-	param.num_sge = 1;
+	int i;
+	int id;
 
-	ret = ib_post_recv(priv->qp, &param, &bad_wr);
-	if (unlikely(ret)) {
-		ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
-		ib_dma_unmap_single(priv->ca, priv->rx_ring[id].mapping,
-				    IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+	for (i = 0; i < priv->rx_outst; ++i) {
+		id = priv->rx_wr_draft[i].wr_id & ~IPOIB_OP_RECV;
+		ipoib_sg_dma_unmap_rx(priv,
+				      priv->rx_ring[id].mapping);
 		dev_kfree_skb_any(priv->rx_ring[id].skb);
 		priv->rx_ring[id].skb = NULL;
 	}
+	priv->rx_outst = 0;
+}
+
+static void ipoib_ud_skb_put_frags(struct ipoib_dev_priv *priv, struct sk_buff *skb,
+				   unsigned int length)
+{
+	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+	 	unsigned int size;
+ 		skb_frag_t *frag = &skb_shinfo(skb)->frags[0];
+
+	 	/* put header into skb */
+ 		size = min(length, (unsigned)IPOIB_UD_HEAD_SIZE);
+ 		skb->tail += size;
+ 		skb->len += size;
+ 		length -= size;
+
+ 		size = min(length, (unsigned) PAGE_SIZE);
+ 		frag->size = size;
+ 		skb->data_len += size;
+ 		skb->truesize += size;
+ 		skb->len += size;
+ 		length -= size;
+	} else
+		skb_put(skb, length);
+}
+
+static int ipoib_ib_post_receive(struct net_device *dev, int id)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_recv_wr *bad_wr;
+	int ret = 0;
+	int i = priv->rx_outst;
+
+	priv->sglist_draft[i][0].addr = priv->rx_ring[id].mapping[0];
+	priv->sglist_draft[i][1].addr = priv->rx_ring[id].mapping[1];
+
+	priv->rx_wr_draft[i].wr_id = id | IPOIB_OP_RECV;
+
+	if (++priv->rx_outst == UD_POST_RCV_COUNT) {
+		ret = ib_post_recv(priv->qp, priv->rx_wr_draft, &bad_wr);
+
+		if (unlikely(ret)) {
+			ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
+			while (bad_wr) {
+				id = bad_wr->wr_id & ~IPOIB_OP_RECV;
+				ipoib_sg_dma_unmap_rx(priv,
+						      priv->rx_ring[id].mapping);
+				dev_kfree_skb_any(priv->rx_ring[id].skb);
+				priv->rx_ring[id].skb = NULL;
+				bad_wr = bad_wr->next;
+			}
+		}
+		priv->rx_outst = 0;
+	}
 
 	return ret;
 }
 
-static int ipoib_alloc_rx_skb(struct net_device *dev, int id)
+static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id,
+					   u64 mapping[IPOIB_UD_RX_SG])
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct sk_buff *skb;
-	u64 addr;
+	int buf_size;
+
+	if (ipoib_ud_need_sg(priv->max_ib_mtu))
+		buf_size = IPOIB_UD_HEAD_SIZE;
+	else
+		buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
 
-	skb = dev_alloc_skb(IPOIB_BUF_SIZE + 4);
-	if (!skb)
-		return -ENOMEM;
+	skb = dev_alloc_skb(buf_size + 4);
+
+ 	if (unlikely(!skb))
+ 		return NULL;
 
 	/*
 	 * IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte
@@ -136,17 +184,32 @@ static int ipoib_alloc_rx_skb(struct net_device *dev, int id)
 	 */
 	skb_reserve(skb, 4);
 
-	addr = ib_dma_map_single(priv->ca, skb->data, IPOIB_BUF_SIZE,
-				 DMA_FROM_DEVICE);
-	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
-		dev_kfree_skb_any(skb);
-		return -EIO;
+ 	mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size,
+ 				       DMA_FROM_DEVICE);
+ 	if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) {
+ 		dev_kfree_skb_any(skb);
+ 		return NULL;
+ 	}
+
+	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+		struct page *page = alloc_page(GFP_ATOMIC);
+	 	if (!page)
+ 			goto partial_error;
+
+	 	skb_fill_page_desc(skb, 0, page, 0, PAGE_SIZE);
+ 		mapping[1] = ib_dma_map_page(priv->ca, skb_shinfo(skb)->frags[0].page,
+ 					     0, PAGE_SIZE, DMA_FROM_DEVICE);
+	 	if (unlikely(ib_dma_mapping_error(priv->ca, mapping[1])))
+ 			goto partial_error;
 	}
 
-	priv->rx_ring[id].skb     = skb;
-	priv->rx_ring[id].mapping = addr;
+ 	priv->rx_ring[id].skb = skb;
+ 	return skb;
 
-	return 0;
+partial_error:
+	ib_dma_unmap_single(priv->ca, mapping[0], buf_size, DMA_FROM_DEVICE);
+ 	dev_kfree_skb_any(skb);
+ 	return NULL;
 }
 
 static int ipoib_ib_post_receives(struct net_device *dev)
@@ -155,7 +218,7 @@ static int ipoib_ib_post_receives(struct net_device *dev)
 	int i;
 
 	for (i = 0; i < ipoib_recvq_size; ++i) {
-		if (ipoib_alloc_rx_skb(dev, i)) {
+		if (!ipoib_alloc_rx_skb(dev, i, priv->rx_ring[i].mapping)) {
 			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
 			return -ENOMEM;
 		}
@@ -173,8 +236,7 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV;
 	struct sk_buff *skb;
-	struct ipoib_header *header;
-	u64 addr;
+	u64 mapping[IPOIB_UD_RX_SG];
 
 	ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n",
 		       wr_id, wc->status);
@@ -186,48 +248,41 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 	}
 
 	skb  = priv->rx_ring[wr_id].skb;
-	addr = priv->rx_ring[wr_id].mapping;
 
+	/* duplicate the code here, to omit fast path if need-sg condition check */
 	if (unlikely(wc->status != IB_WC_SUCCESS)) {
 		if (wc->status != IB_WC_WR_FLUSH_ERR)
 			ipoib_warn(priv, "failed recv event "
 				   "(status=%d, wrid=%d vend_err %x)\n",
 				   wc->status, wr_id, wc->vendor_err);
-		ib_dma_unmap_single(priv->ca, addr,
-				    IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+		ipoib_sg_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping);
 		dev_kfree_skb_any(skb);
 		priv->rx_ring[wr_id].skb = NULL;
 		return;
 	}
-
 	/*
 	 * Drop packets that this interface sent, ie multicast packets
 	 * that the HCA has replicated.
 	 */
 	if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num)
 		goto repost;
-
 	/*
 	 * If we can't allocate a new RX buffer, dump
 	 * this packet and reuse the old buffer.
 	 */
-	if (unlikely(ipoib_alloc_rx_skb(dev, wr_id))) {
+	if (unlikely(!ipoib_alloc_rx_skb(dev, wr_id, mapping))) {
 		++priv->stats.rx_dropped;
 		goto repost;
 	}
-
 	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
 		       wc->byte_len, wc->slid);
-
-	ib_dma_unmap_single(priv->ca, addr, IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
-
-	skb_put(skb, wc->byte_len);
+	ipoib_sg_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping);
+	ipoib_ud_skb_put_frags(priv, skb, wc->byte_len);
+	memcpy(priv->rx_ring[wr_id].mapping, mapping,
+	       IPOIB_UD_RX_SG * sizeof *mapping);
 	skb_pull(skb, IB_GRH_BYTES);
 
-	header = (struct ipoib_header *)skb->data;
-	skb->protocol = header->proto;
-	if (header->flags & cpu_to_be16(IPOIB_HEADER_F_HWCSUM))
-		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
 	skb_reset_mac_header(skb);
 	skb_pull(skb, IPOIB_ENCAP_LEN);
 
@@ -240,11 +295,14 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 	skb->pkt_type = PACKET_HOST;
 
 	/* check rx csum */
-	if (test_bit(IPOIB_FLAG_RX_CSUM, &priv->flags) && likely(wc->csum_ok)) {
-		/* Note: this is a specific requirement for Mellanox
-		   HW but since it is the only HW currently supporting
-		   checksum offload I put it here */
-		if ((((struct iphdr *)(skb->data))->ihl) == 5)
+	if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->csum_ok)) {
+		/*
+		 * Note: this is a specific requirement for Mellanox
+		 * HW but since it is the only HW currently supporting
+		 * checksum offload I put it here
+		 */
+		skb_reset_network_header(skb);
+		if (ip_hdr(skb)->ihl == 5)
 			skb->ip_summed = CHECKSUM_UNNECESSARY;
 	}
 
@@ -256,12 +314,10 @@ repost:
 			   "for buf %d\n", wr_id);
 }
 
-static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
+static void _ipoib_ib_handle_tx_wc(struct net_device *dev, int wr_id)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	unsigned int wr_id = wc->wr_id;
 	struct ipoib_tx_buf *tx_req;
-	unsigned long flags;
 
 	ipoib_dbg_data(priv, "send completion: id %d, status: %d\n",
 		       wr_id, wc->status);
@@ -274,28 +330,54 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 
 	tx_req = &priv->tx_ring[wr_id];
 
-	ipoib_dma_unmap_tx(priv->ca, tx_req);
-
-	++priv->stats.tx_packets;
-	priv->stats.tx_bytes += tx_req->skb->len;
-
-	dev_kfree_skb_any(tx_req->skb);
-
-	spin_lock_irqsave(&priv->tx_lock, flags);
+	if (tx_req->skb) {
+		ipoib_dma_unmap_tx(priv->ca, tx_req);
+		++priv->stats.tx_packets;
+		priv->stats.tx_bytes += tx_req->skb->len;
+		dev_kfree_skb_any(tx_req->skb);
+	}
 	++priv->tx_tail;
 	if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
 	    netif_queue_stopped(dev) &&
 	    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
 		netif_wake_queue(dev);
-	spin_unlock_irqrestore(&priv->tx_lock, flags);
+}
+
+static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	unsigned int wr_id = wc->wr_id;
+	int i;
+
+	i = priv->tx_poll;
+	do {
+		i &= (ipoib_sendq_size - 1);
+		_ipoib_ib_handle_tx_wc(dev, i);
+	} while (i++ != wr_id);
+	priv->tx_poll = i & (ipoib_sendq_size - 1);
+
+	if (unlikely(wc->status != IB_WC_SUCCESS &&
+		     wc->status != IB_WC_WR_FLUSH_ERR))
 
-	if (wc->status != IB_WC_SUCCESS &&
-	    wc->status != IB_WC_WR_FLUSH_ERR)
 		ipoib_warn(priv, "failed send event "
 			   "(status=%d, wrid=%d vend_err %x)\n",
 			   wc->status, wr_id, wc->vendor_err);
 }
 
+void poll_tx(struct ipoib_dev_priv *priv)
+{
+	int n, i;
+
+	while (1) {
+		n = ib_poll_cq(priv->scq, MAX_SEND_CQE, priv->send_wc);
+		for (i = 0; i < n; ++i)
+			ipoib_ib_handle_tx_wc(priv->dev, priv->send_wc + i);
+
+		if (n < MAX_SEND_CQE)
+			break;
+	}
+}
+
 int ipoib_poll(struct net_device *dev, int *budget)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -311,7 +393,7 @@ poll_more:
 	while (max) {
 
 		t = min(IPOIB_NUM_WC, max);
-		n = ib_poll_cq(priv->cq, t, priv->ibwc);
+		n = ib_poll_cq(priv->rcq, t, priv->ibwc);
 
 		for (i = 0; i < n; i++) {
 			struct ib_wc *wc = priv->ibwc + i;
@@ -323,12 +405,8 @@ poll_more:
 					ipoib_cm_handle_rx_wc(dev, wc);
 				else
 					ipoib_ib_handle_rx_wc(dev, wc);
-			} else {
-				if (wc->wr_id & IPOIB_OP_CM)
-					ipoib_cm_handle_tx_wc(dev, wc);
-				else
-					ipoib_ib_handle_tx_wc(dev, wc);
-			}
+			} else
+                                ipoib_cm_handle_tx_wc(priv->dev, wc);
 		}
 
 		if (n != t)
@@ -337,10 +415,10 @@ poll_more:
 
 	if (max) {
 		netif_rx_complete(dev);
-		if (unlikely(ib_req_notify_cq(priv->cq,
+		if (unlikely(ib_req_notify_cq(priv->rcq,
 					      IB_CQ_NEXT_COMP |
 					      IB_CQ_REPORT_MISSED_EVENTS)) &&
-		    netif_rx_reschedule(dev, 0))
+					      netif_rx_reschedule(dev, 0))
 			goto poll_more;
 		ret = 0;
 	} else
@@ -352,29 +430,92 @@ poll_more:
 	return ret;
 }
 
-void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
+void ipoib_ib_rx_completion(struct ib_cq *cq, void *dev_ptr)
 {
 	netif_rx_schedule(dev_ptr);
 }
 
+static inline int post_zlen_send_wr(struct ipoib_dev_priv *priv, unsigned wrid)
+{
+	struct ib_send_wr wr = {
+		.opcode = IB_WR_SEND,
+		.send_flags = IB_SEND_SIGNALED,
+		.wr_id = wrid,
+	};
+	struct ib_send_wr *bad_wr;
+
+	if (!priv->own_ah)
+		return -EBUSY;
+
+	wr.wr.ud.ah = priv->own_ah;
+	wr.wr.ud.remote_qpn = priv->qp->qp_num;
+	wr.wr.ud.remote_qkey = priv->qkey;
+	return ib_post_send(priv->qp, &wr, &bad_wr);
+}
+
+static void ipoib_ib_tx_timer_func(unsigned long dev_ptr)
+{
+	struct net_device *dev = (struct net_device *)dev_ptr;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	unsigned long flags;
+	unsigned int wrid;
+
+	spin_lock_irqsave(&priv->tx_lock, flags);
+	if (((int)priv->tx_tail - (int)priv->tx_head < 0) &&
+		time_after(jiffies, dev->trans_start + 10) &&
+		priv->tx_outstanding < ipoib_sendq_size &&
+		priv->own_ah) {
+		wrid = priv->tx_head & (ipoib_sendq_size - 1);
+		priv->tx_ring[wrid].skb = NULL;
+		if (post_zlen_send_wr(priv, wrid))
+			ipoib_warn(priv, "failed to post zlen send\n");
+		else {
+			++priv->tx_head;
+			++priv->tx_outstanding;
+		}
+	}
+	poll_tx(priv);
+	spin_unlock_irqrestore(&priv->tx_lock, flags);
+
+	mod_timer(&priv->poll_timer, jiffies + HZ / 2);
+}
+
+static void flush_tx_queue(struct ipoib_dev_priv *priv)
+{
+	unsigned long flags;
+	unsigned int wrid;
+
+	spin_lock_irqsave(&priv->tx_lock, flags);
+	wrid = priv->tx_head & (ipoib_sendq_size - 1);
+	priv->tx_ring[wrid].skb = NULL;
+	if (!post_zlen_send_wr(priv, wrid)) {
+		++priv->tx_head;
+		++priv->tx_outstanding;
+	} else
+		ipoib_warn(priv, "post_zlen failed\n");
+
+	poll_tx(priv);
+	spin_unlock_irqrestore(&priv->tx_lock, flags);
+}
+
 static inline int post_send(struct ipoib_dev_priv *priv,
 			    unsigned int wr_id,
 			    struct ib_ah *address, u32 qpn,
-			    u64 *mapping, int headlen,
-			    struct skb_shared_info *shinfo,
-			    void *lso_header)
+			    struct ipoib_tx_buf *tx_req,
+			    void *head, int hlen)
 {
 	struct ib_send_wr *bad_wr;
 	int i, off;
-	skb_frag_t *frags = shinfo->frags;
-	int nr_frags = shinfo->nr_frags;
+	struct sk_buff *skb = tx_req->skb;
+	skb_frag_t *frags = skb_shinfo(skb)->frags;
+	int nr_frags = skb_shinfo(skb)->nr_frags;
+	u64 *mapping = tx_req->mapping;
 
-	if (!lso_header) {
+	if (skb_headlen(skb)) {
 		priv->tx_sge[0].addr         = mapping[0];
-		priv->tx_sge[0].length       = headlen;
+		priv->tx_sge[0].length       = skb_headlen(skb);
 		off = 1;
-	}
-	else
+	} else
 		off = 0;
 
 	for (i = 0; i < nr_frags; ++i) {
@@ -386,14 +527,19 @@ static inline int post_send(struct ipoib_dev_priv *priv,
 	priv->tx_wr.wr.ud.remote_qpn = qpn;
 	priv->tx_wr.wr.ud.ah 	     = address;
 
-	if (lso_header) {
-		priv->tx_wr.wr.ud.mss = shinfo->gso_size;
-		priv->tx_wr.wr.ud.header = lso_header;
-		priv->tx_wr.wr.ud.hlen = headlen;
+	if (head) {
+		priv->tx_wr.wr.ud.mss = skb_shinfo(skb)->gso_size;
+		priv->tx_wr.wr.ud.header = head;
+		priv->tx_wr.wr.ud.hlen = hlen;
 		priv->tx_wr.opcode      = IB_WR_LSO;
 	} else
 		priv->tx_wr.opcode      = IB_WR_SEND;
 
+	if (unlikely((priv->tx_head & (MAX_SEND_CQE - 1)) == MAX_SEND_CQE - 1))
+		priv->tx_wr.send_flags |= IB_SEND_SIGNALED;
+	else
+		priv->tx_wr.send_flags &= ~IB_SEND_SIGNALED;
+
 	return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr);
 }
 
@@ -402,6 +548,8 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_tx_buf *tx_req;
+	int hlen;
+	void *phead;
 
 	if (!skb_is_gso(skb)) {
 		if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) {
@@ -412,17 +560,23 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
 			ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu);
 			return;
 		}
-	}
-	else {
-		if (unlikely((skb_headlen(skb) - IPOIB_ENCAP_LEN) !=
-			     ((ip_hdr(skb)->ihl + tcp_hdr(skb)->doff) << 2))) {
-				ipoib_warn(priv, "headlen (%d) does not match ip (%d)and "
-					   "tcp headers(%d), dropping skb\n",
-					   skb_headlen(skb) - IPOIB_ENCAP_LEN,
-					   ip_hdr(skb)->ihl << 2, tcp_hdr(skb)->doff << 2);
-			++priv->stats.tx_errors;
-			dev_kfree_skb_any(skb);
-			return;
+		phead = 0;
+		hlen = 0;
+	} else {
+		/*
+		 * LSO header is limited to max 60 bytes
+		 */
+		if (unlikely((ip_hdr(skb)->ihl + tcp_hdr(skb)->doff) > 15)) {
+			ipoib_warn(priv, "ip(%d) and tcp(%d) headers too long, dropping skb\n",
+				   ip_hdr(skb)->ihl << 2, tcp_hdr(skb)->doff << 2);
+			goto drop;
+		}
+
+		hlen = ((ip_hdr(skb)->ihl + tcp_hdr(skb)->doff) << 2) + IPOIB_ENCAP_LEN;
+		phead = skb->data;
+		if (unlikely(!skb_pull(skb, hlen))) {
+			ipoib_warn(priv, "linear data too small\n");
+			goto drop;
 		}
 	}
 
@@ -446,18 +600,13 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
 
 	if (priv->ca->flags & IB_DEVICE_IP_CSUM &&
 	    skb->ip_summed == CHECKSUM_PARTIAL)
-		priv->tx_wr.send_flags |=
-			IB_SEND_UDP_TCP_CSUM | IB_SEND_IP_CSUM;
+		priv->tx_wr.send_flags |= IB_SEND_IP_CSUM;
 	else
-		priv->tx_wr.send_flags &=
-			~(IB_SEND_UDP_TCP_CSUM | IB_SEND_IP_CSUM);
-
+		priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
 
 	if (unlikely(post_send(priv, priv->tx_head & (ipoib_sendq_size - 1),
 			       address->ah, qpn,
-			       tx_req->mapping, skb_headlen(skb),
-			       skb_shinfo(skb),
-			       skb_is_gso(skb) ? skb->data : NULL))) {
+			       tx_req, phead, hlen))) {
 		ipoib_warn(priv, "post_send failed\n");
 		++priv->stats.tx_errors;
 		ipoib_dma_unmap_tx(priv->ca, tx_req);
@@ -467,12 +616,24 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
 
 		address->last_send = priv->tx_head;
 		++priv->tx_head;
+		skb_orphan(skb);
 
-		if (++priv->tx_outstanding == ipoib_sendq_size) {
+		if (++priv->tx_outstanding == (ipoib_sendq_size - 1)) {
 			ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
 			netif_stop_queue(dev);
 		}
 	}
+
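+	/* Reap completions inline if too many sends are outstanding. */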
+	if (unlikely(priv->tx_outstanding > MAX_SEND_CQE + 1))
+		poll_tx(priv);
+
+	return;
+
+drop:
+	++priv->stats.tx_errors;
+	dev_kfree_skb_any(skb);
+	return;
 }
 
 static void __ipoib_reap_ah(struct net_device *dev)
@@ -542,6 +703,12 @@ int ipoib_ib_dev_open(struct net_device *dev)
 	queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task,
 			   round_jiffies_relative(HZ));
 
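+	/*
+	 * Arm a periodic timer (ipoib_ib_tx_timer_func) so TX completions
+	 * are still reaped while sends are posted unsignaled.
+	 */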
+	init_timer(&priv->poll_timer);
+	priv->poll_timer.function = ipoib_ib_tx_timer_func;
+	priv->poll_timer.data = (unsigned long)dev;
+	mod_timer(&priv->poll_timer, jiffies + HZ / 2);
+	set_bit(IPOIB_FLAG_TIME_ON, &priv->flags);
+
 	set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
 
 	return 0;
@@ -619,7 +786,7 @@ void ipoib_drain_cq(struct net_device *dev)
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	int i, n;
 	do {
-		n = ib_poll_cq(priv->cq, IPOIB_NUM_WC, priv->ibwc);
+		n = ib_poll_cq(priv->rcq, IPOIB_NUM_WC, priv->ibwc);
 		for (i = 0; i < n; ++i) {
 			/*
 			 * Convert any successful completions to flush
@@ -649,13 +816,20 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush)
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_qp_attr qp_attr;
 	unsigned long begin;
-	struct ipoib_tx_buf *tx_req;
 	int i;
+	unsigned long flags;
+	int timer_works;
+
+	timer_works = test_and_clear_bit(IPOIB_FLAG_TIME_ON, &priv->flags);
+	if (timer_works)
+		del_timer_sync(&priv->poll_timer);
 
 	clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
 	netif_poll_disable(dev);
 
 	ipoib_cm_dev_stop(dev);
+	if (timer_works)
+		flush_tx_queue(priv);
 
 	/*
 	 * Move our QP to the error state and then reinitialize in
@@ -665,6 +839,7 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush)
 	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
 		ipoib_warn(priv, "Failed to modify QP to ERROR state\n");
 
+	clean_pending_receives(priv);
 	/* Wait for all sends and receives to complete */
 	begin = jiffies;
 
@@ -677,25 +852,14 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush)
 			 * assume the HW is wedged and just free up
 			 * all our pending work requests.
 			 */
-			while ((int) priv->tx_tail - (int) priv->tx_head < 0) {
-				tx_req = &priv->tx_ring[priv->tx_tail &
-							(ipoib_sendq_size - 1)];
-				ipoib_dma_unmap_tx(priv->ca, tx_req);
-				dev_kfree_skb_any(tx_req->skb);
-				++priv->tx_tail;
-				--priv->tx_outstanding;
-			}
-
 			for (i = 0; i < ipoib_recvq_size; ++i) {
-				struct ipoib_rx_buf *rx_req;
+				struct ipoib_sg_rx_buf *rx_req;
 
 				rx_req = &priv->rx_ring[i];
 				if (!rx_req->skb)
 					continue;
-				ib_dma_unmap_single(priv->ca,
-						    rx_req->mapping,
-						    IPOIB_BUF_SIZE,
-						    DMA_FROM_DEVICE);
+				ipoib_sg_dma_unmap_rx(priv,
+						      priv->rx_ring[i].mapping);
 				dev_kfree_skb_any(rx_req->skb);
 				rx_req->skb = NULL;
 			}
@@ -703,6 +867,12 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush)
 			goto timeout;
 		}
 
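+		/*
+		 * Reap any TX completions still outstanding while waiting
+		 * for the QP to drain.
+		 */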
+		if ((int) priv->tx_tail - (int) priv->tx_head < 0) {
+			spin_lock_irqsave(&priv->tx_lock, flags);
+			poll_tx(priv);
+			spin_unlock_irqrestore(&priv->tx_lock, flags);
+		}
+
 		ipoib_drain_cq(dev);
 
 		msleep(1);
@@ -711,6 +881,7 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush)
 	ipoib_dbg(priv, "All sends and receives done.\n");
 
 timeout:
+	destroy_own_ah(priv);
 	qp_attr.qp_state = IB_QPS_RESET;
 	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
 		ipoib_warn(priv, "Failed to modify QP to RESET state\n");
@@ -735,7 +906,7 @@ timeout:
 	}
 
 	netif_poll_enable(dev);
-	ib_req_notify_cq(priv->cq, IB_CQ_NEXT_COMP);
+	ib_req_notify_cq(priv->rcq, IB_CQ_NEXT_COMP);
 
 	return 0;
 }
@@ -794,6 +965,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 		if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) {
 			clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
 			ipoib_ib_dev_down(dev, 0);
+			ipoib_ib_dev_stop(dev, 0);
 			ipoib_pkey_dev_delay_open(dev);
 			return;
 		}
@@ -816,6 +988,8 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 		ipoib_ib_dev_open(dev);
 	}
 
+	destroy_own_ah(priv);
+
 	/*
 	 * The device could have been brought down between the start and when
 	 * we get here, don't bring it back up if it's not configured up
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index e37f788..f30e854 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -48,6 +48,7 @@
 #include <linux/in.h>
 
 #include <net/dst.h>
+#include <linux/vmalloc.h>
 
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
@@ -55,14 +56,11 @@ MODULE_LICENSE("Dual BSD/GPL");
 
 int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
 int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;
-static int ipoib_hw_csum __read_mostly = 0;
 
 module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
 MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
 module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
 MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");
-module_param_named(hw_csum, ipoib_hw_csum, int, 0444);
-MODULE_PARM_DESC(hw_csum, "Rely on hardware end-to-end checksum (ICRC) if > 0");
 
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
 int ipoib_debug_level;
@@ -182,7 +180,10 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 
 	/* dev->mtu > 2K ==> connected mode */
-	if (ipoib_cm_admin_enabled(dev) && new_mtu <= IPOIB_CM_MTU) {
+	if (ipoib_cm_admin_enabled(dev)) {
+		if (new_mtu > ipoib_cm_max_mtu(dev))
+			return -EINVAL;
+
 		if (new_mtu > priv->mcast_mtu)
 			ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
 				   priv->mcast_mtu);
@@ -190,9 +191,8 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
 		return 0;
 	}
 
-	if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN) {
+	if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
 		return -EINVAL;
-	}
 
 	priv->admin_mtu = new_mtu;
 
@@ -713,12 +713,7 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
 		neigh = *to_ipoib_neigh(skb->dst->neighbour);
 
-		if (ipoib_cm_get(neigh)) {
-			if (ipoib_cm_up(neigh)) {
-				ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
-				goto out;
-			}
-		} else if (neigh->ah) {
+		if (neigh->ah)
 			if (unlikely((memcmp(&neigh->dgid.raw,
 					    skb->dst->neighbour->ha + 4,
 					    sizeof(union ib_gid))) ||
@@ -739,9 +734,14 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 				goto out;
 			}
 
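+		/*
+		 * The cached dgid is validated above before choosing the
+		 * connected-mode or datagram send path below.
+		 */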
+		if (ipoib_cm_get(neigh)) {
+			if (ipoib_cm_up(neigh)) {
+				ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
+				goto out;
+			}
+		} else if (neigh->ah) {
 			ipoib_send(dev, skb, neigh->ah,
 				   IPOIB_QPN(skb->dst->neighbour->ha));
-
 			goto out;
 		}
 
@@ -815,18 +815,11 @@ static int ipoib_hard_header(struct sk_buff *skb,
 			     void *daddr, void *saddr, unsigned len)
 {
 	struct ipoib_header *header;
-	struct ipoib_dev_priv *priv = netdev_priv(dev);
 
 	header = (struct ipoib_header *) skb_push(skb, sizeof *header);
 
 	header->proto = htons(type);
-        if (!test_bit(IPOIB_FLAG_HW_CSUM, &priv->flags) ||
-	    skb->ip_summed != CHECKSUM_PARTIAL)
-		header->flags = 0;
-	else if (daddr && *((char *)daddr) & IPOIB_FLAGS_HWCSUM)
-		header->flags = cpu_to_be16(IPOIB_HEADER_F_HWCSUM);
-	else
-		skb_checksum_help(skb);
+	header->reserved = 0;
 
 	/*
 	 * If we don't have a neighbour structure, stuff the
@@ -925,42 +918,83 @@ static int ipoib_neigh_setup_dev(struct net_device *dev, struct neigh_parms *par
 	return 0;
 }
 
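+/*
+ * Allocate 'size' bytes as individual pages and map them into one
+ * virtually contiguous buffer with vmap(); large RX/TX rings no longer
+ * require a physically contiguous kzalloc().
+ */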
+int ipoib_vmalloc(struct ipoib_vmap *buf, int size)
+{
+	int	i;
+	int	npages = ALIGN(size, PAGE_SIZE) / PAGE_SIZE;
+	int	ret = -ENOMEM;
+
+	buf->page_arr = kmalloc(npages * sizeof buf->page_arr[0], GFP_KERNEL);
+	if (!buf->page_arr)
+		goto out;
+
+	for (i = 0; i < npages; ++i) {
+		buf->page_arr[i] = alloc_page(GFP_KERNEL);
+		if (!buf->page_arr[i])
+			goto page_fail;
+	}
+
+	buf->npages = npages;
+	buf->ptr = vmap(buf->page_arr, buf->npages, VM_MAP, PAGE_KERNEL);
+	if (!buf->ptr)
+		goto page_fail;
+
+	memset(buf->ptr, 0, size);
+	return 0;
+
+page_fail:
+	for (; i > 0; --i)
+		__free_page(buf->page_arr[i - 1]);
+
+	kfree(buf->page_arr);
+out:
+	return ret;
+}
+
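+/* Undo ipoib_vmalloc(): unmap the buffer and free its pages. */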
+void ipoib_vfree(struct ipoib_vmap *buf)
+{
+	int	i;
+
+	vunmap(buf->ptr);
+	for (i = 0; i < buf->npages; ++i)
+		__free_page(buf->page_arr[i]);
+
+	kfree(buf->page_arr);
+}
+
 int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 
 	/* Allocate RX/TX "rings" to hold queued skbs */
-	priv->rx_ring =	kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
-				GFP_KERNEL);
-	if (!priv->rx_ring) {
+	if (ipoib_vmalloc(&priv->rx_vmap_ring, ipoib_recvq_size *
+			  sizeof *priv->rx_ring)) {
 		printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
 		       ca->name, ipoib_recvq_size);
 		goto out;
 	}
+	priv->rx_ring = priv->rx_vmap_ring.ptr;
 
-	priv->tx_ring = kzalloc(ipoib_sendq_size * sizeof *priv->tx_ring,
-				GFP_KERNEL);
-	if (!priv->tx_ring) {
+	if (ipoib_vmalloc(&priv->tx_vmap_ring, ipoib_sendq_size *
+			  sizeof *priv->tx_ring)) {
 		printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
 		       ca->name, ipoib_sendq_size);
 		goto out_rx_ring_cleanup;
 	}
+	priv->tx_ring = priv->tx_vmap_ring.ptr;
 
 	/* priv->tx_head, tx_tail & tx_outstanding are already 0 */
 
 	if (ipoib_ib_dev_init(dev, ca, port))
 		goto out_tx_ring_cleanup;
 
-	if (ipoib_hw_csum)
-		dev->dev_addr[0] |= IPOIB_FLAGS_HWCSUM;
-
 	return 0;
 
 out_tx_ring_cleanup:
-	kfree(priv->tx_ring);
+	ipoib_vfree(&priv->tx_vmap_ring);
 
 out_rx_ring_cleanup:
-	kfree(priv->rx_ring);
+	ipoib_vfree(&priv->rx_vmap_ring);
 
 out:
 	return -ENOMEM;
@@ -981,8 +1015,8 @@ void ipoib_dev_cleanup(struct net_device *dev)
 
 	ipoib_ib_dev_cleanup(dev);
 
-	kfree(priv->rx_ring);
-	kfree(priv->tx_ring);
+	ipoib_vfree(&priv->rx_vmap_ring);
+	ipoib_vfree(&priv->tx_vmap_ring);
 
 	priv->rx_ring = NULL;
 	priv->tx_ring = NULL;
@@ -1019,14 +1053,6 @@ static void ipoib_setup(struct net_device *dev)
 	dev->type 		 = ARPHRD_INFINIBAND;
 	dev->tx_queue_len 	 = ipoib_sendq_size * 2;
 	dev->features            = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX;
-	if (ipoib_hw_csum) {
-		dev->features   |= NETIF_F_SG | NETIF_F_HW_CSUM;
-		set_bit(IPOIB_FLAG_HW_CSUM, &priv->flags);
-	}
-
-	/* MTU will be reset when mcast join happens */
-	dev->mtu 		 = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN;
-	priv->mcast_mtu 	 = priv->admin_mtu = dev->mtu;
 
 	memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
 
@@ -1065,7 +1091,7 @@ struct ipoib_dev_priv *ipoib_intf_alloc(const char *name)
 	return netdev_priv(dev);
 }
 
-static ssize_t show_pkey(struct device *dev, char *buf)
+static ssize_t show_pkey(struct class_device *dev, char *buf)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
 
@@ -1103,7 +1129,7 @@ int ipoib_add_umcast_attr(struct net_device *dev)
 					&class_device_attr_umcast);
 }
 
-static ssize_t create_child(struct device *dev,
+static ssize_t create_child(struct class_device *dev,
 			    const char *buf, size_t count)
 {
 	int pkey;
@@ -1152,7 +1178,7 @@ int ipoib_add_pkey_attr(struct net_device *dev)
 					&class_device_attr_pkey);
 }
 
-static void set_tx_csum(struct net_device *dev, struct ib_device *hca)
+static void set_csum(struct net_device *dev, struct ib_device *hca)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 
@@ -1162,23 +1188,15 @@ static void set_tx_csum(struct net_device *dev, struct ib_device *hca)
 	if (!(hca->flags & IB_DEVICE_IP_CSUM))
 		return;
 
-	dev->features |= NETIF_F_SG | NETIF_F_IP_CSUM; /* turn on ipv6 too */
-}
-
-static void set_rx_csum(struct net_device *dev, struct ib_device *hca)
-{
-	struct ipoib_dev_priv *priv = netdev_priv(dev);
-
-	if (!(hca->flags & IB_DEVICE_IP_CSUM))
-		return;
-
-	set_bit(IPOIB_FLAG_RX_CSUM, &priv->flags);
+	dev->features |= NETIF_F_SG | NETIF_F_IP_CSUM;
+	set_bit(IPOIB_FLAG_CSUM, &priv->flags);
 }
 
 static struct net_device *ipoib_add_port(const char *format,
 					 struct ib_device *hca, u8 port)
 {
 	struct ipoib_dev_priv *priv;
+	struct ib_port_attr attr;
 	int result = -ENOMEM;
 
 	priv = ipoib_intf_alloc(format);
@@ -1189,6 +1207,18 @@ static struct net_device *ipoib_add_port(const char *format,
 
 	priv->dev->features |= NETIF_F_HIGHDMA;
 
+	if (!ib_query_port(hca, port, &attr))
+		priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
+	else {
+		printk(KERN_WARNING "%s: ib_query_port %d failed\n",
+		       hca->name, port);
+		goto device_init_failed;
+	}
+
+	/* MTU will be reset when mcast join happens */
+	priv->dev->mtu  = IPOIB_UD_MTU(priv->max_ib_mtu);
+	priv->mcast_mtu  = priv->admin_mtu = priv->dev->mtu;
+
 	result = ib_query_pkey(hca, port, 0, &priv->pkey);
 	if (result) {
 		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
@@ -1213,8 +1243,7 @@ static struct net_device *ipoib_add_port(const char *format,
 	} else
 		memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
 
-	set_tx_csum(priv->dev, hca);
-	set_rx_csum(priv->dev, hca);
+	set_csum(priv->dev, hca);
 
 	result = ipoib_dev_init(priv->dev, hca, port);
 	if (result < 0) {
@@ -1233,8 +1262,7 @@ static struct net_device *ipoib_add_port(const char *format,
 		goto event_failed;
 	}
 
-	if (!ipoib_hw_csum && priv->dev->features & NETIF_F_SG &&
-	    priv->ca->flags & IB_DEVICE_TCP_TSO)
+	if (priv->dev->features & NETIF_F_SG && priv->ca->flags & IB_DEVICE_TCP_TSO)
 		priv->dev->features |= NETIF_F_TSO;
 
 
@@ -1348,6 +1376,9 @@ static int __init ipoib_init_module(void)
 	ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
 	ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
 	ipoib_sendq_size = max(ipoib_sendq_size, IPOIB_MIN_QUEUE_SIZE);
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+	ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
+#endif
 
 	ret = ipoib_register_debugfs();
 	if (ret)
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 64a6fcd..1bf45b0 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -492,6 +492,42 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
 	}
 }
 
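+/*
+ * Create an address handle addressed to our own LID on this port and
+ * cache it in priv->own_ah (created once the local LID is known).
+ */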
+static int create_own_ah(struct ipoib_dev_priv *priv)
+{
+	struct ib_ah_attr attr = {
+		.dlid = priv->local_lid,
+		.port_num = priv->port,
+	};
+	struct ib_ah *ah;
+
+	if (priv->own_ah)
+		return 0;
+
+	ah = ib_create_ah(priv->pd, &attr);
+	if (!IS_ERR(ah)) {
+		ipoib_dbg(priv, "created own ah\n");
+		priv->own_ah = ah;
+	}
+
+	return IS_ERR(ah);
+}
+
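+/* Release the cached own_ah, if any, under tx_lock. */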
+void destroy_own_ah(struct ipoib_dev_priv *priv)
+{
+	unsigned long flags;
+
+	if (!priv->own_ah) {
+		ipoib_dbg(priv, "own ah already destroyed\n");
+		return;
+	} else
+		ipoib_dbg(priv, "destroying own ah\n");
+
+	spin_lock_irqsave(&priv->tx_lock, flags);
+	ib_destroy_ah(priv->own_ah);
+	priv->own_ah = NULL;
+	spin_unlock_irqrestore(&priv->tx_lock, flags);
+}
+
 void ipoib_mcast_join_task(struct work_struct *work)
 {
 	struct ipoib_dev_priv *priv =
@@ -509,8 +545,11 @@ void ipoib_mcast_join_task(struct work_struct *work)
 	{
 		struct ib_port_attr attr;
 
-		if (!ib_query_port(priv->ca, priv->port, &attr))
+		if (!ib_query_port(priv->ca, priv->port, &attr)) {
 			priv->local_lid = attr.lid;
+			if (create_own_ah(priv))
+				ipoib_warn(priv, "create own_ah failed\n");
+		}
 		else
 			ipoib_warn(priv, "ib_query_port failed\n");
 	}
@@ -567,8 +606,7 @@ void ipoib_mcast_join_task(struct work_struct *work)
 		return;
 	}
 
-	priv->mcast_mtu = ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu) -
-		IPOIB_ENCAP_LEN;
+	priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
 
 	if (!ipoib_cm_admin_enabled(dev))
 		dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index 1547d38..d3bdbd3 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -34,6 +34,7 @@
  */
 
 #include "ipoib.h"
+#include <linux/ethtool.h>
 
 int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid)
 {
@@ -150,13 +151,15 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
 			.max_send_wr  = ipoib_sendq_size,
 			.max_recv_wr  = ipoib_recvq_size,
 			.max_send_sge = dev->features & NETIF_F_SG ? MAX_SKB_FRAGS + 1 : 1,
-			.max_recv_sge = 1
+			.max_recv_sge = IPOIB_UD_RX_SG
 		},
-		.sq_sig_type = IB_SIGNAL_ALL_WR,
-		.qp_type     = IB_QPT_UD
+		.sq_sig_type = IB_SIGNAL_REQ_WR,
+		.qp_type     = IB_QPT_UD,
+		.create_flags = QP_CREATE_LSO,
 	};
 
 	int i, ret, size;
+	struct ethtool_coalesce *coal;
 
 	priv->pd = ib_alloc_pd(priv->ca);
 	if (IS_ERR(priv->pd)) {
@@ -170,30 +173,42 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
 		goto out_free_pd;
 	}
 
-	size = ipoib_sendq_size + ipoib_recvq_size + 1;
+	size = ipoib_sendq_size + ipoib_recvq_size;
 	ret = ipoib_cm_dev_init(dev);
 	if (!ret)
-		size += ipoib_recvq_size + 1 /* 1 extra for rx_drain_qp */;
+		size += ipoib_recvq_size + 1; /* 1 extra for rx_drain_qp */
 
-	priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, 0);
-	if (IS_ERR(priv->cq)) {
-		printk(KERN_WARNING "%s: failed to create CQ\n", ca->name);
+	priv->rcq = ib_create_cq(priv->ca, ipoib_ib_rx_completion, NULL, dev, size, 0);
+	if (IS_ERR(priv->rcq)) {
+		printk(KERN_WARNING "%s: failed to create receive CQ\n", ca->name);
 		goto out_free_mr;
 	}
 
-	if (ib_modify_cq(priv->cq, 16, 10))
-		printk(KERN_INFO "%s: failed to modify CQ params\n", ca->name);
+	priv->scq = ib_create_cq(priv->ca, NULL, NULL, dev, ipoib_sendq_size, 0);
+	if (IS_ERR(priv->scq)) {
+		printk(KERN_WARNING "%s: failed to create send CQ\n", ca->name);
+		goto out_free_rcq;
+	}
 
-	if (ib_req_notify_cq(priv->cq, IB_CQ_NEXT_COMP))
-		goto out_free_cq;
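+	/*
+	 * Program the default RX interrupt moderation (16 completions or
+	 * 10 usecs) through our ethtool set_coalesce handler, replacing
+	 * the direct ib_modify_cq() call.
+	 */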
+	coal = kzalloc(sizeof *coal, GFP_KERNEL);
+	if (coal) {
+		coal->rx_coalesce_usecs = 10;
+		coal->rx_max_coalesced_frames = 16;
+		dev->ethtool_ops->set_coalesce(dev, coal);
+		kfree(coal);
+	}
 
-	init_attr.send_cq = priv->cq;
-	init_attr.recv_cq = priv->cq;
+	if (ib_req_notify_cq(priv->rcq, IB_CQ_NEXT_COMP))
+		goto out_free_scq;
+
+	init_attr.send_cq = priv->scq;
+	init_attr.recv_cq = priv->rcq;
 
 	priv->qp = ib_create_qp(priv->pd, &init_attr);
 	if (IS_ERR(priv->qp)) {
 		printk(KERN_WARNING "%s: failed to create QP\n", ca->name);
-		goto out_free_cq;
+		goto out_free_rcq;
 	}
 
 	priv->dev->dev_addr[1] = (priv->qp->qp_num >> 16) & 0xff;
@@ -207,10 +222,35 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
 	priv->tx_wr.sg_list 	= priv->tx_sge;
 	priv->tx_wr.send_flags 	= IB_SEND_SIGNALED;
 
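+	/*
+	 * Pre-build a chain of UD_POST_RCV_COUNT receive WRs so RX buffers
+	 * can be reposted in batches with a single ib_post_recv() call.
+	 */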
+	for (i = 0; i < UD_POST_RCV_COUNT; ++i) {
+		priv->sglist_draft[i][0].lkey = priv->mr->lkey;
+		priv->sglist_draft[i][1].lkey = priv->mr->lkey;
+		priv->rx_wr_draft[i].sg_list = &priv->sglist_draft[i][0];
+		if (i < UD_POST_RCV_COUNT - 1)
+			priv->rx_wr_draft[i].next = &priv->rx_wr_draft[i + 1];
+	}
+	priv->rx_wr_draft[UD_POST_RCV_COUNT - 1].next = NULL;
+
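+	/*
+	 * With a 4K IB MTU the receive buffer does not fit in one page:
+	 * use two SG entries (GRH + encap header, then one payload page).
+	 * Otherwise a single entry covers the whole buffer.
+	 */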
+	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+		for (i = 0; i < UD_POST_RCV_COUNT; ++i) {
+			priv->sglist_draft[i][0].length = IPOIB_UD_HEAD_SIZE;
+			priv->sglist_draft[i][1].length = PAGE_SIZE;
+			priv->rx_wr_draft[i].num_sge = IPOIB_UD_RX_SG;
+		}
+	} else {
+		for (i = 0; i < UD_POST_RCV_COUNT; ++i) {
+			priv->sglist_draft[i][0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
+			priv->rx_wr_draft[i].num_sge = 1;
+		}
+	}
+
 	return 0;
 
-out_free_cq:
-	ib_destroy_cq(priv->cq);
+out_free_scq:
+	ib_destroy_cq(priv->scq);
+
+out_free_rcq:
+	ib_destroy_cq(priv->rcq);
 
 out_free_mr:
 	ib_dereg_mr(priv->mr);
@@ -233,7 +273,10 @@ void ipoib_transport_dev_cleanup(struct net_device *dev)
 		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
 	}
 
-	if (ib_destroy_cq(priv->cq))
+	if (ib_destroy_cq(priv->scq))
+		ipoib_warn(priv, "ib_cq_destroy failed\n");
+
+	if (ib_destroy_cq(priv->rcq))
 		ipoib_warn(priv, "ib_cq_destroy failed\n");
 
 	ipoib_cm_dev_cleanup(dev);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
index b7d50a3..fa288e2 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
@@ -90,6 +90,10 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey)
 		goto err;
 	}
 
+	priv->max_ib_mtu = ppriv->max_ib_mtu;
+	/* MTU will be reset when mcast join happens */
+	priv->dev->mtu  = IPOIB_UD_MTU(priv->max_ib_mtu);
+	priv->mcast_mtu  = priv->admin_mtu = priv->dev->mtu;
 	set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);
 
 	priv->pkey = pkey;