Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > 27922b4260f65d317aabda37e42bbbff > files > 1590

kernel-2.6.18-238.el5.src.rpm

From: Mike Christie <mchristi@redhat.com>
Date: Thu, 12 Aug 2010 21:42:28 -0400
Subject: [infiniband] sync iser driver with upstream for RHEL5.6
Message-id: <1281649348-15803-1-git-send-email-mchristi@redhat.com>
Patchwork-id: 27531
O-Subject: [RHEL 5.6 PATCH]: sync iser with upstream
Bugzilla: 623595
RH-Acked-by: Jarod Wilson <jarod@redhat.com>

From: Mike Christie <mchristi@redhat.com>

This is for bz 623595.

>From iser maintainer:

The first set removes some inefficiencies in the iser data path through
simplification and reducing the amount of code, using less atomic
operations, avoiding TX interrupts, moving to iscsi passthrough mode,
etc.
As a result I was able to duplicate the amount of IOPS, upto about 300K
and to reduce the latency measured through iostat AWAIT report.
The upstream submission was done through a sequence of patches
and not as one big re-write, to ease with maintainance and tuning.

The second set fixes bugs in the disconnection logic of iser which
made the failover time with dm-multipathing to be too long, at least 120
seconds, wheres now it can get to be 30 seconds and below that.

With iser under the RHEL5 series using libiscsi2, the upstream patches
were
modified in few places to issue iscsi2_xxx calls instread of iscsi_xxx
ones.

commit 1cef4659850eeb862c248c7670e404d7a1711ed1
    IB/iser: Revert commit bba7ebb "avoid recv buffer exhaustion"
commit bcc60c381d857ced653e912cbe6121294773e147
    IB/iser: New receive buffer posting logic
commit 704315f082d473b34047817f0a6a01924f38501e
    IB/iser: Remove atomic counter for posted receive buffers
commit 78ad0a34dc138047529058c5f2265664cb70a052
    IB/iser: Use different CQ for send completions
commit f19624aa92003969ba822cd3c552800965aa530b
    IB/iser: Simplify send flow/descriptors
commit 528f4e8c8341706a354ff96daf615e678e9b296f
    IB/iser: Use atomic allocations
commit aae3c995ff74a183d15207436d383942485b2edd
    IB/iser: Remove unnecessary connection checks
commit 962b4b528ba87c8d837bb04794a1918c7de631cd
    IB/iser: Use libiscsi passthrough mode
commit 88ec415772144f4fc4a50b123bb6200de686898d
    IB/iser: Remove redundant locking from iser scsi command response
flow

commit 2110f9bf37511df06220bb7e977f417baecf2950
    IB/iser: Add asynchronous event handler
commit d265b9808272c9f25e1c36d3fb5ddb466efd90e9
    IB/iser: Remove buggy back-pointer setting
commit 39ff05dbbbdb082bbabf06206c56b3cd4ef73904
    IB/iser: Enhance disconnection logic for multi-pathing

The iser maintainer at Voltaire made and tested the patch.
I have done some basic IO testing and logout/login and
cable pull tests to stress the disconnect changes.

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index e671927..f97caf5 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -128,6 +128,28 @@ static int iscsi_iser_pdu_alloc(struct iscsi_task *task, uint8_t opcode)
 	return 0;
 }
 
+int iser_initialize_task_headers(struct iscsi_task *task,
+						struct iser_tx_desc *tx_desc)
+{
+	struct iscsi_iser_conn *iser_conn = task->conn->dd_data;
+	struct iser_device     *device    = iser_conn->ib_conn->device;
+	struct iscsi_iser_task *iser_task = task->dd_data;
+	u64 dma_addr;
+
+	dma_addr = ib_dma_map_single(device->ib_device, (void *)tx_desc,
+				ISER_HEADERS_LEN, DMA_TO_DEVICE);
+	if (ib_dma_mapping_error(device->ib_device, dma_addr))
+		return -ENOMEM;
+
+	tx_desc->dma_addr = dma_addr;
+	tx_desc->tx_sg[0].addr   = tx_desc->dma_addr;
+	tx_desc->tx_sg[0].length = ISER_HEADERS_LEN;
+	tx_desc->tx_sg[0].lkey   = device->mr->lkey;
+
+	iser_task->headers_initialized	= 1;
+	iser_task->iser_conn		= iser_conn;
+	return 0;
+}
 /**
  * iscsi_iser_task_init - Initialize task
  * @task: iscsi task
@@ -137,17 +159,17 @@ static int iscsi_iser_pdu_alloc(struct iscsi_task *task, uint8_t opcode)
 static int
 iscsi_iser_task_init(struct iscsi_task *task)
 {
-	struct iscsi_iser_conn *iser_conn  = task->conn->dd_data;
 	struct iscsi_iser_task *iser_task = task->dd_data;
 
+	if (!iser_task->headers_initialized)
+		if (iser_initialize_task_headers(task, &iser_task->desc))
+			return -ENOMEM;
+
 	/* mgmt task */
-	if (!task->sc) {
-		iser_task->desc.data = task->data;
+	if (!task->sc)
 		return 0;
-	}
 
 	iser_task->command_sent = 0;
-	iser_task->iser_conn    = iser_conn;
 	iser_task_rdma_init(iser_task);
 	return 0;
 }
@@ -168,7 +190,7 @@ iscsi_iser_mtask_xmit(struct iscsi_conn *conn, struct iscsi_task *task)
 {
 	int error = 0;
 
-	iser_dbg("task deq [cid %d itt 0x%x]\n", conn->id, task->itt);
+	iser_dbg("mtask xmit [cid %d itt 0x%x]\n", conn->id, task->itt);
 
 	error = iser_send_control(conn, task);
 
@@ -178,9 +200,6 @@ iscsi_iser_mtask_xmit(struct iscsi_conn *conn, struct iscsi_task *task)
 	 * - if yes, the task is recycled at iscsi_complete_pdu
 	 * - if no,  the task is recycled at iser_snd_completion
 	 */
-	if (error && error != -ENOBUFS)
-		iscsi2_conn_failure(conn, ISCSI_ERR_CONN_FAILED);
-
 	return error;
 }
 
@@ -232,7 +251,7 @@ iscsi_iser_task_xmit(struct iscsi_task *task)
 			   task->imm_count, task->unsol_r2t.data_length);
 	}
 
-	iser_dbg("task deq [cid %d itt 0x%x]\n",
+	iser_dbg("ctask xmit [cid %d itt 0x%x]\n",
 		   conn->id, task->itt);
 
 	/* Send the cmd PDU */
@@ -248,8 +267,6 @@ iscsi_iser_task_xmit(struct iscsi_task *task)
 		error = iscsi_iser_task_xmit_unsol_data(conn, task);
 
  iscsi_iser_task_xmit_exit:
-	if (error && error != -ENOBUFS)
-		iscsi2_conn_failure(conn, ISCSI_ERR_CONN_FAILED);
 	return error;
 }
 
@@ -283,7 +300,7 @@ iscsi_iser_conn_create(struct iscsi_cls_session *cls_session, uint32_t conn_idx)
 	 * due to issues with the login code re iser sematics
 	 * this not set in iscsi_conn_setup - FIXME
 	 */
-	conn->max_recv_dlength = 128;
+	conn->max_recv_dlength = ISER_RECV_DATA_SEG_LEN;
 
 	iser_conn = conn->dd_data;
 	conn->dd_data = iser_conn;
@@ -307,7 +324,7 @@ iscsi_iser_conn_destroy(struct iscsi_cls_conn *cls_conn)
 	 */
 	if (ib_conn) {
 		ib_conn->iser_conn = NULL;
-		iser_conn_put(ib_conn);
+		iser_conn_put(ib_conn, 1); /* deref iscsi/ib conn unbinding */
 	}
 }
 
@@ -339,11 +356,12 @@ iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session,
 	/* binds the iSER connection retrieved from the previously
 	 * connected ep_handle to the iSCSI layer connection. exchanges
 	 * connection pointers */
-	iser_err("binding iscsi conn %p to iser_conn %p\n",conn,ib_conn);
+	iser_err("binding iscsi/iser conn %p %p to ib_conn %p\n",
+					conn, conn->dd_data, ib_conn);
 	iser_conn = conn->dd_data;
 	ib_conn->iser_conn = iser_conn;
 	iser_conn->ib_conn  = ib_conn;
-	iser_conn_get(ib_conn);
+	iser_conn_get(ib_conn); /* ref iscsi/ib conn binding */
 	return 0;
 }
 
@@ -364,7 +382,7 @@ iscsi_iser_conn_stop(struct iscsi_cls_conn *cls_conn, int flag)
 		 * There is no unbind event so the stop callback
 		 * must release the ref from the bind.
 		 */
-		iser_conn_put(ib_conn);
+		iser_conn_put(ib_conn, 1); /* deref iscsi/ib conn unbinding */
 	}
 	iser_conn->ib_conn = NULL;
 }
@@ -401,7 +419,7 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,
 	struct Scsi_Host *shost;
 	struct iser_conn *ib_conn;
 
-	shost = iscsi2_host_alloc(&iscsi_iser_sht, 0, 1);
+	shost = iscsi2_host_alloc(&iscsi_iser_sht, 0, 0);
 	if (!shost)
 		return NULL;
 	shost->transportt = iscsi_iser_scsi_transport;
@@ -673,7 +691,7 @@ static int __init iser_init(void)
 	memset(&ig, 0, sizeof(struct iser_global));
 
 	ig.desc_cache = kmem_cache_create("iser_descriptors",
-					  sizeof (struct iser_desc),
+					  sizeof(struct iser_tx_desc),
 					  0, SLAB_HWCACHE_ALIGN,
 					  NULL);
 	if (ig.desc_cache == NULL)
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index 139cde8..7bbc74a 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -102,9 +102,9 @@
 #define ISER_MAX_TX_MISC_PDUS		6 /* NOOP_OUT(2), TEXT(1),         *
 					   * SCSI_TMFUNC(2), LOGOUT(1) */
 
-#define ISER_QP_MAX_RECV_DTOS		(ISCSI_DEF_XMIT_CMDS_MAX + \
-					ISER_MAX_RX_MISC_PDUS    +  \
-					ISER_MAX_TX_MISC_PDUS)
+#define ISER_QP_MAX_RECV_DTOS		(ISCSI_DEF_XMIT_CMDS_MAX)
+
+#define ISER_MIN_POSTED_RX		(ISCSI_DEF_XMIT_CMDS_MAX >> 2)
 
 /* the max TX (send) WR supported by the iSER QP is defined by                 *
  * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect   *
@@ -132,6 +132,12 @@ struct iser_hdr {
 	__be64  read_va;
 } __attribute__((packed));
 
+/* Constant PDU lengths calculations */
+#define ISER_HEADERS_LEN  (sizeof(struct iser_hdr) + sizeof(struct iscsi_hdr))
+
+#define ISER_RECV_DATA_SEG_LEN	128
+#define ISER_RX_PAYLOAD_SIZE	(ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN)
+#define ISER_RX_LOGIN_SIZE	(ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN)
 
 /* Length of an object name string */
 #define ISER_OBJECT_NAME_SIZE		    64
@@ -187,53 +193,46 @@ struct iser_regd_buf {
 	struct iser_mem_reg     reg;        /* memory registration info        */
 	void                    *virt_addr;
 	struct iser_device      *device;    /* device->device for dma_unmap    */
-	u64                     dma_addr;   /* if non zero, addr for dma_unmap */
 	enum dma_data_direction direction;  /* direction for dma_unmap	       */
 	unsigned int            data_size;
-	atomic_t                ref_count;  /* refcount, freed when dec to 0   */
-};
-
-#define MAX_REGD_BUF_VECTOR_LEN	2
-
-struct iser_dto {
-	struct iscsi_iser_task *task;
-	struct iser_conn *ib_conn;
-	int                        notify_enable;
-
-	/* vector of registered buffers */
-	unsigned int               regd_vector_len;
-	struct iser_regd_buf       *regd[MAX_REGD_BUF_VECTOR_LEN];
-
-	/* offset into the registered buffer may be specified */
-	unsigned int               offset[MAX_REGD_BUF_VECTOR_LEN];
-
-	/* a smaller size may be specified, if 0, then full size is used */
-	unsigned int               used_sz[MAX_REGD_BUF_VECTOR_LEN];
 };
 
 enum iser_desc_type {
-	ISCSI_RX,
 	ISCSI_TX_CONTROL ,
 	ISCSI_TX_SCSI_COMMAND,
 	ISCSI_TX_DATAOUT
 };
 
-struct iser_desc {
+struct iser_tx_desc {
 	struct iser_hdr              iser_header;
 	struct iscsi_hdr             iscsi_header;
-	struct iser_regd_buf         hdr_regd_buf;
-	void                         *data;         /* used by RX & TX_CONTROL */
-	struct iser_regd_buf         data_regd_buf; /* used by RX & TX_CONTROL */
 	enum   iser_desc_type        type;
-	struct iser_dto              dto;
+	u64		             dma_addr;
+	/* sg[0] points to iser/iscsi headers, sg[1] optionally points to either
+	of immediate data, unsolicited data-out or control (login,text) */
+	struct ib_sge		     tx_sg[2];
+	int                          num_sge;
 };
 
+#define ISER_RX_PAD_SIZE	(256 - (ISER_RX_PAYLOAD_SIZE + \
+					sizeof(u64) + sizeof(struct ib_sge)))
+struct iser_rx_desc {
+	struct iser_hdr              iser_header;
+	struct iscsi_hdr             iscsi_header;
+	char		             data[ISER_RECV_DATA_SEG_LEN];
+	u64		             dma_addr;
+	struct ib_sge		     rx_sg;
+	char		             pad[ISER_RX_PAD_SIZE];
+} __attribute__((packed));
+
 struct iser_device {
 	struct ib_device             *ib_device;
 	struct ib_pd	             *pd;
-	struct ib_cq	             *cq;
+	struct ib_cq	             *rx_cq;
+	struct ib_cq	             *tx_cq;
 	struct ib_mr	             *mr;
 	struct tasklet_struct	     cq_tasklet;
+	struct ib_event_handler      event_handler;
 	struct list_head             ig_list; /* entry in ig devices list */
 	int                          refcount;
 };
@@ -248,17 +247,19 @@ struct iser_conn {
 	struct rdma_cm_id            *cma_id;       /* CMA ID		       */
 	struct ib_qp	             *qp;           /* QP 		       */
 	struct ib_fmr_pool           *fmr_pool;     /* pool of IB FMRs         */
-	int                          disc_evt_flag; /* disconn event delivered */
 	wait_queue_head_t	     wait;          /* waitq for conn/disconn  */
-	atomic_t                     post_recv_buf_count; /* posted rx count   */
+	int                          post_recv_buf_count; /* posted rx count  */
 	atomic_t                     post_send_buf_count; /* posted tx count   */
-	atomic_t                     unexpected_pdu_count;/* count of received *
-							   * unexpected pdus   *
-							   * not yet retired   */
 	char 			     name[ISER_OBJECT_NAME_SIZE];
 	struct iser_page_vec         *page_vec;     /* represents SG to fmr maps*
 						     * maps serialized as tx is*/
 	struct list_head	     conn_list;       /* entry in ig conn list */
+
+	char  			     *login_buf;
+	u64 			     login_dma;
+	unsigned int 		     rx_desc_head;
+	struct iser_rx_desc	     *rx_descs;
+	struct ib_recv_wr	     rx_wr[ISER_MIN_POSTED_RX];
 };
 
 struct iscsi_iser_conn {
@@ -267,7 +268,7 @@ struct iscsi_iser_conn {
 };
 
 struct iscsi_iser_task {
-	struct iser_desc             desc;
+	struct iser_tx_desc          desc;
 	struct iscsi_iser_conn	     *iser_conn;
 	enum iser_task_status 	     status;
 	int                          command_sent;  /* set if command  sent  */
@@ -275,6 +276,7 @@ struct iscsi_iser_task {
 	struct iser_regd_buf         rdma_regd[ISER_DIRS_NUM];/* regd rdma buf */
 	struct iser_data_buf         data[ISER_DIRS_NUM];     /* orig. data des*/
 	struct iser_data_buf         data_copy[ISER_DIRS_NUM];/* contig. copy  */
+	int                          headers_initialized;
 };
 
 struct iser_page_vec {
@@ -318,26 +320,21 @@ void iser_conn_init(struct iser_conn *ib_conn);
 
 void iser_conn_get(struct iser_conn *ib_conn);
 
-void iser_conn_put(struct iser_conn *ib_conn);
+int iser_conn_put(struct iser_conn *ib_conn, int destroy_cma_id_allowed);
 
 void iser_conn_terminate(struct iser_conn *ib_conn);
 
-void iser_rcv_completion(struct iser_desc *desc,
-			 unsigned long    dto_xfer_len);
+void iser_rcv_completion(struct iser_rx_desc *desc,
+			 unsigned long    dto_xfer_len,
+			struct iser_conn *ib_conn);
 
-void iser_snd_completion(struct iser_desc *desc);
+void iser_snd_completion(struct iser_tx_desc *desc, struct iser_conn *ib_conn);
 
 void iser_task_rdma_init(struct iscsi_iser_task *task);
 
 void iser_task_rdma_finalize(struct iscsi_iser_task *task);
 
-void iser_dto_buffs_release(struct iser_dto *dto);
-
-int  iser_regd_buff_release(struct iser_regd_buf *regd_buf);
-
-void iser_reg_single(struct iser_device      *device,
-		     struct iser_regd_buf    *regd_buf,
-		     enum dma_data_direction direction);
+void iser_free_rx_descriptors(struct iser_conn *ib_conn);
 
 void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *task,
 				     enum iser_data_dir         cmd_dir);
@@ -356,11 +353,9 @@ int  iser_reg_page_vec(struct iser_conn     *ib_conn,
 
 void iser_unreg_mem(struct iser_mem_reg *mem_reg);
 
-int  iser_post_recv(struct iser_desc *rx_desc);
-int  iser_post_send(struct iser_desc *tx_desc);
-
-int iser_conn_state_comp(struct iser_conn *ib_conn,
-			 enum iser_ib_conn_state comp);
+int  iser_post_recvl(struct iser_conn *ib_conn);
+int  iser_post_recvm(struct iser_conn *ib_conn, int count);
+int  iser_post_send(struct iser_conn *ib_conn, struct iser_tx_desc *tx_desc);
 
 int iser_dma_map_task_data(struct iscsi_iser_task *iser_task,
 			    struct iser_data_buf       *data,
@@ -368,4 +363,6 @@ int iser_dma_map_task_data(struct iscsi_iser_task *iser_task,
 			    enum   dma_data_direction  dma_dir);
 
 void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task);
+int  iser_initialize_task_headers(struct iscsi_task *task,
+			struct iser_tx_desc *tx_desc);
 #endif
diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c
index 8ba0c3b..52c84cb 100644
--- a/drivers/infiniband/ulp/iser/iser_initiator.c
+++ b/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -39,29 +39,6 @@
 
 #include "iscsi_iser.h"
 
-/* Constant PDU lengths calculations */
-#define ISER_TOTAL_HEADERS_LEN  (sizeof (struct iser_hdr) + \
-				 sizeof (struct iscsi_hdr))
-
-/* iser_dto_add_regd_buff - increments the reference count for *
- * the registered buffer & adds it to the DTO object           */
-static void iser_dto_add_regd_buff(struct iser_dto *dto,
-				   struct iser_regd_buf *regd_buf,
-				   unsigned long use_offset,
-				   unsigned long use_size)
-{
-	int add_idx;
-
-	atomic_inc(&regd_buf->ref_count);
-
-	add_idx = dto->regd_vector_len;
-	dto->regd[add_idx] = regd_buf;
-	dto->used_sz[add_idx] = use_size;
-	dto->offset[add_idx] = use_offset;
-
-	dto->regd_vector_len++;
-}
-
 /* Register user buffer memory and initialize passive rdma
  *  dto descriptor. Total data size is stored in
  *  iser_task->data[ISER_DIR_IN].data_len
@@ -122,9 +99,9 @@ iser_prepare_write_cmd(struct iscsi_task *task,
 	struct iscsi_iser_task *iser_task = task->dd_data;
 	struct iser_regd_buf *regd_buf;
 	int err;
-	struct iser_dto *send_dto = &iser_task->desc.dto;
 	struct iser_hdr *hdr = &iser_task->desc.iser_header;
 	struct iser_data_buf *buf_out = &iser_task->data[ISER_DIR_OUT];
+	struct ib_sge *tx_dsg = &iser_task->desc.tx_sg[1];
 
 	err = iser_dma_map_task_data(iser_task,
 				     buf_out,
@@ -163,135 +140,100 @@ iser_prepare_write_cmd(struct iscsi_task *task,
 	if (imm_sz > 0) {
 		iser_dbg("Cmd itt:%d, WRITE, adding imm.data sz: %d\n",
 			 task->itt, imm_sz);
-		iser_dto_add_regd_buff(send_dto,
-				       regd_buf,
-				       0,
-				       imm_sz);
+		tx_dsg->addr   = regd_buf->reg.va;
+		tx_dsg->length = imm_sz;
+		tx_dsg->lkey   = regd_buf->reg.lkey;
+		iser_task->desc.num_sge = 2;
 	}
 
 	return 0;
 }
 
-/**
- * iser_post_receive_control - allocates, initializes and posts receive DTO.
- */
-static int iser_post_receive_control(struct iscsi_conn *conn)
+/* creates a new tx descriptor and adds header regd buffer */
+static void iser_create_send_desc(struct iser_conn	*ib_conn,
+				  struct iser_tx_desc	*tx_desc)
 {
-	struct iscsi_iser_conn *iser_conn = conn->dd_data;
-	struct iser_desc     *rx_desc;
-	struct iser_regd_buf *regd_hdr;
-	struct iser_regd_buf *regd_data;
-	struct iser_dto      *recv_dto = NULL;
-	struct iser_device  *device = iser_conn->ib_conn->device;
-	int rx_data_size, err;
-	int posts, outstanding_unexp_pdus;
-
-	/* for the login sequence we must support rx of upto 8K; login is done
-	 * after conn create/bind (connect) and conn stop/bind (reconnect),
-	 * what's common for both schemes is that the connection is not started
-	 */
-	if (conn->c_stage != ISCSI_CONN_STARTED)
-		rx_data_size = ISCSI_DEF_MAX_RECV_SEG_LEN;
-	else /* FIXME till user space sets conn->max_recv_dlength correctly */
-		rx_data_size = 128;
-
-	outstanding_unexp_pdus =
-		atomic_xchg(&iser_conn->ib_conn->unexpected_pdu_count, 0);
-
-	/*
-	 * in addition to the response buffer, replace those consumed by
-	 * unexpected pdus.
-	 */
-	for (posts = 0; posts < 1 + outstanding_unexp_pdus; posts++) {
-		rx_desc = kmem_cache_alloc(ig.desc_cache, GFP_NOIO);
-		if (rx_desc == NULL) {
-			iser_err("Failed to alloc desc for post recv %d\n",
-				 posts);
-			err = -ENOMEM;
-			goto post_rx_cache_alloc_failure;
-		}
-		rx_desc->type = ISCSI_RX;
-		rx_desc->data = kmalloc(rx_data_size, GFP_NOIO);
-		if (rx_desc->data == NULL) {
-			iser_err("Failed to alloc data buf for post recv %d\n",
-				 posts);
-			err = -ENOMEM;
-			goto post_rx_kmalloc_failure;
-		}
-
-		recv_dto = &rx_desc->dto;
-		recv_dto->ib_conn = iser_conn->ib_conn;
-		recv_dto->regd_vector_len = 0;
+	struct iser_device *device = ib_conn->device;
 
-		regd_hdr = &rx_desc->hdr_regd_buf;
-		memset(regd_hdr, 0, sizeof(struct iser_regd_buf));
-		regd_hdr->device  = device;
-		regd_hdr->virt_addr  = rx_desc; /* == &rx_desc->iser_header */
-		regd_hdr->data_size  = ISER_TOTAL_HEADERS_LEN;
+	ib_dma_sync_single_for_cpu(device->ib_device,
+		tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE);
 
-		iser_reg_single(device, regd_hdr, DMA_FROM_DEVICE);
-
-		iser_dto_add_regd_buff(recv_dto, regd_hdr, 0, 0);
+	memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr));
+	tx_desc->iser_header.flags = ISER_VER;
 
-		regd_data = &rx_desc->data_regd_buf;
-		memset(regd_data, 0, sizeof(struct iser_regd_buf));
-		regd_data->device  = device;
-		regd_data->virt_addr  = rx_desc->data;
-		regd_data->data_size  = rx_data_size;
+	tx_desc->num_sge = 1;
 
-		iser_reg_single(device, regd_data, DMA_FROM_DEVICE);
+	if (tx_desc->tx_sg[0].lkey != device->mr->lkey) {
+		tx_desc->tx_sg[0].lkey = device->mr->lkey;
+		iser_dbg("sdesc %p lkey mismatch, fixing\n", tx_desc);
+	}
+}
 
-		iser_dto_add_regd_buff(recv_dto, regd_data, 0, 0);
 
-		err = iser_post_recv(rx_desc);
-		if (err) {
-			iser_err("Failed iser_post_recv for post %d\n", posts);
-			goto post_rx_post_recv_failure;
-		}
+int iser_alloc_rx_descriptors(struct iser_conn *ib_conn)
+{
+	int i, j;
+	u64 dma_addr;
+	struct iser_rx_desc *rx_desc;
+	struct ib_sge       *rx_sg;
+	struct iser_device  *device = ib_conn->device;
+
+	ib_conn->rx_descs = kmalloc(ISER_QP_MAX_RECV_DTOS *
+				sizeof(struct iser_rx_desc), GFP_KERNEL);
+	if (!ib_conn->rx_descs)
+		goto rx_desc_alloc_fail;
+
+	rx_desc = ib_conn->rx_descs;
+
+	for (i = 0; i < ISER_QP_MAX_RECV_DTOS; i++, rx_desc++)  {
+		dma_addr = ib_dma_map_single(device->ib_device, (void *)rx_desc,
+					ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+		if (ib_dma_mapping_error(device->ib_device, dma_addr))
+			goto rx_desc_dma_map_failed;
+
+		rx_desc->dma_addr = dma_addr;
+
+		rx_sg = &rx_desc->rx_sg;
+		rx_sg->addr   = rx_desc->dma_addr;
+		rx_sg->length = ISER_RX_PAYLOAD_SIZE;
+		rx_sg->lkey   = device->mr->lkey;
 	}
-	/* all posts successful */
-	return 0;
 
-post_rx_post_recv_failure:
-	iser_dto_buffs_release(recv_dto);
-	kfree(rx_desc->data);
-post_rx_kmalloc_failure:
-	kmem_cache_free(ig.desc_cache, rx_desc);
-post_rx_cache_alloc_failure:
-	if (posts > 0) {
-		/*
-		 * response buffer posted, but did not replace all unexpected
-		 * pdu recv bufs. Ignore error, retry occurs next send
-		 */
-		outstanding_unexp_pdus -= (posts - 1);
-		err = 0;
-	}
-	atomic_add(outstanding_unexp_pdus,
-		   &iser_conn->ib_conn->unexpected_pdu_count);
+	ib_conn->rx_desc_head = 0;
+	return 0;
 
-	return err;
+rx_desc_dma_map_failed:
+	rx_desc = ib_conn->rx_descs;
+	for (j = 0; j < i; j++, rx_desc++)
+		ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr,
+			ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+	kfree(ib_conn->rx_descs);
+	ib_conn->rx_descs = NULL;
+rx_desc_alloc_fail:
+	iser_err("failed allocating rx descriptors / data buffers\n");
+	return -ENOMEM;
 }
 
-/* creates a new tx descriptor and adds header regd buffer */
-static void iser_create_send_desc(struct iscsi_iser_conn *iser_conn,
-				  struct iser_desc       *tx_desc)
+void iser_free_rx_descriptors(struct iser_conn *ib_conn)
 {
-	struct iser_regd_buf *regd_hdr = &tx_desc->hdr_regd_buf;
-	struct iser_dto      *send_dto = &tx_desc->dto;
+	int i;
+	struct iser_rx_desc *rx_desc;
+	struct iser_device *device = ib_conn->device;
 
-	memset(regd_hdr, 0, sizeof(struct iser_regd_buf));
-	regd_hdr->device  = iser_conn->ib_conn->device;
-	regd_hdr->virt_addr  = tx_desc; /* == &tx_desc->iser_header */
-	regd_hdr->data_size  = ISER_TOTAL_HEADERS_LEN;
+	if (ib_conn->login_buf) {
+		ib_dma_unmap_single(device->ib_device, ib_conn->login_dma,
+			ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE);
+		kfree(ib_conn->login_buf);
+	}
 
-	send_dto->ib_conn         = iser_conn->ib_conn;
-	send_dto->notify_enable   = 1;
-	send_dto->regd_vector_len = 0;
+	if (!ib_conn->rx_descs)
+		return;
 
-	memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr));
-	tx_desc->iser_header.flags = ISER_VER;
-
-	iser_dto_add_regd_buff(send_dto, regd_hdr, 0, 0);
+	rx_desc = ib_conn->rx_descs;
+	for (i = 0; i < ISER_QP_MAX_RECV_DTOS; i++, rx_desc++)
+		ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr,
+			ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+	kfree(ib_conn->rx_descs);
 }
 
 /**
@@ -301,46 +243,23 @@ int iser_conn_set_full_featured_mode(struct iscsi_conn *conn)
 {
 	struct iscsi_iser_conn *iser_conn = conn->dd_data;
 
-	int i;
-	/*
-	 * FIXME this value should be declared to the target during login with
-	 * the MaxOutstandingUnexpectedPDUs key when supported
-	 */
-	int initial_post_recv_bufs_num = ISER_MAX_RX_MISC_PDUS;
-
-	iser_dbg("Initially post: %d\n", initial_post_recv_bufs_num);
+	iser_dbg("Initially post: %d\n", ISER_MIN_POSTED_RX);
 
 	/* Check that there is no posted recv or send buffers left - */
 	/* they must be consumed during the login phase */
-	BUG_ON(atomic_read(&iser_conn->ib_conn->post_recv_buf_count) != 0);
+	BUG_ON(iser_conn->ib_conn->post_recv_buf_count != 0);
 	BUG_ON(atomic_read(&iser_conn->ib_conn->post_send_buf_count) != 0);
 
-	/* Initial post receive buffers */
-	for (i = 0; i < initial_post_recv_bufs_num; i++) {
-		if (iser_post_receive_control(conn) != 0) {
-			iser_err("Failed to post recv bufs at:%d conn:0x%p\n",
-				 i, conn);
-			return -ENOMEM;
-		}
-	}
-	iser_dbg("Posted %d post recv bufs, conn:0x%p\n", i, conn);
-	return 0;
-}
+	if (iser_alloc_rx_descriptors(iser_conn->ib_conn))
+		return -ENOMEM;
 
-static int
-iser_check_xmit(struct iscsi_conn *conn, void *task)
-{
-	struct iscsi_iser_conn *iser_conn = conn->dd_data;
+	/* Initial post receive buffers */
+	if (iser_post_recvm(iser_conn->ib_conn, ISER_MIN_POSTED_RX))
+		return -ENOMEM;
 
-	if (atomic_read(&iser_conn->ib_conn->post_send_buf_count) ==
-	    ISER_QP_MAX_REQ_DTOS) {
-		iser_dbg("%ld can't xmit task %p\n",jiffies,task);
-		return -ENOBUFS;
-	}
 	return 0;
 }
 
-
 /**
  * iser_send_command - send command PDU
  */
@@ -349,27 +268,18 @@ int iser_send_command(struct iscsi_conn *conn,
 {
 	struct iscsi_iser_conn *iser_conn = conn->dd_data;
 	struct iscsi_iser_task *iser_task = task->dd_data;
-	struct iser_dto *send_dto = NULL;
 	unsigned long edtl;
-	int err = 0;
+	int err;
 	struct iser_data_buf *data_buf;
 	struct iscsi_cmd *hdr =  (struct iscsi_cmd *)task->hdr;
 	struct scsi_cmnd *sc  =  task->sc;
-
-	if (!iser_conn_state_comp(iser_conn->ib_conn, ISER_CONN_UP)) {
-		iser_err("Failed to send, conn: 0x%p is not up\n", iser_conn->ib_conn);
-		return -EPERM;
-	}
-	if (iser_check_xmit(conn, task))
-		return -ENOBUFS;
+	struct iser_tx_desc *tx_desc = &iser_task->desc;
 
 	edtl = ntohl(hdr->data_length);
 
 	/* build the tx desc regd header and add it to the tx desc dto */
-	iser_task->desc.type = ISCSI_TX_SCSI_COMMAND;
-	send_dto = &iser_task->desc.dto;
-	send_dto->task = iser_task;
-	iser_create_send_desc(iser_conn, &iser_task->desc);
+	tx_desc->type = ISCSI_TX_SCSI_COMMAND;
+	iser_create_send_desc(iser_conn->ib_conn, tx_desc);
 
 	if (hdr->flags & ISCSI_FLAG_CMD_READ)
 		data_buf = &iser_task->data[ISER_DIR_IN];
@@ -398,23 +308,13 @@ int iser_send_command(struct iscsi_conn *conn,
 			goto send_command_error;
 	}
 
-	iser_reg_single(iser_conn->ib_conn->device,
-			send_dto->regd[0], DMA_TO_DEVICE);
-
-	if (iser_post_receive_control(conn) != 0) {
-		iser_err("post_recv failed!\n");
-		err = -ENOMEM;
-		goto send_command_error;
-	}
-
 	iser_task->status = ISER_TASK_STATUS_STARTED;
 
-	err = iser_post_send(&iser_task->desc);
+	err = iser_post_send(iser_conn->ib_conn, tx_desc);
 	if (!err)
 		return 0;
 
 send_command_error:
-	iser_dto_buffs_release(send_dto);
 	iser_err("conn %p failed task->itt %d err %d\n",conn, task->itt, err);
 	return err;
 }
@@ -428,20 +328,13 @@ int iser_send_data_out(struct iscsi_conn *conn,
 {
 	struct iscsi_iser_conn *iser_conn = conn->dd_data;
 	struct iscsi_iser_task *iser_task = task->dd_data;
-	struct iser_desc *tx_desc = NULL;
-	struct iser_dto *send_dto = NULL;
+	struct iser_tx_desc *tx_desc = NULL;
+	struct iser_regd_buf *regd_buf;
 	unsigned long buf_offset;
 	unsigned long data_seg_len;
 	uint32_t itt;
 	int err = 0;
-
-	if (!iser_conn_state_comp(iser_conn->ib_conn, ISER_CONN_UP)) {
-		iser_err("Failed to send, conn: 0x%p is not up\n", iser_conn->ib_conn);
-		return -EPERM;
-	}
-
-	if (iser_check_xmit(conn, task))
-		return -ENOBUFS;
+	struct ib_sge *tx_dsg;
 
 	itt = (__force uint32_t)hdr->itt;
 	data_seg_len = ntoh24(hdr->dlength);
@@ -450,28 +343,25 @@ int iser_send_data_out(struct iscsi_conn *conn,
 	iser_dbg("%s itt %d dseg_len %d offset %d\n",
 		 __func__,(int)itt,(int)data_seg_len,(int)buf_offset);
 
-	tx_desc = kmem_cache_alloc(ig.desc_cache, GFP_NOIO);
+	tx_desc = kmem_cache_zalloc(ig.desc_cache, GFP_ATOMIC);
 	if (tx_desc == NULL) {
 		iser_err("Failed to alloc desc for post dataout\n");
 		return -ENOMEM;
 	}
 
 	tx_desc->type = ISCSI_TX_DATAOUT;
+	tx_desc->iser_header.flags = ISER_VER;
 	memcpy(&tx_desc->iscsi_header, hdr, sizeof(struct iscsi_hdr));
 
-	/* build the tx desc regd header and add it to the tx desc dto */
-	send_dto = &tx_desc->dto;
-	send_dto->task = iser_task;
-	iser_create_send_desc(iser_conn, tx_desc);
-
-	iser_reg_single(iser_conn->ib_conn->device,
-			send_dto->regd[0], DMA_TO_DEVICE);
+	/* build the tx desc */
+	iser_initialize_task_headers(task, tx_desc);
 
-	/* all data was registered for RDMA, we can use the lkey */
-	iser_dto_add_regd_buff(send_dto,
-			       &iser_task->rdma_regd[ISER_DIR_OUT],
-			       buf_offset,
-			       data_seg_len);
+	regd_buf = &iser_task->rdma_regd[ISER_DIR_OUT];
+	tx_dsg = &tx_desc->tx_sg[1];
+	tx_dsg->addr    = regd_buf->reg.va + buf_offset;
+	tx_dsg->length  = data_seg_len;
+	tx_dsg->lkey    = regd_buf->reg.lkey;
+	tx_desc->num_sge = 2;
 
 	if (buf_offset + data_seg_len > iser_task->data[ISER_DIR_OUT].data_len) {
 		iser_err("Offset:%ld & DSL:%ld in Data-Out "
@@ -485,12 +375,11 @@ int iser_send_data_out(struct iscsi_conn *conn,
 		 itt, buf_offset, data_seg_len);
 
 
-	err = iser_post_send(tx_desc);
+	err = iser_post_send(iser_conn->ib_conn, tx_desc);
 	if (!err)
 		return 0;
 
 send_data_out_error:
-	iser_dto_buffs_release(send_dto);
 	kmem_cache_free(ig.desc_cache, tx_desc);
 	iser_err("conn %p failed err %d\n",conn, err);
 	return err;
@@ -501,64 +390,44 @@ int iser_send_control(struct iscsi_conn *conn,
 {
 	struct iscsi_iser_conn *iser_conn = conn->dd_data;
 	struct iscsi_iser_task *iser_task = task->dd_data;
-	struct iser_desc *mdesc = &iser_task->desc;
-	struct iser_dto *send_dto = NULL;
+	struct iser_tx_desc *mdesc = &iser_task->desc;
 	unsigned long data_seg_len;
 	int err = 0;
-	struct iser_regd_buf *regd_buf;
 	struct iser_device *device;
-	unsigned char opcode;
-
-	if (!iser_conn_state_comp(iser_conn->ib_conn, ISER_CONN_UP)) {
-		iser_err("Failed to send, conn: 0x%p is not up\n", iser_conn->ib_conn);
-		return -EPERM;
-	}
-
-	if (iser_check_xmit(conn, task))
-		return -ENOBUFS;
 
 	/* build the tx desc regd header and add it to the tx desc dto */
 	mdesc->type = ISCSI_TX_CONTROL;
-	send_dto = &mdesc->dto;
-	send_dto->task = NULL;
-	iser_create_send_desc(iser_conn, mdesc);
+	iser_create_send_desc(iser_conn->ib_conn, mdesc);
 
 	device = iser_conn->ib_conn->device;
 
-	iser_reg_single(device, send_dto->regd[0], DMA_TO_DEVICE);
-
 	data_seg_len = ntoh24(task->hdr->dlength);
 
 	if (data_seg_len > 0) {
-		regd_buf = &mdesc->data_regd_buf;
-		memset(regd_buf, 0, sizeof(struct iser_regd_buf));
-		regd_buf->device = device;
-		regd_buf->virt_addr = task->data;
-		regd_buf->data_size = task->data_count;
-		iser_reg_single(device, regd_buf,
-				DMA_TO_DEVICE);
-		iser_dto_add_regd_buff(send_dto, regd_buf,
-				       0,
-				       data_seg_len);
+		struct ib_sge *tx_dsg = &mdesc->tx_sg[1];
+		if (task != conn->login_task) {
+			iser_err("data present on non login task!!!\n");
+			goto send_control_error;
+		}
+		memcpy(iser_conn->ib_conn->login_buf, task->data,
+							task->data_count);
+		tx_dsg->addr    = iser_conn->ib_conn->login_dma;
+		tx_dsg->length  = data_seg_len;
+		tx_dsg->lkey    = device->mr->lkey;
+		mdesc->num_sge = 2;
 	}
 
-	opcode = task->hdr->opcode & ISCSI_OPCODE_MASK;
-
-	/* post recv buffer for response if one is expected */
-	if (!(opcode == ISCSI_OP_NOOP_OUT && task->hdr->itt == RESERVED_ITT)) {
-		if (iser_post_receive_control(conn) != 0) {
-			iser_err("post_rcv_buff failed!\n");
-			err = -ENOMEM;
+	if (task == conn->login_task) {
+		err = iser_post_recvl(iser_conn->ib_conn);
+		if (err)
 			goto send_control_error;
-		}
 	}
 
-	err = iser_post_send(mdesc);
+	err = iser_post_send(iser_conn->ib_conn, mdesc);
 	if (!err)
 		return 0;
 
 send_control_error:
-	iser_dto_buffs_release(send_dto);
 	iser_err("conn %p failed err %d\n",conn, err);
 	return err;
 }
@@ -566,104 +435,71 @@ send_control_error:
 /**
  * iser_rcv_dto_completion - recv DTO completion
  */
-void iser_rcv_completion(struct iser_desc *rx_desc,
-			 unsigned long dto_xfer_len)
+void iser_rcv_completion(struct iser_rx_desc *rx_desc,
+			 unsigned long rx_xfer_len,
+			 struct iser_conn *ib_conn)
 {
-	struct iser_dto *dto = &rx_desc->dto;
-	struct iscsi_iser_conn *conn = dto->ib_conn->iser_conn;
-	struct iscsi_task *task;
-	struct iscsi_iser_task *iser_task;
+	struct iscsi_iser_conn *conn = ib_conn->iser_conn;
 	struct iscsi_hdr *hdr;
-	char   *rx_data = NULL;
-	int     rx_data_len = 0;
-	unsigned char opcode;
-
-	hdr = &rx_desc->iscsi_header;
+	u64 rx_dma;
+	int rx_buflen, outstanding, count, err;
+
+	/* differentiate between login to all other PDUs */
+	if ((char *)rx_desc == ib_conn->login_buf) {
+		rx_dma = ib_conn->login_dma;
+		rx_buflen = ISER_RX_LOGIN_SIZE;
+	} else {
+		rx_dma = rx_desc->dma_addr;
+		rx_buflen = ISER_RX_PAYLOAD_SIZE;
+	}
 
-	iser_dbg("op 0x%x itt 0x%x\n", hdr->opcode,hdr->itt);
+	ib_dma_sync_single_for_cpu(ib_conn->device->ib_device, rx_dma,
+			rx_buflen, DMA_FROM_DEVICE);
 
-	if (dto_xfer_len > ISER_TOTAL_HEADERS_LEN) { /* we have data */
-		rx_data_len = dto_xfer_len - ISER_TOTAL_HEADERS_LEN;
-		rx_data     = dto->regd[1]->virt_addr;
-		rx_data    += dto->offset[1];
-	}
+	hdr = &rx_desc->iscsi_header;
 
-	opcode = hdr->opcode & ISCSI_OPCODE_MASK;
-
-	if (opcode == ISCSI_OP_SCSI_CMD_RSP) {
-		spin_lock(&conn->iscsi_conn->session->lock);
-		task = iscsi2_itt_to_ctask(conn->iscsi_conn, hdr->itt);
-		if (task)
-			__iscsi2_get_task(task);
-		spin_unlock(&conn->iscsi_conn->session->lock);
-
-		if (!task)
-			iser_err("itt can't be matched to task!!! "
-				 "conn %p opcode %d itt %d\n",
-				 conn->iscsi_conn, opcode, hdr->itt);
-		else {
-			iser_task = task->dd_data;
-			iser_dbg("itt %d task %p\n",hdr->itt, task);
-			iser_task->status = ISER_TASK_STATUS_COMPLETED;
-			iser_task_rdma_finalize(iser_task);
-			iscsi2_put_task(task);
-		}
-	}
-	iser_dto_buffs_release(dto);
+	iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode,
+			hdr->itt, (int)(rx_xfer_len - ISER_HEADERS_LEN));
 
-	iscsi_iser_recv(conn->iscsi_conn, hdr, rx_data, rx_data_len);
+	iscsi_iser_recv(conn->iscsi_conn, hdr,
+		rx_desc->data, rx_xfer_len - ISER_HEADERS_LEN);
 
-	kfree(rx_desc->data);
-	kmem_cache_free(ig.desc_cache, rx_desc);
+	ib_dma_sync_single_for_device(ib_conn->device->ib_device, rx_dma,
+			rx_buflen, DMA_FROM_DEVICE);
 
 	/* decrementing conn->post_recv_buf_count only --after-- freeing the   *
 	 * task eliminates the need to worry on tasks which are completed in   *
 	 * parallel to the execution of iser_conn_term. So the code that waits *
 	 * for the posted rx bufs refcount to become zero handles everything   */
-	atomic_dec(&conn->ib_conn->post_recv_buf_count);
+	conn->ib_conn->post_recv_buf_count--;
 
-	/*
-	 * if an unexpected PDU was received then the recv wr consumed must
-	 * be replaced, this is done in the next send of a control-type PDU
-	 */
-	if (opcode == ISCSI_OP_NOOP_IN && hdr->itt == RESERVED_ITT) {
-		/* nop-in with itt = 0xffffffff */
-		atomic_inc(&conn->ib_conn->unexpected_pdu_count);
-	}
-	else if (opcode == ISCSI_OP_ASYNC_EVENT) {
-		/* asyncronous message */
-		atomic_inc(&conn->ib_conn->unexpected_pdu_count);
+	if (rx_dma == ib_conn->login_dma)
+		return;
+
+	outstanding = ib_conn->post_recv_buf_count;
+	if (outstanding + ISER_MIN_POSTED_RX <= ISER_QP_MAX_RECV_DTOS) {
+		count = min(ISER_QP_MAX_RECV_DTOS - outstanding,
+						ISER_MIN_POSTED_RX);
+		err = iser_post_recvm(ib_conn, count);
+		if (err)
+			iser_err("posting %d rx bufs err %d\n", count, err);
 	}
-	/* a reject PDU consumes the recv buf posted for the response */
 }
 
-void iser_snd_completion(struct iser_desc *tx_desc)
+void iser_snd_completion(struct iser_tx_desc *tx_desc,
+			struct iser_conn *ib_conn)
 {
-	struct iser_dto        *dto = &tx_desc->dto;
-	struct iser_conn       *ib_conn = dto->ib_conn;
-	struct iscsi_iser_conn *iser_conn = ib_conn->iser_conn;
-	struct iscsi_conn      *conn = iser_conn->iscsi_conn;
 	struct iscsi_task *task;
-	int resume_tx = 0;
-
-	iser_dbg("Initiator, Data sent dto=0x%p\n", dto);
-
-	iser_dto_buffs_release(dto);
+	struct iser_device *device = ib_conn->device;
 
-	if (tx_desc->type == ISCSI_TX_DATAOUT)
+	if (tx_desc->type == ISCSI_TX_DATAOUT) {
+		ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr,
+					ISER_HEADERS_LEN, DMA_TO_DEVICE);
 		kmem_cache_free(ig.desc_cache, tx_desc);
-
-	if (atomic_read(&iser_conn->ib_conn->post_send_buf_count) ==
-	    ISER_QP_MAX_REQ_DTOS)
-		resume_tx = 1;
+	}
 
 	atomic_dec(&ib_conn->post_send_buf_count);
 
-	if (resume_tx) {
-		iser_dbg("%ld resuming tx\n",jiffies);
-		iscsi_conn_queue_work(conn);
-	}
-
 	if (tx_desc->type == ISCSI_TX_CONTROL) {
 		/* this arithmetic is legal by libiscsi dd_data allocation */
 		task = (void *) ((long)(void *)tx_desc -
@@ -692,7 +528,6 @@ void iser_task_rdma_init(struct iscsi_iser_task *iser_task)
 
 void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task)
 {
-	int deferred;
 	int is_rdma_aligned = 1;
 	struct iser_regd_buf *regd;
 
@@ -710,32 +545,17 @@ void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task)
 
 	if (iser_task->dir[ISER_DIR_IN]) {
 		regd = &iser_task->rdma_regd[ISER_DIR_IN];
-		deferred = iser_regd_buff_release(regd);
-		if (deferred) {
-			iser_err("%d references remain for BUF-IN rdma reg\n",
-				 atomic_read(&regd->ref_count));
-		}
+		if (regd->reg.is_fmr)
+			iser_unreg_mem(&regd->reg);
 	}
 
 	if (iser_task->dir[ISER_DIR_OUT]) {
 		regd = &iser_task->rdma_regd[ISER_DIR_OUT];
-		deferred = iser_regd_buff_release(regd);
-		if (deferred) {
-			iser_err("%d references remain for BUF-OUT rdma reg\n",
-				 atomic_read(&regd->ref_count));
-		}
+		if (regd->reg.is_fmr)
+			iser_unreg_mem(&regd->reg);
 	}
 
        /* if the data was unaligned, it was already unmapped and then copied */
        if (is_rdma_aligned)
 		iser_dma_unmap_task_data(iser_task);
 }
-
-void iser_dto_buffs_release(struct iser_dto *dto)
-{
-	int i;
-
-	for (i = 0; i < dto->regd_vector_len; i++)
-		iser_regd_buff_release(dto->regd[i]);
-}
-
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index 274c883..fb88d68 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -41,62 +41,6 @@
 #define ISER_KMALLOC_THRESHOLD 0x20000 /* 128K - kmalloc limit */
 
 /**
- * Decrements the reference count for the
- * registered buffer & releases it
- *
- * returns 0 if released, 1 if deferred
- */
-int iser_regd_buff_release(struct iser_regd_buf *regd_buf)
-{
-	struct ib_device *dev;
-
-	if ((atomic_read(&regd_buf->ref_count) == 0) ||
-	    atomic_dec_and_test(&regd_buf->ref_count)) {
-		/* if we used the dma mr, unreg is just NOP */
-		if (regd_buf->reg.is_fmr)
-			iser_unreg_mem(&regd_buf->reg);
-
-		if (regd_buf->dma_addr) {
-			dev = regd_buf->device->ib_device;
-			ib_dma_unmap_single(dev,
-					 regd_buf->dma_addr,
-					 regd_buf->data_size,
-					 regd_buf->direction);
-		}
-		/* else this regd buf is associated with task which we */
-		/* dma_unmap_single/sg later */
-		return 0;
-	} else {
-		iser_dbg("Release deferred, regd.buff: 0x%p\n", regd_buf);
-		return 1;
-	}
-}
-
-/**
- * iser_reg_single - fills registered buffer descriptor with
- *		     registration information
- */
-void iser_reg_single(struct iser_device *device,
-		     struct iser_regd_buf *regd_buf,
-		     enum dma_data_direction direction)
-{
-	u64 dma_addr;
-
-	dma_addr = ib_dma_map_single(device->ib_device,
-				     regd_buf->virt_addr,
-				     regd_buf->data_size, direction);
-	BUG_ON(ib_dma_mapping_error(device->ib_device, dma_addr));
-
-	regd_buf->reg.lkey = device->mr->lkey;
-	regd_buf->reg.len  = regd_buf->data_size;
-	regd_buf->reg.va   = dma_addr;
-	regd_buf->reg.is_fmr = 0;
-
-	regd_buf->dma_addr  = dma_addr;
-	regd_buf->direction = direction;
-}
-
-/**
  * iser_start_rdma_unaligned_sg
  */
 static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
@@ -109,10 +53,10 @@ static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
 	unsigned long  cmd_data_len = data->data_len;
 
 	if (cmd_data_len > ISER_KMALLOC_THRESHOLD)
-		mem = (void *)__get_free_pages(GFP_NOIO,
+		mem = (void *)__get_free_pages(GFP_ATOMIC,
 		      ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT);
 	else
-		mem = kmalloc(cmd_data_len, GFP_NOIO);
+		mem = kmalloc(cmd_data_len, GFP_ATOMIC);
 
 	if (mem == NULL) {
 		iser_err("Failed to allocate mem size %d %d for copying sglist\n",
@@ -474,9 +418,5 @@ int iser_reg_rdma_mem(struct iscsi_iser_task *iser_task,
 			return err;
 		}
 	}
-
-	/* take a reference on this regd buf such that it will not be released *
-	 * (eg in send dto completion) before we get the scsi response         */
-	atomic_inc(&regd_buf->ref_count);
 	return 0;
 }
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index 8bf97ee..87fb067 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -37,9 +37,8 @@
 #include "iscsi_iser.h"
 
 #define ISCSI_ISER_MAX_CONN	8
-#define ISER_MAX_CQ_LEN		((ISER_QP_MAX_RECV_DTOS + \
-				ISER_QP_MAX_REQ_DTOS) *   \
-				 ISCSI_ISER_MAX_CONN)
+#define ISER_MAX_RX_CQ_LEN	(ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN)
+#define ISER_MAX_TX_CQ_LEN	(ISER_QP_MAX_REQ_DTOS  * ISCSI_ISER_MAX_CONN)
 
 static void iser_cq_tasklet_fn(unsigned long data);
 static void iser_cq_callback(struct ib_cq *cq, void *cq_context);
@@ -54,6 +53,13 @@ static void iser_qp_event_callback(struct ib_event *cause, void *context)
 	iser_err("got qp event %d\n",cause->event);
 }
 
+static void iser_event_handler(struct ib_event_handler *handler,
+				struct ib_event *event)
+{
+	iser_err("async event %d on device %s port %d\n", event->event,
+		event->device->name, event->element.port_num);
+}
+
 /**
  * iser_create_device_ib_res - creates Protection Domain (PD), Completion
  * Queue (CQ), DMA Memory Region (DMA MR) with the device associated with
@@ -67,15 +73,23 @@ static int iser_create_device_ib_res(struct iser_device *device)
 	if (IS_ERR(device->pd))
 		goto pd_err;
 
-	device->cq = ib_create_cq(device->ib_device,
+	device->rx_cq = ib_create_cq(device->ib_device,
 				  iser_cq_callback,
 				  iser_cq_event_callback,
 				  (void *)device,
-				  ISER_MAX_CQ_LEN, 0);
-	if (IS_ERR(device->cq))
-		goto cq_err;
+				  ISER_MAX_RX_CQ_LEN, 0);
+	if (IS_ERR(device->rx_cq))
+		goto rx_cq_err;
+
+	device->tx_cq = ib_create_cq(device->ib_device,
+				  NULL, iser_cq_event_callback,
+				  (void *)device,
+				  ISER_MAX_TX_CQ_LEN, 0);
 
-	if (ib_req_notify_cq(device->cq, IB_CQ_NEXT_COMP))
+	if (IS_ERR(device->tx_cq))
+		goto tx_cq_err;
+
+	if (ib_req_notify_cq(device->rx_cq, IB_CQ_NEXT_COMP))
 		goto cq_arm_err;
 
 	tasklet_init(&device->cq_tasklet,
@@ -88,13 +102,22 @@ static int iser_create_device_ib_res(struct iser_device *device)
 	if (IS_ERR(device->mr))
 		goto dma_mr_err;
 
+	INIT_IB_EVENT_HANDLER(&device->event_handler, device->ib_device,
+				iser_event_handler);
+	if (ib_register_event_handler(&device->event_handler))
+		goto handler_err;
+
 	return 0;
 
+handler_err:
+	ib_dereg_mr(device->mr);
 dma_mr_err:
 	tasklet_kill(&device->cq_tasklet);
 cq_arm_err:
-	ib_destroy_cq(device->cq);
-cq_err:
+	ib_destroy_cq(device->tx_cq);
+tx_cq_err:
+	ib_destroy_cq(device->rx_cq);
+rx_cq_err:
 	ib_dealloc_pd(device->pd);
 pd_err:
 	iser_err("failed to allocate an IB resource\n");
@@ -110,13 +133,15 @@ static void iser_free_device_ib_res(struct iser_device *device)
 	BUG_ON(device->mr == NULL);
 
 	tasklet_kill(&device->cq_tasklet);
-
+	(void)ib_unregister_event_handler(&device->event_handler);
 	(void)ib_dereg_mr(device->mr);
-	(void)ib_destroy_cq(device->cq);
+	(void)ib_destroy_cq(device->tx_cq);
+	(void)ib_destroy_cq(device->rx_cq);
 	(void)ib_dealloc_pd(device->pd);
 
 	device->mr = NULL;
-	device->cq = NULL;
+	device->tx_cq = NULL;
+	device->rx_cq = NULL;
 	device->pd = NULL;
 }
 
@@ -129,13 +154,23 @@ static int iser_create_ib_conn_res(struct iser_conn *ib_conn)
 {
 	struct iser_device	*device;
 	struct ib_qp_init_attr	init_attr;
-	int			ret;
+	int			ret = -ENOMEM;
 	struct ib_fmr_pool_param params;
 
 	BUG_ON(ib_conn->device == NULL);
 
 	device = ib_conn->device;
 
+	ib_conn->login_buf = kmalloc(ISER_RX_LOGIN_SIZE, GFP_KERNEL);
+	if (!ib_conn->login_buf) {
+		goto alloc_err;
+		ret = -ENOMEM;
+	}
+
+	ib_conn->login_dma = ib_dma_map_single(ib_conn->device->ib_device,
+				(void *)ib_conn->login_buf, ISER_RX_LOGIN_SIZE,
+				DMA_FROM_DEVICE);
+
 	ib_conn->page_vec = kmalloc(sizeof(struct iser_page_vec) +
 				    (sizeof(u64) * (ISCSI_ISER_SG_TABLESIZE +1)),
 				    GFP_KERNEL);
@@ -169,12 +204,12 @@ static int iser_create_ib_conn_res(struct iser_conn *ib_conn)
 
 	init_attr.event_handler = iser_qp_event_callback;
 	init_attr.qp_context	= (void *)ib_conn;
-	init_attr.send_cq	= device->cq;
-	init_attr.recv_cq	= device->cq;
+	init_attr.send_cq	= device->tx_cq;
+	init_attr.recv_cq	= device->rx_cq;
 	init_attr.cap.max_send_wr  = ISER_QP_MAX_REQ_DTOS;
 	init_attr.cap.max_recv_wr  = ISER_QP_MAX_RECV_DTOS;
-	init_attr.cap.max_send_sge = MAX_REGD_BUF_VECTOR_LEN;
-	init_attr.cap.max_recv_sge = 2;
+	init_attr.cap.max_send_sge = 2;
+	init_attr.cap.max_recv_sge = 1;
 	init_attr.sq_sig_type	= IB_SIGNAL_REQ_WR;
 	init_attr.qp_type	= IB_QPT_RC;
 
@@ -192,6 +227,7 @@ qp_err:
 	(void)ib_destroy_fmr_pool(ib_conn->fmr_pool);
 fmr_pool_err:
 	kfree(ib_conn->page_vec);
+	kfree(ib_conn->login_buf);
 alloc_err:
 	iser_err("unable to alloc mem or create resource, err %d\n", ret);
 	return ret;
@@ -201,7 +237,7 @@ alloc_err:
  * releases the FMR pool, QP and CMA ID objects, returns 0 on success,
  * -1 on failure
  */
-static int iser_free_ib_conn_res(struct iser_conn *ib_conn)
+static int iser_free_ib_conn_res(struct iser_conn *ib_conn, int can_destroy_id)
 {
 	BUG_ON(ib_conn == NULL);
 
@@ -216,7 +252,8 @@ static int iser_free_ib_conn_res(struct iser_conn *ib_conn)
 	if (ib_conn->qp != NULL)
 		rdma_destroy_qp(ib_conn->cma_id);
 
-	if (ib_conn->cma_id != NULL)
+	/* if cma handler context, the caller acts s.t the cma destroy the id */
+	if (ib_conn->cma_id != NULL && can_destroy_id)
 		rdma_destroy_id(ib_conn->cma_id);
 
 	ib_conn->fmr_pool = NULL;
@@ -278,17 +315,6 @@ static void iser_device_try_release(struct iser_device *device)
 	mutex_unlock(&ig.device_list_mutex);
 }
 
-int iser_conn_state_comp(struct iser_conn *ib_conn,
-			enum iser_ib_conn_state comp)
-{
-	int ret;
-
-	spin_lock_bh(&ib_conn->lock);
-	ret = (ib_conn->state == comp);
-	spin_unlock_bh(&ib_conn->lock);
-	return ret;
-}
-
 static int iser_conn_state_comp_exch(struct iser_conn *ib_conn,
 				     enum iser_ib_conn_state comp,
 				     enum iser_ib_conn_state exch)
@@ -305,7 +331,7 @@ static int iser_conn_state_comp_exch(struct iser_conn *ib_conn,
 /**
  * Frees all conn objects and deallocs conn descriptor
  */
-static void iser_conn_release(struct iser_conn *ib_conn)
+static void iser_conn_release(struct iser_conn *ib_conn, int can_destroy_id)
 {
 	struct iser_device  *device = ib_conn->device;
 
@@ -314,14 +340,12 @@ static void iser_conn_release(struct iser_conn *ib_conn)
 	mutex_lock(&ig.connlist_mutex);
 	list_del(&ib_conn->conn_list);
 	mutex_unlock(&ig.connlist_mutex);
-
-	iser_free_ib_conn_res(ib_conn);
+	iser_free_rx_descriptors(ib_conn);
+	iser_free_ib_conn_res(ib_conn, can_destroy_id);
 	ib_conn->device = NULL;
 	/* on EVENT_ADDR_ERROR there's no device yet for this conn */
 	if (device != NULL)
 		iser_device_try_release(device);
-	if (ib_conn->iser_conn)
-		ib_conn->iser_conn->ib_conn = NULL;
 	iscsi2_destroy_endpoint(ib_conn->ep);
 }
 
@@ -330,10 +354,13 @@ void iser_conn_get(struct iser_conn *ib_conn)
 	atomic_inc(&ib_conn->refcount);
 }
 
-void iser_conn_put(struct iser_conn *ib_conn)
+int iser_conn_put(struct iser_conn *ib_conn, int can_destroy_id)
 {
-	if (atomic_dec_and_test(&ib_conn->refcount))
-		iser_conn_release(ib_conn);
+	if (atomic_dec_and_test(&ib_conn->refcount)) {
+		iser_conn_release(ib_conn, can_destroy_id);
+		return 1;
+	}
+	return 0;
 }
 
 /**
@@ -357,19 +384,20 @@ void iser_conn_terminate(struct iser_conn *ib_conn)
 	wait_event_interruptible(ib_conn->wait,
 				 ib_conn->state == ISER_CONN_DOWN);
 
-	iser_conn_put(ib_conn);
+	iser_conn_put(ib_conn, 1); /* deref ib conn deallocate */
 }
 
-static void iser_connect_error(struct rdma_cm_id *cma_id)
+static int iser_connect_error(struct rdma_cm_id *cma_id)
 {
 	struct iser_conn *ib_conn;
 	ib_conn = (struct iser_conn *)cma_id->context;
 
 	ib_conn->state = ISER_CONN_DOWN;
 	wake_up_interruptible(&ib_conn->wait);
+	return iser_conn_put(ib_conn, 0); /* deref ib conn's cma id */
 }
 
-static void iser_addr_handler(struct rdma_cm_id *cma_id)
+static int iser_addr_handler(struct rdma_cm_id *cma_id)
 {
 	struct iser_device *device;
 	struct iser_conn   *ib_conn;
@@ -378,8 +406,7 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id)
 	device = iser_device_find_by_ib_device(cma_id);
 	if (!device) {
 		iser_err("device lookup/creation failed\n");
-		iser_connect_error(cma_id);
-		return;
+		return iser_connect_error(cma_id);
 	}
 
 	ib_conn = (struct iser_conn *)cma_id->context;
@@ -388,11 +415,13 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id)
 	ret = rdma_resolve_route(cma_id, 1000);
 	if (ret) {
 		iser_err("resolve route failed: %d\n", ret);
-		iser_connect_error(cma_id);
+		return iser_connect_error(cma_id);
 	}
+
+	return 0;
 }
 
-static void iser_route_handler(struct rdma_cm_id *cma_id)
+static int iser_route_handler(struct rdma_cm_id *cma_id)
 {
 	struct rdma_conn_param conn_param;
 	int    ret;
@@ -413,9 +442,9 @@ static void iser_route_handler(struct rdma_cm_id *cma_id)
 		goto failure;
 	}
 
-	return;
+	return 0;
 failure:
-	iser_connect_error(cma_id);
+	return iser_connect_error(cma_id);
 }
 
 static void iser_connected_handler(struct rdma_cm_id *cma_id)
@@ -427,12 +456,12 @@ static void iser_connected_handler(struct rdma_cm_id *cma_id)
 	wake_up_interruptible(&ib_conn->wait);
 }
 
-static void iser_disconnected_handler(struct rdma_cm_id *cma_id)
+static int iser_disconnected_handler(struct rdma_cm_id *cma_id)
 {
 	struct iser_conn *ib_conn;
+	int ret;
 
 	ib_conn = (struct iser_conn *)cma_id->context;
-	ib_conn->disc_evt_flag = 1;
 
 	/* getting here when the state is UP means that the conn is being *
 	 * terminated asynchronously from the iSCSI layer's perspective.  */
@@ -442,25 +471,29 @@ static void iser_disconnected_handler(struct rdma_cm_id *cma_id)
 				   ISCSI_ERR_CONN_FAILED);
 
 	/* Complete the termination process if no posts are pending */
-	if ((atomic_read(&ib_conn->post_recv_buf_count) == 0) &&
+	if (ib_conn->post_recv_buf_count == 0 &&
 	    (atomic_read(&ib_conn->post_send_buf_count) == 0)) {
 		ib_conn->state = ISER_CONN_DOWN;
 		wake_up_interruptible(&ib_conn->wait);
 	}
+
+	ret = iser_conn_put(ib_conn, 0); /* deref ib conn's cma id */
+	return ret;
 }
 
 static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
 {
 	int ret = 0;
 
-	iser_err("event %d conn %p id %p\n",event->event,cma_id->context,cma_id);
+	iser_err("event %d status %d conn %p id %p\n",
+		event->event, event->status, cma_id->context, cma_id);
 
 	switch (event->event) {
 	case RDMA_CM_EVENT_ADDR_RESOLVED:
-		iser_addr_handler(cma_id);
+		ret = iser_addr_handler(cma_id);
 		break;
 	case RDMA_CM_EVENT_ROUTE_RESOLVED:
-		iser_route_handler(cma_id);
+		ret = iser_route_handler(cma_id);
 		break;
 	case RDMA_CM_EVENT_ESTABLISHED:
 		iser_connected_handler(cma_id);
@@ -470,13 +503,12 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve
 	case RDMA_CM_EVENT_CONNECT_ERROR:
 	case RDMA_CM_EVENT_UNREACHABLE:
 	case RDMA_CM_EVENT_REJECTED:
-		iser_err("event: %d, error: %d\n", event->event, event->status);
-		iser_connect_error(cma_id);
+		ret = iser_connect_error(cma_id);
 		break;
 	case RDMA_CM_EVENT_DISCONNECTED:
 	case RDMA_CM_EVENT_DEVICE_REMOVAL:
 	case RDMA_CM_EVENT_ADDR_CHANGE:
-		iser_disconnected_handler(cma_id);
+		ret = iser_disconnected_handler(cma_id);
 		break;
 	default:
 		iser_err("Unexpected RDMA CM event (%d)\n", event->event);
@@ -489,10 +521,9 @@ void iser_conn_init(struct iser_conn *ib_conn)
 {
 	ib_conn->state = ISER_CONN_INIT;
 	init_waitqueue_head(&ib_conn->wait);
-	atomic_set(&ib_conn->post_recv_buf_count, 0);
+	ib_conn->post_recv_buf_count = 0;
 	atomic_set(&ib_conn->post_send_buf_count, 0);
-	atomic_set(&ib_conn->unexpected_pdu_count, 0);
-	atomic_set(&ib_conn->refcount, 1);
+	atomic_set(&ib_conn->refcount, 1); /* ref ib conn allocation */
 	INIT_LIST_HEAD(&ib_conn->conn_list);
 	spin_lock_init(&ib_conn->lock);
 }
@@ -520,6 +551,7 @@ int iser_connect(struct iser_conn   *ib_conn,
 
 	ib_conn->state = ISER_CONN_PENDING;
 
+	iser_conn_get(ib_conn); /* ref ib conn's cma id */
 	ib_conn->cma_id = rdma_create_id(iser_cma_handler,
 					     (void *)ib_conn,
 					     RDMA_PS_TCP);
@@ -557,7 +589,7 @@ id_failure:
 addr_failure:
 	ib_conn->state = ISER_CONN_DOWN;
 connect_failure:
-	iser_conn_release(ib_conn);
+	iser_conn_release(ib_conn, 1);
 	return err;
 }
 
@@ -626,136 +658,97 @@ void iser_unreg_mem(struct iser_mem_reg *reg)
 	reg->mem_h = NULL;
 }
 
-/**
- * iser_dto_to_iov - builds IOV from a dto descriptor
- */
-static void iser_dto_to_iov(struct iser_dto *dto, struct ib_sge *iov, int iov_len)
+int iser_post_recvl(struct iser_conn *ib_conn)
 {
-	int		     i;
-	struct ib_sge	     *sge;
-	struct iser_regd_buf *regd_buf;
-
-	if (dto->regd_vector_len > iov_len) {
-		iser_err("iov size %d too small for posting dto of len %d\n",
-			 iov_len, dto->regd_vector_len);
-		BUG();
-	}
+	struct ib_recv_wr rx_wr, *rx_wr_failed;
+	struct ib_sge	  sge;
+	int ib_ret;
 
-	for (i = 0; i < dto->regd_vector_len; i++) {
-		sge	    = &iov[i];
-		regd_buf  = dto->regd[i];
-
-		sge->addr   = regd_buf->reg.va;
-		sge->length = regd_buf->reg.len;
-		sge->lkey   = regd_buf->reg.lkey;
-
-		if (dto->used_sz[i] > 0)  /* Adjust size */
-			sge->length = dto->used_sz[i];
-
-		/* offset and length should not exceed the regd buf length */
-		if (sge->length + dto->offset[i] > regd_buf->reg.len) {
-			iser_err("Used len:%ld + offset:%d, exceed reg.buf.len:"
-				 "%ld in dto:0x%p [%d], va:0x%08lX\n",
-				 (unsigned long)sge->length, dto->offset[i],
-				 (unsigned long)regd_buf->reg.len, dto, i,
-				 (unsigned long)sge->addr);
-			BUG();
-		}
+	sge.addr   = ib_conn->login_dma;
+	sge.length = ISER_RX_LOGIN_SIZE;
+	sge.lkey   = ib_conn->device->mr->lkey;
 
-		sge->addr += dto->offset[i]; /* Adjust offset */
+	rx_wr.wr_id   = (unsigned long)ib_conn->login_buf;
+	rx_wr.sg_list = &sge;
+	rx_wr.num_sge = 1;
+	rx_wr.next    = NULL;
+
+	ib_conn->post_recv_buf_count++;
+	ib_ret	= ib_post_recv(ib_conn->qp, &rx_wr, &rx_wr_failed);
+	if (ib_ret) {
+		iser_err("ib_post_recv failed ret=%d\n", ib_ret);
+		ib_conn->post_recv_buf_count--;
 	}
+	return ib_ret;
 }
 
-/**
- * iser_post_recv - Posts a receive buffer.
- *
- * returns 0 on success, -1 on failure
- */
-int iser_post_recv(struct iser_desc *rx_desc)
+int iser_post_recvm(struct iser_conn *ib_conn, int count)
 {
-	int		  ib_ret, ret_val = 0;
-	struct ib_recv_wr recv_wr, *recv_wr_failed;
-	struct ib_sge	  iov[2];
-	struct iser_conn  *ib_conn;
-	struct iser_dto   *recv_dto = &rx_desc->dto;
-
-	/* Retrieve conn */
-	ib_conn = recv_dto->ib_conn;
-
-	iser_dto_to_iov(recv_dto, iov, 2);
+	struct ib_recv_wr *rx_wr, *rx_wr_failed;
+	int i, ib_ret;
+	unsigned int my_rx_head = ib_conn->rx_desc_head;
+	struct iser_rx_desc *rx_desc;
+
+	for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) {
+		rx_desc		= &ib_conn->rx_descs[my_rx_head];
+		rx_wr->wr_id	= (unsigned long)rx_desc;
+		rx_wr->sg_list	= &rx_desc->rx_sg;
+		rx_wr->num_sge	= 1;
+		rx_wr->next	= rx_wr + 1;
+		my_rx_head = (my_rx_head + 1) & (ISER_QP_MAX_RECV_DTOS - 1);
+	}
 
-	recv_wr.next	= NULL;
-	recv_wr.sg_list = iov;
-	recv_wr.num_sge = recv_dto->regd_vector_len;
-	recv_wr.wr_id	= (unsigned long)rx_desc;
+	rx_wr--;
+	rx_wr->next = NULL; /* mark end of work requests list */
 
-	atomic_inc(&ib_conn->post_recv_buf_count);
-	ib_ret	= ib_post_recv(ib_conn->qp, &recv_wr, &recv_wr_failed);
+	ib_conn->post_recv_buf_count += count;
+	ib_ret	= ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &rx_wr_failed);
 	if (ib_ret) {
 		iser_err("ib_post_recv failed ret=%d\n", ib_ret);
-		atomic_dec(&ib_conn->post_recv_buf_count);
-		ret_val = -1;
-	}
-
-	return ret_val;
+		ib_conn->post_recv_buf_count -= count;
+	} else
+		ib_conn->rx_desc_head = my_rx_head;
+	return ib_ret;
 }
 
+
 /**
  * iser_start_send - Initiate a Send DTO operation
  *
  * returns 0 on success, -1 on failure
  */
-int iser_post_send(struct iser_desc *tx_desc)
+int iser_post_send(struct iser_conn *ib_conn, struct iser_tx_desc *tx_desc)
 {
-	int		  ib_ret, ret_val = 0;
+	int		  ib_ret;
 	struct ib_send_wr send_wr, *send_wr_failed;
-	struct ib_sge	  iov[MAX_REGD_BUF_VECTOR_LEN];
-	struct iser_conn  *ib_conn;
-	struct iser_dto   *dto = &tx_desc->dto;
 
-	ib_conn = dto->ib_conn;
-
-	iser_dto_to_iov(dto, iov, MAX_REGD_BUF_VECTOR_LEN);
+	ib_dma_sync_single_for_device(ib_conn->device->ib_device,
+		tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE);
 
 	send_wr.next	   = NULL;
 	send_wr.wr_id	   = (unsigned long)tx_desc;
-	send_wr.sg_list	   = iov;
-	send_wr.num_sge	   = dto->regd_vector_len;
+	send_wr.sg_list	   = tx_desc->tx_sg;
+	send_wr.num_sge	   = tx_desc->num_sge;
 	send_wr.opcode	   = IB_WR_SEND;
-	send_wr.send_flags = dto->notify_enable ? IB_SEND_SIGNALED : 0;
+	send_wr.send_flags = IB_SEND_SIGNALED;
 
 	atomic_inc(&ib_conn->post_send_buf_count);
 
 	ib_ret = ib_post_send(ib_conn->qp, &send_wr, &send_wr_failed);
 	if (ib_ret) {
-		iser_err("Failed to start SEND DTO, dto: 0x%p, IOV len: %d\n",
-			 dto, dto->regd_vector_len);
 		iser_err("ib_post_send failed, ret:%d\n", ib_ret);
 		atomic_dec(&ib_conn->post_send_buf_count);
-		ret_val = -1;
 	}
-
-	return ret_val;
+	return ib_ret;
 }
 
-static void iser_handle_comp_error(struct iser_desc *desc)
+static void iser_handle_comp_error(struct iser_tx_desc *desc,
+				struct iser_conn *ib_conn)
 {
-	struct iser_dto  *dto     = &desc->dto;
-	struct iser_conn *ib_conn = dto->ib_conn;
-
-	iser_dto_buffs_release(dto);
-
-	if (desc->type == ISCSI_RX) {
-		kfree(desc->data);
+	if (desc && desc->type == ISCSI_TX_DATAOUT)
 		kmem_cache_free(ig.desc_cache, desc);
-		atomic_dec(&ib_conn->post_recv_buf_count);
-	} else { /* type is TX control/command/dataout */
-		if (desc->type == ISCSI_TX_DATAOUT)
-			kmem_cache_free(ig.desc_cache, desc);
-		atomic_dec(&ib_conn->post_send_buf_count);
-	}
 
-	if (atomic_read(&ib_conn->post_recv_buf_count) == 0 &&
+	if (ib_conn->post_recv_buf_count == 0 &&
 	    atomic_read(&ib_conn->post_send_buf_count) == 0) {
 		/* getting here when the state is UP means that the conn is *
 		 * being terminated asynchronously from the iSCSI layer's   *
@@ -765,41 +758,81 @@ static void iser_handle_comp_error(struct iser_desc *desc)
 			iscsi2_conn_failure(ib_conn->iser_conn->iscsi_conn,
 					   ISCSI_ERR_CONN_FAILED);
 
-		/* complete the termination process if disconnect event was delivered *
-		 * note there are no more non completed posts to the QP               */
-		if (ib_conn->disc_evt_flag) {
-			ib_conn->state = ISER_CONN_DOWN;
-			wake_up_interruptible(&ib_conn->wait);
+		/* no more non completed posts to the QP, complete the
+		 * termination process w.o worrying on disconnect event */
+		ib_conn->state = ISER_CONN_DOWN;
+		wake_up_interruptible(&ib_conn->wait);
+	}
+}
+
+static int iser_drain_tx_cq(struct iser_device  *device)
+{
+	struct ib_cq  *cq = device->tx_cq;
+	struct ib_wc  wc;
+	struct iser_tx_desc *tx_desc;
+	struct iser_conn *ib_conn;
+	int completed_tx = 0;
+
+	while (ib_poll_cq(cq, 1, &wc) == 1) {
+		tx_desc	= (struct iser_tx_desc *) (unsigned long) wc.wr_id;
+		ib_conn = wc.qp->qp_context;
+		if (wc.status == IB_WC_SUCCESS) {
+			if (wc.opcode == IB_WC_SEND)
+				iser_snd_completion(tx_desc, ib_conn);
+			else
+				iser_err("expected opcode %d got %d\n",
+					IB_WC_SEND, wc.opcode);
+		} else {
+			iser_err("tx id %llx status %d vend_err %x\n",
+				wc.wr_id, wc.status, wc.vendor_err);
+			atomic_dec(&ib_conn->post_send_buf_count);
+			iser_handle_comp_error(tx_desc, ib_conn);
 		}
+		completed_tx++;
 	}
+	return completed_tx;
 }
 
+
 static void iser_cq_tasklet_fn(unsigned long data)
 {
 	 struct iser_device  *device = (struct iser_device *)data;
-	 struct ib_cq	     *cq = device->cq;
+	 struct ib_cq	     *cq = device->rx_cq;
 	 struct ib_wc	     wc;
-	 struct iser_desc    *desc;
+	 struct iser_rx_desc *desc;
 	 unsigned long	     xfer_len;
+	struct iser_conn *ib_conn;
+	int completed_tx, completed_rx;
+	completed_tx = completed_rx = 0;
 
 	while (ib_poll_cq(cq, 1, &wc) == 1) {
-		desc	 = (struct iser_desc *) (unsigned long) wc.wr_id;
+		desc	 = (struct iser_rx_desc *) (unsigned long) wc.wr_id;
 		BUG_ON(desc == NULL);
-
+		ib_conn = wc.qp->qp_context;
 		if (wc.status == IB_WC_SUCCESS) {
-			if (desc->type == ISCSI_RX) {
+			if (wc.opcode == IB_WC_RECV) {
 				xfer_len = (unsigned long)wc.byte_len;
-				iser_rcv_completion(desc, xfer_len);
-			} else /* type == ISCSI_TX_CONTROL/SCSI_CMD/DOUT */
-				iser_snd_completion(desc);
+				iser_rcv_completion(desc, xfer_len, ib_conn);
+			} else
+				iser_err("expected opcode %d got %d\n",
+					IB_WC_RECV, wc.opcode);
 		} else {
-			iser_err("comp w. error op %d status %d\n",desc->type,wc.status);
-			iser_handle_comp_error(desc);
+			if (wc.status != IB_WC_WR_FLUSH_ERR)
+				iser_err("rx id %llx status %d vend_err %x\n",
+					wc.wr_id, wc.status, wc.vendor_err);
+			ib_conn->post_recv_buf_count--;
+			iser_handle_comp_error(NULL, ib_conn);
 		}
+		completed_rx++;
+		if (!(completed_rx & 63))
+			completed_tx += iser_drain_tx_cq(device);
 	}
 	/* #warning "it is assumed here that arming CQ only once its empty" *
 	 * " would not cause interrupts to be missed"                       */
 	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+
+	completed_tx += iser_drain_tx_cq(device);
+	iser_dbg("got %d rx %d tx completions\n", completed_rx, completed_tx);
 }
 
 static void iser_cq_callback(struct ib_cq *cq, void *cq_context)