Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > 27922b4260f65d317aabda37e42bbbff > files > 1588

kernel-2.6.18-238.el5.src.rpm

From: Doug Ledford <dledford@redhat.com>
Date: Mon, 15 Jun 2009 09:52:45 -0400
Subject: [infiniband] RDS: Update to ofed 1.4.1 final bits
Message-id: 7ecbc9e071f8791cb81f38f35591bbb05ad208fc.1245072810.git.dledford@redhat.com
O-Subject: [Patch RHEL5.4 10/16] [RDS] Update to ofed 1.4.1 final bits
Bugzilla: 506097

Signed-off-by: Doug Ledford <dledford@redhat.com>

diff --git a/net/rds/connection.c b/net/rds/connection.c
index ab68faf..a26df1c 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -157,7 +157,6 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 	memset(conn, 0, sizeof(*conn));
 
 	INIT_HLIST_NODE(&conn->c_hash_node);
-	conn->c_version = RDS_PROTOCOL_3_0;
 	conn->c_laddr = laddr;
 	conn->c_faddr = faddr;
 	spin_lock_init(&conn->c_lock);
diff --git a/net/rds/ib.c b/net/rds/ib.c
index d0e24d9..dd99b44 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -43,11 +43,14 @@
 
 unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
 unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */
+unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
 
 module_param(fmr_pool_size, int, 0444);
 MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA");
 module_param(fmr_message_size, int, 0444);
 MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
+module_param(rds_ib_retry_count, int, 0444);
+MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
 
 struct list_head rds_ib_devices;
 
@@ -224,8 +227,8 @@ static int rds_ib_laddr_check(__be32 addr)
 	 * IB and iWARP capable NICs.
 	 */
 	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
-	if (!cm_id)
-		return -EADDRNOTAVAIL;
+	if (IS_ERR(cm_id))
+		return PTR_ERR(cm_id);
 
 	memset(&sin, 0, sizeof(sin));
 	sin.sin_family = AF_INET;
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 5b5d41b..1aca6d5 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -15,6 +15,8 @@
 #define RDS_IB_DEFAULT_RECV_WR		1024
 #define RDS_IB_DEFAULT_SEND_WR		256
 
+#define RDS_IB_DEFAULT_RETRY_COUNT	2
+
 #define RDS_IB_SUPPORTED_PROTOCOLS	0x00000003	/* minor versions supported */
 
 extern struct list_head rds_ib_devices;
@@ -247,6 +249,7 @@ extern struct ib_client rds_ib_client;
 
 extern unsigned int fmr_pool_size;
 extern unsigned int fmr_message_size;
+extern unsigned int rds_ib_retry_count;
 
 extern spinlock_t ib_nodev_conns_lock;
 extern struct list_head ib_nodev_conns;
@@ -355,17 +358,25 @@ extern ctl_table rds_ib_sysctl_table[];
 /*
  * Helper functions for getting/setting the header and data SGEs in
  * RDS packets (not RDMA)
+ *
+ * From version 3.1 onwards, header is in front of data in the sge.
  */
 static inline struct ib_sge *
 rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
 {
-	return &sge[0];
+	if (ic->conn->c_version > RDS_PROTOCOL_3_0)
+		return &sge[0];
+	else
+		return &sge[1];
 }
 
 static inline struct ib_sge *
 rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
 {
-	return &sge[1];
+	if (ic->conn->c_version > RDS_PROTOCOL_3_0)
+		return &sge[1];
+	else
+		return &sge[0];
 }
 
 #endif
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 73285e6..b376bd1 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -98,21 +98,34 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 	struct ib_qp_attr qp_attr;
 	int err;
 
-	if (event->param.conn.private_data_len) {
+	if (event->param.conn.private_data_len >= sizeof(*dp)) {
 		dp = event->param.conn.private_data;
 
-		rds_ib_set_protocol(conn,
+		/* make sure it isn't empty data */
+		if (dp->dp_protocol_major) {
+			rds_ib_set_protocol(conn,
 				RDS_PROTOCOL(dp->dp_protocol_major,
-					dp->dp_protocol_minor));
-		rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+				dp->dp_protocol_minor));
+			rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+		}
 	}
 
 	printk(KERN_NOTICE "RDS/IB: connected to %u.%u.%u.%u version %u.%u%s\n",
-			NIPQUAD(conn->c_laddr),
+			NIPQUAD(conn->c_faddr),
 			RDS_PROTOCOL_MAJOR(conn->c_version),
 			RDS_PROTOCOL_MINOR(conn->c_version),
 			ic->i_flowctl ? ", flow control" : "");
 
+	/*
+	 * Init rings and fill recv. this needs to wait until protocol negotiation
+	 * is complete, since ring layout is different from 3.0 to 3.1.
+	 */
+	rds_ib_send_init_ring(ic);
+	rds_ib_recv_init_ring(ic);
+	/* Post receive buffers - as a side effect, this will update
+	 * the posted credit count. */
+	rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
+
 	/* Tune RNR behavior */
 	rds_ib_tune_rnr(ic, &qp_attr);
 
@@ -145,7 +158,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
 	/* XXX tune these? */
 	conn_param->responder_resources = 1;
 	conn_param->initiator_depth = 1;
-	conn_param->retry_count = 7;
+	conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7);
 	conn_param->rnr_retry_count = 7;
 
 	if (dp) {
@@ -190,9 +203,10 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
 		rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
 		break;
 	default:
-		printk(KERN_WARNING "RDS/ib: unhandled QP event %u "
-		       "on connection to %u.%u.%u.%u\n", event->event,
-		       NIPQUAD(conn->c_faddr));
+		rds_ib_conn_error(conn, "RDS/IB: Fatal QP Event %u "
+			"- connection %u.%u.%u.%u->%u.%u.%u.%u...reconnecting\n",
+			event->event, NIPQUAD(conn->c_laddr),
+			NIPQUAD(conn->c_faddr));
 		break;
 	}
 }
@@ -321,7 +335,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 		rdsdebug("send allocation failed\n");
 		goto out;
 	}
-	rds_ib_send_init_ring(ic);
+	memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
 
 	ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work));
 	if (ic->i_recvs == NULL) {
@@ -329,14 +343,10 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 		rdsdebug("recv allocation failed\n");
 		goto out;
 	}
+	memset(ic->i_recvs, 0, ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work));
 
-	rds_ib_recv_init_ring(ic);
 	rds_ib_recv_init_ack(ic);
 
-	/* Post receive buffers - as a side effect, this will update
-	 * the posted credit count. */
-	rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
-
 	rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
 		 ic->i_send_cq, ic->i_recv_cq);
 
@@ -344,19 +354,32 @@ out:
 	return ret;
 }
 
-static u32 rds_ib_protocol_compatible(const struct rds_ib_connect_private *dp)
+static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event)
 {
+	const struct rds_ib_connect_private *dp = event->param.conn.private_data;
 	u16 common;
 	u32 version = 0;
 
-	/* rdma_cm private data is odd - when there is any private data in the
+	/*
+	 * rdma_cm private data is odd - when there is any private data in the
 	 * request, we will be given a pretty large buffer without telling us the
 	 * original size. The only way to tell the difference is by looking at
 	 * the contents, which are initialized to zero.
 	 * If the protocol version fields aren't set, this is a connection attempt
 	 * from an older version. This could could be 3.0 or 2.0 - we can't tell.
-	 * We really should have changed this for OFED 1.3 :-( */
-	if (dp->dp_protocol_major == 0)
+	 * We really should have changed this for OFED 1.3 :-(
+	 */
+
+	/* Be paranoid. RDS always has privdata */
+	if (!event->param.conn.private_data_len) {
+		printk(KERN_NOTICE "RDS incoming connection has no private data, "
+			"rejecting\n");
+		return 0;
+	}
+
+	/* Even if len is crap *now* I still want to check it. -ASG */
+	if (event->param.conn.private_data_len < sizeof (*dp)
+	    || dp->dp_protocol_major == 0)
 		return RDS_PROTOCOL_3_0;
 
 	common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS;
@@ -388,7 +411,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 	int err, destroy = 1;
 
 	/* Check whether the remote protocol version matches ours. */
-	version = rds_ib_protocol_compatible(dp);
+	version = rds_ib_protocol_compatible(event);
 	if (!version)
 		goto out;
 
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index a57c2fa..29ec8be 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -555,6 +555,47 @@ u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic)
 	return rds_ib_get_ack(ic);
 }
 
+static struct rds_header *rds_ib_get_header(struct rds_connection *conn,
+					    struct rds_ib_recv_work *recv,
+					    u32 data_len)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	void *hdr_buff = &ic->i_recv_hdrs[recv - ic->i_recvs];
+	void *addr;
+	u32 misplaced_hdr_bytes;
+
+	/*
+	 * Support header at the front (RDS 3.1+) as well as header-at-end.
+	 *
+	 * Cases:
+	 * 1) header all in header buff (great!)
+	 * 2) header all in data page (copy all to header buff)
+	 * 3) header split across hdr buf + data page
+	 *    (move bit in hdr buff to end before copying other bit from data page)
+	 */
+	if (conn->c_version > RDS_PROTOCOL_3_0 || data_len == RDS_FRAG_SIZE)
+	        return hdr_buff;
+
+	if (data_len <= (RDS_FRAG_SIZE - sizeof(struct rds_header))) {
+		addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
+		memcpy(hdr_buff,
+		       addr + recv->r_frag->f_offset + data_len,
+		       sizeof(struct rds_header));
+		kunmap_atomic(addr, KM_SOFTIRQ0);
+		return hdr_buff;
+	}
+
+	misplaced_hdr_bytes = (sizeof(struct rds_header) - (RDS_FRAG_SIZE - data_len));
+
+	memmove(hdr_buff + misplaced_hdr_bytes, hdr_buff, misplaced_hdr_bytes);
+
+	addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
+	memcpy(hdr_buff, addr + recv->r_frag->f_offset + data_len,
+	       sizeof(struct rds_header) - misplaced_hdr_bytes);
+	kunmap_atomic(addr, KM_SOFTIRQ0);
+	return hdr_buff;
+}
+
 /*
  * It's kind of lame that we're copying from the posted receive pages into
  * long-lived bitmaps.  We could have posted the bitmaps and rdma written into
@@ -645,7 +686,7 @@ struct rds_ib_ack_state {
 };
 
 static void rds_ib_process_recv(struct rds_connection *conn,
-				struct rds_ib_recv_work *recv, u32 byte_len,
+				struct rds_ib_recv_work *recv, u32 data_len,
 				struct rds_ib_ack_state *state)
 {
 	struct rds_ib_connection *ic = conn->c_transport_data;
@@ -655,9 +696,9 @@ static void rds_ib_process_recv(struct rds_connection *conn,
 	/* XXX shut down the connection if port 0,0 are seen? */
 
 	rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv,
-		 byte_len);
+		 data_len);
 
-	if (byte_len < sizeof(struct rds_header)) {
+	if (data_len < sizeof(struct rds_header)) {
 		rds_ib_conn_error(conn, "incoming message "
 		       "from %u.%u.%u.%u didn't inclue a "
 		       "header, disconnecting and "
@@ -665,9 +706,9 @@ static void rds_ib_process_recv(struct rds_connection *conn,
 		       NIPQUAD(conn->c_faddr));
 		return;
 	}
-	byte_len -= sizeof(struct rds_header);
+	data_len -= sizeof(struct rds_header);
 
-	ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
+	ihdr = rds_ib_get_header(conn, recv, data_len);
 
 	/* Validate the checksum. */
 	if (!rds_message_verify_checksum(ihdr)) {
@@ -687,7 +728,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
 	if (ihdr->h_credit)
 		rds_ib_send_add_credits(conn, ihdr->h_credit);
 
-	if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) {
+	if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) {
 		/* This is an ACK-only packet. The fact that it gets
 		 * special treatment here is that historically, ACKs
 		 * were rather special beasts.
diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c
index d87830d..84b5ffc 100644
--- a/net/rds/ib_sysctl.c
+++ b/net/rds/ib_sysctl.c
@@ -53,7 +53,17 @@ unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20);
 static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1;
 static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL;
 
-unsigned int rds_ib_sysctl_flow_control = 1;
+/*
+ * This sysctl does nothing.
+ *
+ * Backwards compatibility with RDS 3.0 wire protocol
+ * disables initial FC credit exchange.
+ * If it's ever possible to drop 3.0 support,
+ * setting this to 1 and moving init/refill of send/recv
+ * rings from ib_cm_connect_complete() back into ib_setup_qp()
+ * will cause credits to be added before protocol negotiation.
+ */
+unsigned int rds_ib_sysctl_flow_control = 0;
 
 ctl_table rds_ib_sysctl_table[] = {
 	{
diff --git a/net/rds/iw.c b/net/rds/iw.c
index ba80245..23f6e31 100644
--- a/net/rds/iw.c
+++ b/net/rds/iw.c
@@ -233,8 +233,8 @@ static int rds_iw_laddr_check(__be32 addr)
 	 * IB and iWARP capable NICs.
 	 */
 	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
-	if (!cm_id)
-		return -EADDRNOTAVAIL;
+	if (IS_ERR(cm_id))
+		return PTR_ERR(cm_id);
 
 	memset(&sin, 0, sizeof(sin));
 	sin.sin_family = AF_INET;
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index 979175f..418f7d5 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -101,7 +101,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
 		break;
 
 	case RDMA_CM_EVENT_DISCONNECTED:
-		printk(KERN_WARNING "RDS/IW: DISCONNECT event - dropping connection "
+		printk(KERN_WARNING "RDS/RDMA: DISCONNECT event - dropping connection "
 			"%u.%u.%u.%u->%u.%u.%u.%u\n", NIPQUAD(conn->c_laddr),
 			 NIPQUAD(conn->c_faddr));
 		rds_conn_drop(conn);
@@ -138,7 +138,7 @@ static int __init rds_rdma_listen_init(void)
 	cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP);
 	if (IS_ERR(cm_id)) {
 		ret = PTR_ERR(cm_id);
-		printk(KERN_ERR "RDS/IW: failed to setup listener, "
+		printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
 		       "rdma_create_id() returned %d\n", ret);
 		goto out;
 	}
@@ -153,14 +153,14 @@ static int __init rds_rdma_listen_init(void)
 	 */
 	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
 	if (ret) {
-		printk(KERN_ERR "RDS/IW: failed to setup listener, "
+		printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
 		       "rdma_bind_addr() returned %d\n", ret);
 		goto out;
 	}
 
 	ret = rdma_listen(cm_id, 128);
 	if (ret) {
-		printk(KERN_ERR "RDS/IW: failed to setup listener, "
+		printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
 		       "rdma_listen() returned %d\n", ret);
 		goto out;
 	}