From: Doug Ledford <dledford@redhat.com> Date: Mon, 15 Jun 2009 09:52:45 -0400 Subject: [infiniband] RDS: Update to ofed 1.4.1 final bits Message-id: 7ecbc9e071f8791cb81f38f35591bbb05ad208fc.1245072810.git.dledford@redhat.com O-Subject: [Patch RHEL5.4 10/16] [RDS] Update to ofed 1.4.1 final bits Bugzilla: 506097 Signed-off-by: Doug Ledford <dledford@redhat.com> diff --git a/net/rds/connection.c b/net/rds/connection.c index ab68faf..a26df1c 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -157,7 +157,6 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, memset(conn, 0, sizeof(*conn)); INIT_HLIST_NODE(&conn->c_hash_node); - conn->c_version = RDS_PROTOCOL_3_0; conn->c_laddr = laddr; conn->c_faddr = faddr; spin_lock_init(&conn->c_lock); diff --git a/net/rds/ib.c b/net/rds/ib.c index d0e24d9..dd99b44 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -43,11 +43,14 @@ unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE; unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */ +unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; module_param(fmr_pool_size, int, 0444); MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA"); module_param(fmr_message_size, int, 0444); MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer"); +module_param(rds_ib_retry_count, int, 0444); +MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); struct list_head rds_ib_devices; @@ -224,8 +227,8 @@ static int rds_ib_laddr_check(__be32 addr) * IB and iWARP capable NICs. 
*/ cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); - if (!cm_id) - return -EADDRNOTAVAIL; + if (IS_ERR(cm_id)) + return PTR_ERR(cm_id); memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; diff --git a/net/rds/ib.h b/net/rds/ib.h index 5b5d41b..1aca6d5 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -15,6 +15,8 @@ #define RDS_IB_DEFAULT_RECV_WR 1024 #define RDS_IB_DEFAULT_SEND_WR 256 +#define RDS_IB_DEFAULT_RETRY_COUNT 2 + #define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ extern struct list_head rds_ib_devices; @@ -247,6 +249,7 @@ extern struct ib_client rds_ib_client; extern unsigned int fmr_pool_size; extern unsigned int fmr_message_size; +extern unsigned int rds_ib_retry_count; extern spinlock_t ib_nodev_conns_lock; extern struct list_head ib_nodev_conns; @@ -355,17 +358,25 @@ extern ctl_table rds_ib_sysctl_table[]; /* * Helper functions for getting/setting the header and data SGEs in * RDS packets (not RDMA) + * + * From version 3.1 onwards, header is in front of data in the sge. 
*/ static inline struct ib_sge * rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge) { - return &sge[0]; + if (ic->conn->c_version > RDS_PROTOCOL_3_0) + return &sge[0]; + else + return &sge[1]; } static inline struct ib_sge * rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge) { - return &sge[1]; + if (ic->conn->c_version > RDS_PROTOCOL_3_0) + return &sge[1]; + else + return &sge[0]; } #endif diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 73285e6..b376bd1 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -98,21 +98,34 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even struct ib_qp_attr qp_attr; int err; - if (event->param.conn.private_data_len) { + if (event->param.conn.private_data_len >= sizeof(*dp)) { dp = event->param.conn.private_data; - rds_ib_set_protocol(conn, + /* make sure it isn't empty data */ + if (dp->dp_protocol_major) { + rds_ib_set_protocol(conn, RDS_PROTOCOL(dp->dp_protocol_major, - dp->dp_protocol_minor)); - rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + dp->dp_protocol_minor)); + rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + } } printk(KERN_NOTICE "RDS/IB: connected to %u.%u.%u.%u version %u.%u%s\n", - NIPQUAD(conn->c_laddr), + NIPQUAD(conn->c_faddr), RDS_PROTOCOL_MAJOR(conn->c_version), RDS_PROTOCOL_MINOR(conn->c_version), ic->i_flowctl ? ", flow control" : ""); + /* + * Init rings and fill recv. this needs to wait until protocol negotiation + * is complete, since ring layout is different from 3.0 to 3.1. + */ + rds_ib_send_init_ring(ic); + rds_ib_recv_init_ring(ic); + /* Post receive buffers - as a side effect, this will update + * the posted credit count. */ + rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1); + /* Tune RNR behavior */ rds_ib_tune_rnr(ic, &qp_attr); @@ -145,7 +158,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, /* XXX tune these? 
*/ conn_param->responder_resources = 1; conn_param->initiator_depth = 1; - conn_param->retry_count = 7; + conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7); conn_param->rnr_retry_count = 7; if (dp) { @@ -190,9 +203,10 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data) rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); break; default: - printk(KERN_WARNING "RDS/ib: unhandled QP event %u " - "on connection to %u.%u.%u.%u\n", event->event, - NIPQUAD(conn->c_faddr)); + rds_ib_conn_error(conn, "RDS/IB: Fatal QP Event %u " + "- connection %u.%u.%u.%u->%u.%u.%u.%u...reconnecting\n", + event->event, NIPQUAD(conn->c_laddr), + NIPQUAD(conn->c_faddr)); break; } } @@ -321,7 +335,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) rdsdebug("send allocation failed\n"); goto out; } - rds_ib_send_init_ring(ic); + memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work)); if (ic->i_recvs == NULL) { @@ -329,14 +343,10 @@ static int rds_ib_setup_qp(struct rds_connection *conn) rdsdebug("recv allocation failed\n"); goto out; } + memset(ic->i_recvs, 0, ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work)); - rds_ib_recv_init_ring(ic); rds_ib_recv_init_ack(ic); - /* Post receive buffers - as a side effect, this will update - * the posted credit count. 
*/ - rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1); - rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr, ic->i_send_cq, ic->i_recv_cq); @@ -344,19 +354,32 @@ out: return ret; } -static u32 rds_ib_protocol_compatible(const struct rds_ib_connect_private *dp) +static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event) { + const struct rds_ib_connect_private *dp = event->param.conn.private_data; u16 common; u32 version = 0; - /* rdma_cm private data is odd - when there is any private data in the + /* + * rdma_cm private data is odd - when there is any private data in the * request, we will be given a pretty large buffer without telling us the * original size. The only way to tell the difference is by looking at * the contents, which are initialized to zero. * If the protocol version fields aren't set, this is a connection attempt * from an older version. This could could be 3.0 or 2.0 - we can't tell. - * We really should have changed this for OFED 1.3 :-( */ - if (dp->dp_protocol_major == 0) + * We really should have changed this for OFED 1.3 :-( + */ + + /* Be paranoid. RDS always has privdata */ + if (!event->param.conn.private_data_len) { + printk(KERN_NOTICE "RDS incoming connection has no private data, " + "rejecting\n"); + return 0; + } + + /* Even if len is crap *now* I still want to check it. -ASG */ + if (event->param.conn.private_data_len < sizeof (*dp) + || dp->dp_protocol_major == 0) return RDS_PROTOCOL_3_0; common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS; @@ -388,7 +411,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, int err, destroy = 1; /* Check whether the remote protocol version matches ours. 
*/ - version = rds_ib_protocol_compatible(dp); + version = rds_ib_protocol_compatible(event); if (!version) goto out; diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index a57c2fa..29ec8be 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -555,6 +555,47 @@ u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic) return rds_ib_get_ack(ic); } +static struct rds_header *rds_ib_get_header(struct rds_connection *conn, + struct rds_ib_recv_work *recv, + u32 data_len) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + void *hdr_buff = &ic->i_recv_hdrs[recv - ic->i_recvs]; + void *addr; + u32 misplaced_hdr_bytes; + + /* + * Support header at the front (RDS 3.1+) as well as header-at-end. + * + * Cases: + * 1) header all in header buff (great!) + * 2) header all in data page (copy all to header buff) + * 3) header split across hdr buf + data page + * (move header tail in hdr buff to its final offset, then copy header start from data page) + */ + if (conn->c_version > RDS_PROTOCOL_3_0 || data_len == RDS_FRAG_SIZE) + return hdr_buff; + + if (data_len <= (RDS_FRAG_SIZE - sizeof(struct rds_header))) { + addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0); + memcpy(hdr_buff, + addr + recv->r_frag->f_offset + data_len, + sizeof(struct rds_header)); + kunmap_atomic(addr, KM_SOFTIRQ0); + return hdr_buff; + } + + misplaced_hdr_bytes = (sizeof(struct rds_header) - (RDS_FRAG_SIZE - data_len)); + + memmove(hdr_buff + (sizeof(struct rds_header) - misplaced_hdr_bytes), hdr_buff, misplaced_hdr_bytes); + + addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0); + memcpy(hdr_buff, addr + recv->r_frag->f_offset + data_len, + sizeof(struct rds_header) - misplaced_hdr_bytes); + kunmap_atomic(addr, KM_SOFTIRQ0); + return hdr_buff; +} + /* * It's kind of lame that we're copying from the posted receive pages into * long-lived bitmaps.
We could have posted the bitmaps and rdma written into @@ -645,7 +686,7 @@ struct rds_ib_ack_state { }; static void rds_ib_process_recv(struct rds_connection *conn, - struct rds_ib_recv_work *recv, u32 byte_len, + struct rds_ib_recv_work *recv, u32 data_len, struct rds_ib_ack_state *state) { struct rds_ib_connection *ic = conn->c_transport_data; @@ -655,9 +696,9 @@ static void rds_ib_process_recv(struct rds_connection *conn, /* XXX shut down the connection if port 0,0 are seen? */ rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv, - byte_len); + data_len); - if (byte_len < sizeof(struct rds_header)) { + if (data_len < sizeof(struct rds_header)) { rds_ib_conn_error(conn, "incoming message " "from %u.%u.%u.%u didn't inclue a " "header, disconnecting and " @@ -665,9 +706,9 @@ static void rds_ib_process_recv(struct rds_connection *conn, NIPQUAD(conn->c_faddr)); return; } - byte_len -= sizeof(struct rds_header); + data_len -= sizeof(struct rds_header); - ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs]; + ihdr = rds_ib_get_header(conn, recv, data_len); /* Validate the checksum. */ if (!rds_message_verify_checksum(ihdr)) { @@ -687,7 +728,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, if (ihdr->h_credit) rds_ib_send_add_credits(conn, ihdr->h_credit); - if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) { + if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) { /* This is an ACK-only packet. The fact that it gets * special treatment here is that historically, ACKs * were rather special beasts. diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c index d87830d..84b5ffc 100644 --- a/net/rds/ib_sysctl.c +++ b/net/rds/ib_sysctl.c @@ -53,7 +53,17 @@ unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20); static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1; static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL; -unsigned int rds_ib_sysctl_flow_control = 1; +/* + * This sysctl does nothing. 
+ * + * Backwards compatibility with RDS 3.0 wire protocol + * disables initial FC credit exchange. + * If it's ever possible to drop 3.0 support, + * setting this to 1 and moving init/refill of send/recv + * rings from ib_cm_connect_complete() back into ib_setup_qp() + * will cause credits to be added before protocol negotiation. + */ +unsigned int rds_ib_sysctl_flow_control = 0; ctl_table rds_ib_sysctl_table[] = { { diff --git a/net/rds/iw.c b/net/rds/iw.c index ba80245..23f6e31 100644 --- a/net/rds/iw.c +++ b/net/rds/iw.c @@ -233,8 +233,8 @@ static int rds_iw_laddr_check(__be32 addr) * IB and iWARP capable NICs. */ cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); - if (!cm_id) - return -EADDRNOTAVAIL; + if (IS_ERR(cm_id)) + return PTR_ERR(cm_id); memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index 979175f..418f7d5 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -101,7 +101,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, break; case RDMA_CM_EVENT_DISCONNECTED: - printk(KERN_WARNING "RDS/IW: DISCONNECT event - dropping connection " + printk(KERN_WARNING "RDS/RDMA: DISCONNECT event - dropping connection " "%u.%u.%u.%u->%u.%u.%u.%u\n", NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr)); rds_conn_drop(conn); @@ -138,7 +138,7 @@ static int __init rds_rdma_listen_init(void) cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP); if (IS_ERR(cm_id)) { ret = PTR_ERR(cm_id); - printk(KERN_ERR "RDS/IW: failed to setup listener, " + printk(KERN_ERR "RDS/RDMA: failed to setup listener, " "rdma_create_id() returned %d\n", ret); goto out; } @@ -153,14 +153,14 @@ static int __init rds_rdma_listen_init(void) */ ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); if (ret) { - printk(KERN_ERR "RDS/IW: failed to setup listener, " + printk(KERN_ERR "RDS/RDMA: failed to setup listener, " "rdma_bind_addr() returned %d\n", ret); goto out; } ret = 
rdma_listen(cm_id, 128); if (ret) { - printk(KERN_ERR "RDS/IW: failed to setup listener, " + printk(KERN_ERR "RDS/RDMA: failed to setup listener, " "rdma_listen() returned %d\n", ret); goto out; }