From kernel-2.6.18-194.11.1.el5.src.rpm: patch adding the Sockets Direct Protocol (SDP) InfiniBand ULP under drivers/infiniband/ulp/sdp.

diff -uprN linux-2.6.17.noarch.orig/drivers/infiniband/Kconfig linux-2.6.17.noarch.sdp/drivers/infiniband/Kconfig
--- linux-2.6.17.noarch.orig/drivers/infiniband/Kconfig	2006-07-25 17:32:46.000000000 -0400
+++ linux-2.6.17.noarch.sdp/drivers/infiniband/Kconfig	2006-08-30 16:13:20.000000000 -0400
@@ -39,6 +39,8 @@ config INFINIBAND_USER_ACCESS
 
 source "drivers/infiniband/ulp/ipoib/Kconfig"
 
+source "drivers/infiniband/ulp/sdp/Kconfig"
+
 source "drivers/infiniband/ulp/srp/Kconfig"
 
 source "drivers/infiniband/ulp/iser/Kconfig"
diff -uprN linux-2.6.17.noarch.orig/drivers/infiniband/Makefile linux-2.6.17.noarch.sdp/drivers/infiniband/Makefile
--- linux-2.6.17.noarch.orig/drivers/infiniband/Makefile	2006-07-25 17:32:49.000000000 -0400
+++ linux-2.6.17.noarch.sdp/drivers/infiniband/Makefile	2006-08-30 16:13:28.000000000 -0400
@@ -2,5 +2,6 @@ obj-$(CONFIG_INFINIBAND)		+= core/
 obj-$(CONFIG_INFINIBAND_MTHCA)		+= hw/mthca/
 obj-$(CONFIG_IPATH_CORE)		+= hw/ipath/
 obj-$(CONFIG_INFINIBAND_IPOIB)		+= ulp/ipoib/
+obj-$(CONFIG_INFINIBAND_SDP)		+= ulp/sdp/
 obj-$(CONFIG_INFINIBAND_SRP)		+= ulp/srp/
 obj-$(CONFIG_INFINIBAND_ISER)		+= ulp/iser/
diff -uprN linux-2.6.17.noarch.orig/drivers/infiniband/ulp/sdp/Kconfig linux-2.6.17.noarch.sdp/drivers/infiniband/ulp/sdp/Kconfig
--- linux-2.6.17.noarch.orig/drivers/infiniband/ulp/sdp/Kconfig	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.17.noarch.sdp/drivers/infiniband/ulp/sdp/Kconfig	2006-07-25 22:31:03.000000000 -0400
@@ -0,0 +1,28 @@
+config INFINIBAND_SDP
+	tristate "Sockets Direct Protocol"
+	depends on INFINIBAND && INFINIBAND_IPOIB
+	---help---
+	  Support for Sockets Direct Protocol (SDP).  This provides
+          sockets semantics over InfiniBand via address family
+          AF_INET_SDP (address family 27).  You can also LD_PRELOAD the
+          libsdp library from <http://openib.org> to have standard
+          sockets applications use SDP.
+
+config INFINIBAND_SDP_DEBUG
+	bool "Sockets Direct Protocol debugging"
+	depends on INFINIBAND_SDP
+	---help---
+	  This option causes debugging code to be compiled into the
+	  SDP driver.  The output can be turned on via the debug_level
+	  module parameter (which can also be set through sysfs after the
+	  driver is loaded).
+
+config INFINIBAND_SDP_DEBUG_DATA
+        bool "Sockets Direct Protocol data path debugging"
+        depends on INFINIBAND_SDP_DEBUG
+        ---help---
+          This option compiles debugging code into the data path
+          of the SDP driver.  The output can be turned on via the
+          data_debug_level module parameter; however, even with output
+          turned off, this debugging code will have some performance
+          impact.
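
Editor's note: the Kconfig help above says applications can reach SDP either through libsdp via LD_PRELOAD or by opening sockets with address family 27 (AF_INET_SDP) directly. The following is a minimal userspace sketch, not part of the patch, showing the direct approach: the value 27 is taken from the help text, the requirement that the peer sockaddr use AF_INET comes from sdp_connect() later in this patch, and the peer address and port are placeholders.

/* Hypothetical client: SOCK_STREAM over SDP instead of TCP. */
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef AF_INET_SDP
#define AF_INET_SDP 27          /* address family from the Kconfig help text */
#endif

int main(void)
{
        int fd = socket(AF_INET_SDP, SOCK_STREAM, 0);
        if (fd < 0) {
                perror("socket(AF_INET_SDP)");
                return 1;
        }

        /* sdp_connect() in this patch expects sa_family == AF_INET. */
        struct sockaddr_in dst;
        memset(&dst, 0, sizeof(dst));
        dst.sin_family = AF_INET;
        dst.sin_port = htons(5000);                       /* placeholder port */
        inet_pton(AF_INET, "192.168.0.1", &dst.sin_addr); /* placeholder peer */

        if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
                perror("connect");
        return 0;
}

With libsdp and LD_PRELOAD, unmodified sockets applications get the same effect without the explicit AF_INET_SDP socket() call.
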
diff -uprN linux-2.6.17.noarch.orig/drivers/infiniband/ulp/sdp/Makefile linux-2.6.17.noarch.sdp/drivers/infiniband/ulp/sdp/Makefile
--- linux-2.6.17.noarch.orig/drivers/infiniband/ulp/sdp/Makefile	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.17.noarch.sdp/drivers/infiniband/ulp/sdp/Makefile	2006-07-25 22:31:03.000000000 -0400
@@ -0,0 +1,6 @@
+EXTRA_CFLAGS += -Idrivers/infiniband/include
+EXTRA_CFLAGS += -ggdb
+
+obj-$(CONFIG_INFINIBAND_SDP) += ib_sdp.o
+
+ib_sdp-objs := sdp_main.o sdp_cma.o sdp_bcopy.o
diff -uprN linux-2.6.17.noarch.orig/drivers/infiniband/ulp/sdp/sdp_bcopy.c linux-2.6.17.noarch.sdp/drivers/infiniband/ulp/sdp/sdp_bcopy.c
--- linux-2.6.17.noarch.orig/drivers/infiniband/ulp/sdp/sdp_bcopy.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.17.noarch.sdp/drivers/infiniband/ulp/sdp/sdp_bcopy.c	2006-07-25 22:31:03.000000000 -0400
@@ -0,0 +1,469 @@
+/*
+ * Copyright (c) 2006 Mellanox Technologies Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+#include <linux/interrupt.h>
+#include <linux/dma-mapping.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include "sdp.h"
+
+/* Like tcp_fin */
+static void sdp_fin(struct sock *sk)
+{
+	sdp_dbg(sk, "%s\n", __func__);
+
+	sk->sk_shutdown |= RCV_SHUTDOWN;
+	sock_set_flag(sk, SOCK_DONE);
+
+
+	sk_stream_mem_reclaim(sk);
+
+	if (!sock_flag(sk, SOCK_DEAD)) {
+		sk->sk_state_change(sk);
+
+		/* Do not send POLL_HUP for half duplex close. */
+		if (sk->sk_shutdown == SHUTDOWN_MASK ||
+		    sk->sk_state == TCP_CLOSE)
+			sk_wake_async(sk, 1, POLL_HUP);
+		else
+			sk_wake_async(sk, 1, POLL_IN);
+	}
+}
+
+void sdp_post_send(struct sdp_sock *ssk, struct sk_buff *skb, u8 mid)
+{
+	struct sdp_buf *tx_req;
+	struct sdp_bsdh *h = (struct sdp_bsdh *)skb_push(skb, sizeof *h);
+	unsigned mseq = ssk->tx_head;
+	int i, rc, frags;
+	dma_addr_t addr;
+	struct device *hwdev;
+	struct ib_sge *sge;
+	struct ib_send_wr *bad_wr;
+
+	h->mid = mid;
+	h->flags = 0; /* TODO: OOB */
+	h->bufs = htons(ssk->rx_head - ssk->rx_tail);
+	h->len = htonl(skb->len);
+	h->mseq = htonl(mseq);
+	h->mseq_ack = htonl(ssk->mseq_ack);
+
+	tx_req = &ssk->tx_ring[mseq & (SDP_TX_SIZE - 1)];
+	tx_req->skb = skb;
+	hwdev = ssk->dma_device;
+	sge = ssk->ibsge;
+	addr = dma_map_single(hwdev,
+			      skb->data, skb->len - skb->data_len,
+			      DMA_TO_DEVICE);
+	tx_req->mapping[0] = addr;
+	
+	/* TODO: proper error handling */
+	BUG_ON(dma_mapping_error(addr));
+
+	sge->addr = (u64)addr;
+	sge->length = skb->len - skb->data_len;
+	sge->lkey = ssk->mr->lkey;
+	frags = skb_shinfo(skb)->nr_frags;
+	for (i = 0; i < frags; ++i) {
+		++sge;
+		addr = dma_map_page(hwdev, skb_shinfo(skb)->frags[i].page,
+				    skb_shinfo(skb)->frags[i].page_offset,
+				    skb_shinfo(skb)->frags[i].size,
+				    DMA_TO_DEVICE);
+		BUG_ON(dma_mapping_error(addr));
+		tx_req->mapping[i + 1] = addr;
+		sge->addr = addr;
+		sge->length = skb_shinfo(skb)->frags[i].size;
+		sge->lkey = ssk->mr->lkey;
+	}
+
+	ssk->tx_wr.next = NULL;
+	ssk->tx_wr.wr_id = ssk->tx_head;
+	ssk->tx_wr.sg_list = ssk->ibsge;
+	ssk->tx_wr.num_sge = frags + 1;
+	ssk->tx_wr.opcode = IB_WR_SEND;
+	ssk->tx_wr.send_flags = IB_SEND_SIGNALED;
+	if (unlikely(mid != SDP_MID_DATA))
+		ssk->tx_wr.send_flags |= IB_SEND_SOLICITED;
+	rc = ib_post_send(ssk->qp, &ssk->tx_wr, &bad_wr);
+	BUG_ON(rc);
+	++ssk->tx_head;
+	--ssk->bufs;
+	ssk->remote_credits = ssk->rx_head - ssk->rx_tail;
+}
+
+struct sk_buff *sdp_send_completion(struct sdp_sock *ssk, int mseq)
+{
+	struct device *hwdev;
+	struct sdp_buf *tx_req;
+	struct sk_buff *skb;
+	int i, frags;
+
+	if (unlikely(mseq != ssk->tx_tail)) {
+		printk(KERN_WARNING "Bogus send completion id %d tail %d\n",
+			mseq, ssk->tx_tail);
+		return NULL;
+	}
+
+	hwdev = ssk->dma_device;
+        tx_req = &ssk->tx_ring[mseq & (SDP_TX_SIZE - 1)];
+	skb = tx_req->skb;
+	dma_unmap_single(hwdev, tx_req->mapping[0], skb->len - skb->data_len,
+			 DMA_TO_DEVICE);
+	frags = skb_shinfo(skb)->nr_frags;
+	for (i = 0; i < frags; ++i) {
+		dma_unmap_page(hwdev, tx_req->mapping[i + 1],
+			       skb_shinfo(skb)->frags[i].size,
+			       DMA_TO_DEVICE);
+	}
+
+	++ssk->tx_tail;
+	return skb;
+}
+
+
+static void sdp_post_recv(struct sdp_sock *ssk)
+{
+	struct sdp_buf *rx_req;
+	int i, rc, frags;
+	dma_addr_t addr;
+	struct device *hwdev;
+	struct ib_sge *sge;
+	struct ib_recv_wr *bad_wr;
+	struct sk_buff *skb;
+	struct page *page;
+	skb_frag_t *frag;
+	struct sdp_bsdh *h;
+	int id = ssk->rx_head;
+
+	/* Now, allocate and repost recv */
+	/* TODO: allocate from cache */
+	skb = sk_stream_alloc_skb(&ssk->isk.sk, sizeof(struct sdp_bsdh),
+				  GFP_KERNEL);
+	/* FIXME */
+	BUG_ON(!skb);
+	h = (struct sdp_bsdh *)skb_push(skb, sizeof *h);
+	for (i = 0; i < SDP_MAX_SEND_SKB_FRAGS; ++i) {
+		page = alloc_pages(GFP_KERNEL, 0);
+		BUG_ON(!page);
+		frag = &skb_shinfo(skb)->frags[i];
+		frag->page                = page;
+		frag->page_offset         = 0;
+		frag->size                = PAGE_SIZE;
+		++skb_shinfo(skb)->nr_frags;
+		skb->len += PAGE_SIZE;
+		skb->data_len += PAGE_SIZE;
+		skb->truesize += PAGE_SIZE;
+	}
+
+        rx_req = ssk->rx_ring + (id & (SDP_RX_SIZE - 1));
+	rx_req->skb = skb;
+	hwdev = ssk->dma_device;
+	sge = ssk->ibsge;
+	addr = dma_map_single(hwdev, h, skb_headlen(skb),
+			      DMA_FROM_DEVICE);
+	BUG_ON(dma_mapping_error(addr));
+
+	rx_req->mapping[0] = addr;
+	
+	/* TODO: proper error handling */
+	sge->addr = (u64)addr;
+	sge->length = skb_headlen(skb);
+	sge->lkey = ssk->mr->lkey;
+	frags = skb_shinfo(skb)->nr_frags;
+	for (i = 0; i < frags; ++i) {
+		++sge;
+		addr = dma_map_page(hwdev, skb_shinfo(skb)->frags[i].page,
+				    skb_shinfo(skb)->frags[i].page_offset,
+				    skb_shinfo(skb)->frags[i].size,
+				    DMA_FROM_DEVICE);
+		BUG_ON(dma_mapping_error(addr));
+		rx_req->mapping[i + 1] = addr;
+		sge->addr = addr;
+		sge->length = skb_shinfo(skb)->frags[i].size;
+		sge->lkey = ssk->mr->lkey;
+	}
+
+	ssk->rx_wr.next = NULL;
+	ssk->rx_wr.wr_id = id | SDP_OP_RECV;
+	ssk->rx_wr.sg_list = ssk->ibsge;
+	ssk->rx_wr.num_sge = frags + 1;
+	rc = ib_post_recv(ssk->qp, &ssk->rx_wr, &bad_wr);
+	/* TODO */
+	BUG_ON(rc);
+	++ssk->rx_head;
+}
+
+void sdp_post_recvs(struct sdp_sock *ssk)
+{
+	int rmem = atomic_read(&ssk->isk.sk.sk_rmem_alloc);
+
+	if (unlikely(!ssk->id))
+		return;
+
+	while ((likely(ssk->rx_head - ssk->rx_tail < SDP_RX_SIZE) &&
+		(ssk->rx_head - ssk->rx_tail - SDP_MIN_BUFS) *
+		SDP_MAX_SEND_SKB_FRAGS * PAGE_SIZE + rmem <
+		ssk->isk.sk.sk_rcvbuf * 0x10) ||
+	       unlikely(ssk->rx_head - ssk->rx_tail < SDP_MIN_BUFS))
+		sdp_post_recv(ssk);
+}
+
+struct sk_buff *sdp_recv_completion(struct sdp_sock *ssk, int id)
+{
+	struct sdp_buf *rx_req;
+	struct device *hwdev;
+	struct sk_buff *skb;
+	int i, frags;
+
+	if (unlikely(id != ssk->rx_tail)) {
+		printk(KERN_WARNING "Bogus recv completion id %d tail %d\n",
+			id, ssk->rx_tail);
+		return NULL;
+	}
+
+	hwdev = ssk->dma_device;
+        rx_req = &ssk->rx_ring[id & (SDP_RX_SIZE - 1)];
+	skb = rx_req->skb;
+	dma_unmap_single(hwdev, rx_req->mapping[0], skb_headlen(skb),
+			 DMA_FROM_DEVICE);
+	frags = skb_shinfo(skb)->nr_frags;
+	for (i = 0; i < frags; ++i)
+		dma_unmap_page(hwdev, rx_req->mapping[i + 1],
+			       skb_shinfo(skb)->frags[i].size,
+			       DMA_TO_DEVICE);
+	++ssk->rx_tail;
+	--ssk->remote_credits;
+	return skb;
+}
+
+/* Here because I do not want queue to fail. */
+static inline int sdp_sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	int skb_len;
+
+	skb_set_owner_r(skb, sk);
+
+	skb_len = skb->len;
+
+	skb_queue_tail(&sk->sk_receive_queue, skb);
+
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk->sk_data_ready(sk, skb_len);
+	return 0;
+}
+
+static inline void update_send_head(struct sock *sk, struct sk_buff *skb)
+{
+	sk->sk_send_head = skb->next;
+	if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
+		sk->sk_send_head = NULL;
+}
+
+void sdp_post_sends(struct sdp_sock *ssk, int nonagle)
+{
+	/* TODO: nonagle */
+	struct sk_buff *skb;
+	int c;
+
+	if (unlikely(!ssk->id))
+		return;
+
+	while (ssk->bufs > SDP_MIN_BUFS &&
+	       ssk->tx_head - ssk->tx_tail < SDP_TX_SIZE &&
+	       (skb = ssk->isk.sk.sk_send_head)) {
+		update_send_head(&ssk->isk.sk, skb);
+		__skb_dequeue(&ssk->isk.sk.sk_write_queue);
+		sdp_post_send(ssk, skb, SDP_MID_DATA);
+	}
+	c = ssk->remote_credits;
+	if (likely(c > SDP_MIN_BUFS))
+		c *= 2;
+
+	if (unlikely(c < ssk->rx_head - ssk->rx_tail) &&
+	    likely(ssk->bufs > 1) &&
+	    likely(ssk->tx_head - ssk->tx_tail < SDP_TX_SIZE)) {
+		skb = sk_stream_alloc_skb(&ssk->isk.sk,
+					  sizeof(struct sdp_bsdh),
+					  GFP_KERNEL);
+		/* FIXME */
+		BUG_ON(!skb);
+		sdp_post_send(ssk, skb, SDP_MID_DATA);
+	}
+
+	if (unlikely((1 << ssk->isk.sk.sk_state) &
+			(TCPF_FIN_WAIT1 | TCPF_LAST_ACK)) &&
+		!ssk->isk.sk.sk_send_head &&
+		ssk->bufs) {
+		skb = sk_stream_alloc_skb(&ssk->isk.sk,
+					  sizeof(struct sdp_bsdh),
+					  GFP_KERNEL);
+		/* FIXME */
+		BUG_ON(!skb);
+		sdp_post_send(ssk, skb, SDP_MID_DISCONN);
+		if (ssk->isk.sk.sk_state == TCP_FIN_WAIT1)
+			ssk->isk.sk.sk_state = TCP_FIN_WAIT2;
+		else
+			ssk->isk.sk.sk_state = TCP_CLOSING;
+	}
+}
+
+static void sdp_handle_wc(struct sdp_sock *ssk, struct ib_wc *wc)
+{
+	struct sk_buff *skb;
+	struct sdp_bsdh *h;
+
+	if (wc->wr_id & SDP_OP_RECV) {
+		skb = sdp_recv_completion(ssk, wc->wr_id);
+		if (unlikely(!skb))
+			return;
+
+		if (unlikely(wc->status)) {
+			if (wc->status != IB_WC_WR_FLUSH_ERR)
+				sdp_dbg(&ssk->isk.sk,
+					"Recv completion with error. "
+					"Status %d\n", wc->status);
+			__kfree_skb(skb);
+			sdp_set_error(&ssk->isk.sk, -ECONNRESET);
+			wake_up(&ssk->wq);
+		} else {
+			/* TODO: handle msg < bsdh */
+			sdp_dbg(&ssk->isk.sk,
+				"Recv completion. ID %d Length %d\n",
+				(int)wc->wr_id, wc->byte_len);
+			skb->len = wc->byte_len;
+			skb->data_len = wc->byte_len - sizeof(struct sdp_bsdh);
+			if (unlikely(skb->data_len < 0)) {
+				printk("SDP: FIXME len %d\n", wc->byte_len);
+			}
+			h = (struct sdp_bsdh *)skb->data;
+			skb->h.raw = skb->data;
+			ssk->mseq_ack = ntohl(h->mseq);
+			if (ssk->mseq_ack != (int)wc->wr_id)
+				printk("SDP BUG! mseq %d != wrid %d\n",
+						ssk->mseq_ack, (int)wc->wr_id);
+			ssk->bufs = ntohl(h->mseq_ack) - ssk->tx_head + 1 +
+				ntohs(h->bufs);
+
+			if (likely(h->mid == SDP_MID_DATA) &&
+			    likely(skb->data_len > 0)) {
+				skb_pull(skb, sizeof(struct sdp_bsdh));
+				/* TODO: queue can fail? */
+				/* TODO: free unused fragments */
+				sdp_sock_queue_rcv_skb(&ssk->isk.sk, skb);
+			} else if (likely(h->mid == SDP_MID_DATA)) {
+				__kfree_skb(skb);
+			} else if (h->mid == SDP_MID_DISCONN) {
+				skb_pull(skb, sizeof(struct sdp_bsdh));
+				/* TODO: free unused fragments */
+				/* this will wake recvmsg */
+				sdp_sock_queue_rcv_skb(&ssk->isk.sk, skb);
+				sdp_fin(&ssk->isk.sk);
+			} else {
+				/* TODO: Handle other messages */
+				printk("SDP: FIXME MID %d\n", h->mid);
+				__kfree_skb(skb);
+			}
+			sdp_post_recvs(ssk);
+		}
+	} else {
+		skb = sdp_send_completion(ssk, wc->wr_id);
+		if (unlikely(!skb))
+			return;
+		sk_stream_free_skb(&ssk->isk.sk, skb);
+		if (unlikely(wc->status)) {
+			if (wc->status != IB_WC_WR_FLUSH_ERR)
+				sdp_dbg(&ssk->isk.sk,
+					"Send completion with error. "
+					"Status %d\n", wc->status);
+			sdp_set_error(&ssk->isk.sk, -ECONNRESET);
+			wake_up(&ssk->wq);
+		}
+
+		sk_stream_write_space(&ssk->isk.sk);
+	}
+
+	if (likely(!wc->status)) {
+		sdp_post_recvs(ssk);
+		sdp_post_sends(ssk, 0);
+	}
+
+	if (ssk->time_wait && !ssk->isk.sk.sk_send_head &&
+	    ssk->tx_head == ssk->tx_tail) {
+		ssk->time_wait = 0;
+		ssk->isk.sk.sk_state = TCP_CLOSE;
+		sdp_dbg(&ssk->isk.sk, "%s: destroy in time wait state\n",
+			__func__);
+		queue_work(sdp_workqueue, &ssk->destroy_work);
+	}
+}
+
+void sdp_completion_handler(struct ib_cq *cq, void *cq_context)
+{
+	struct sock *sk = cq_context;
+	struct sdp_sock *ssk = sdp_sk(sk);
+	schedule_work(&ssk->work);
+}
+
+void sdp_work(void *data)
+{
+	struct sock *sk = (struct sock *)data;
+	struct sdp_sock *ssk = sdp_sk(sk);
+	struct ib_cq *cq;
+	int n, i;
+
+	sdp_dbg(sk, "%s\n", __func__);
+
+	cq = ssk->cq;
+	if (unlikely(!cq))
+		return;
+
+	do {
+		lock_sock(sk);
+		n = ib_poll_cq(cq, SDP_NUM_WC, ssk->ibwc);
+		for (i = 0; i < n; ++i) {
+			sdp_handle_wc(ssk, ssk->ibwc + i);
+		}
+		release_sock(sk);
+	} while (n == SDP_NUM_WC);
+	sk_stream_mem_reclaim(sk);
+	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+	do {
+		lock_sock(sk);
+		n = ib_poll_cq(cq, SDP_NUM_WC, ssk->ibwc);
+		for (i = 0; i < n; ++i) {
+			sdp_handle_wc(ssk, ssk->ibwc + i);
+		}
+		release_sock(sk);
+	} while (n == SDP_NUM_WC);
+}
diff -uprN linux-2.6.17.noarch.orig/drivers/infiniband/ulp/sdp/sdp_cma.c linux-2.6.17.noarch.sdp/drivers/infiniband/ulp/sdp/sdp_cma.c
--- linux-2.6.17.noarch.orig/drivers/infiniband/ulp/sdp/sdp_cma.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.17.noarch.sdp/drivers/infiniband/ulp/sdp/sdp_cma.c	2006-07-25 22:31:03.000000000 -0400
@@ -0,0 +1,475 @@
+/*
+ * Copyright (c) 2006 Mellanox Technologies Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+#include <asm/semaphore.h>
+#include <linux/device.h>
+#include <linux/in.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/pci.h>
+#include <linux/time.h>
+#include <linux/workqueue.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include <net/tcp_states.h>
+#include "sdp_socket.h"
+#include "sdp.h"
+
+union cma_ip_addr {
+        struct in6_addr ip6;
+        struct {
+                __u32 pad[3];
+                __u32 addr;
+        } ip4;
+};
+
+#define SDP_MAJV_MINV 0x22
+
+/* TODO: too much? Can I avoid having the src/dst and port here? */
+struct sdp_hh {
+	struct sdp_bsdh bsdh;
+	u8 majv_minv;
+	u8 ipv_cap;
+	u8 rsvd1;
+	u8 max_adverts;
+	__u32 desremrcvsz;
+	__u32 localrcvsz;
+	__u16 port;
+	__u16 rsvd2;
+	union cma_ip_addr src_addr;
+	union cma_ip_addr dst_addr;
+};
+
+struct sdp_hah {
+	struct sdp_bsdh bsdh;
+	u8 majv_minv;
+	u8 ipv_cap;
+	u8 rsvd1;
+	u8 ext_max_adverts;
+	__u32 actrcvsz;
+};
+
+static void sdp_cq_event_handler(struct ib_event *event, void *data)
+{
+}
+
+static void sdp_qp_event_handler(struct ib_event *event, void *data)
+{
+}
+
+int sdp_init_qp(struct sock *sk, struct rdma_cm_id *id)
+{
+	struct ib_qp_init_attr qp_init_attr = {
+		.event_handler = sdp_qp_event_handler,
+		.cap.max_send_wr = SDP_TX_SIZE,
+		.cap.max_send_sge = SDP_MAX_SEND_SKB_FRAGS + 1, /* TODO */
+		.cap.max_recv_wr = SDP_RX_SIZE,
+		.cap.max_recv_sge = SDP_MAX_SEND_SKB_FRAGS + 1, /* TODO */
+        	.sq_sig_type = IB_SIGNAL_REQ_WR,
+        	.qp_type = IB_QPT_RC,
+	};
+	struct ib_device *device = id->device;
+	struct ib_cq *cq;
+	struct ib_pd *pd;
+	int rc;
+
+	sdp_dbg(sk, "%s\n", __func__);
+
+	sdp_sk(sk)->tx_ring = kmalloc(sizeof *sdp_sk(sk)->tx_ring * SDP_TX_SIZE,
+				      GFP_KERNEL);
+	if (!sdp_sk(sk)->tx_ring) {
+		rc = -ENOMEM;
+		sdp_warn(sk, "Unable to allocate TX Ring size %zd.\n",
+			 sizeof *sdp_sk(sk)->tx_ring * SDP_TX_SIZE);
+		goto err_tx;
+	}
+
+	sdp_sk(sk)->rx_ring = kmalloc(sizeof *sdp_sk(sk)->rx_ring * SDP_RX_SIZE,
+				      GFP_KERNEL);
+	if (!sdp_sk(sk)->rx_ring) {
+		rc = -ENOMEM;
+		sdp_warn(sk, "Unable to allocate RX Ring size %zd.\n",
+			 sizeof *sdp_sk(sk)->rx_ring * SDP_TX_SIZE);
+		goto err_rx;
+	}
+
+	pd = ib_alloc_pd(device);
+	if (IS_ERR(pd)) {
+		rc = PTR_ERR(pd);
+		sdp_warn(sk, "Unable to allocate PD: %d.\n", rc);
+		goto err_pd;
+	}
+	
+        sdp_sk(sk)->mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE);
+        if (IS_ERR(sdp_sk(sk)->mr)) {
+                rc = PTR_ERR(sdp_sk(sk)->mr);
+		sdp_warn(sk, "Unable to get dma MR: %d.\n", rc);
+                goto err_mr;
+        }
+
+	INIT_WORK(&sdp_sk(sk)->work, sdp_work, sdp_sk(sk));
+
+	cq = ib_create_cq(device, sdp_completion_handler, sdp_cq_event_handler,
+			  sk, SDP_TX_SIZE + SDP_RX_SIZE);
+
+	if (IS_ERR(cq)) {
+		rc = PTR_ERR(cq);
+		sdp_warn(sk, "Unable to allocate CQ: %d.\n", rc);
+		goto err_cq;
+	}
+
+        qp_init_attr.send_cq = qp_init_attr.recv_cq = cq;
+
+	rc = rdma_create_qp(id, pd, &qp_init_attr);
+	if (rc) {
+		sdp_warn(sk, "Unable to create QP: %d.\n", rc);
+		goto err_qp;
+	}
+	sdp_sk(sk)->cq = cq;
+	sdp_sk(sk)->qp = id->qp;
+	sdp_sk(sk)->dma_device = device->dma_device;
+
+	init_waitqueue_head(&sdp_sk(sk)->wq);
+
+	sdp_post_recvs(sdp_sk(sk));
+
+	sdp_dbg(sk, "%s done\n", __func__);
+	return 0;
+
+err_qp:
+	ib_destroy_cq(cq);
+err_cq:
+	ib_dereg_mr(sdp_sk(sk)->mr);
+err_mr:
+	ib_dealloc_pd(pd);
+err_pd:
+	kfree(sdp_sk(sk)->rx_ring);
+err_rx:
+	kfree(sdp_sk(sk)->tx_ring);
+err_tx:
+	return rc;
+}
+
+int sdp_connect_handler(struct sock *sk, struct rdma_cm_id *id,
+		       	struct rdma_cm_event *event)
+{
+	struct sockaddr_in *dst_addr;
+	struct sock *child;
+	struct sdp_hh *h;
+	int rc;
+
+	sdp_dbg(sk, "%s %p -> %p\n", __func__, sdp_sk(sk)->id, id);
+
+        child = sk_clone(sk, GFP_KERNEL);
+        if (!child) {
+                return -ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&sdp_sk(child)->accept_queue);
+	INIT_LIST_HEAD(&sdp_sk(child)->backlog_queue);
+	INIT_WORK(&sdp_sk(child)->time_wait_work, sdp_time_wait_work, child);
+	INIT_WORK(&sdp_sk(child)->destroy_work, sdp_destroy_work, child);
+
+	dst_addr = (struct sockaddr_in *)&id->route.addr.dst_addr;
+	inet_sk(child)->dport = dst_addr->sin_port;
+	inet_sk(child)->daddr = dst_addr->sin_addr.s_addr;
+
+	bh_unlock_sock(child);
+	__sock_put(child);
+
+	rc = sdp_init_qp(child, id);
+	if (rc) {
+		sk_common_release(child);
+		return rc;
+	}
+
+	h = event->private_data;
+	sdp_sk(child)->bufs = ntohs(h->bsdh.bufs);
+	sdp_sk(child)->xmit_size_goal = ntohl(h->localrcvsz) -
+		sizeof(struct sdp_bsdh);
+
+	sdp_dbg(child, "%s bufs %d xmit_size_goal %d\n", __func__,
+		sdp_sk(child)->bufs,
+		sdp_sk(child)->xmit_size_goal);
+
+	id->context = child;
+	sdp_sk(child)->id = id;
+
+	list_add_tail(&sdp_sk(child)->backlog_queue, &sdp_sk(sk)->backlog_queue);
+	sdp_sk(child)->parent = sk;
+
+	/* child->sk_write_space(child); */
+	/* child->sk_data_ready(child, 0); */
+	sk->sk_data_ready(sk, 0);
+
+	return 0;
+}
+
+static int sdp_response_handler(struct sock *sk, struct rdma_cm_event *event)
+{
+	struct sdp_hah *h;
+	sdp_dbg(sk, "%s\n", __func__);
+
+	sk->sk_state = TCP_ESTABLISHED;
+
+	/* TODO: If SOCK_KEEPOPEN set, need to reset and start
+	   keepalive timer here */
+
+	if (sock_flag(sk, SOCK_DEAD))
+		return 0;
+
+	h = event->private_data;
+	sdp_sk(sk)->bufs = ntohs(h->bsdh.bufs);
+	sdp_sk(sk)->xmit_size_goal = ntohl(h->actrcvsz) -
+		sizeof(struct sdp_bsdh);
+
+	sdp_dbg(sk, "%s bufs %d xmit_size_goal %d\n", __func__,
+		sdp_sk(sk)->bufs,
+		sdp_sk(sk)->xmit_size_goal);
+
+	ib_req_notify_cq(sdp_sk(sk)->cq, IB_CQ_NEXT_COMP);
+
+	sk->sk_state_change(sk);
+	sk_wake_async(sk, 0, POLL_OUT);
+	return 0;
+}
+
+int sdp_connected_handler(struct sock *sk, struct rdma_cm_event *event)
+{
+	struct sock *parent;
+	sdp_dbg(sk, "%s\n", __func__);
+
+	parent = sdp_sk(sk)->parent;
+	BUG_ON(!parent);
+
+	sk->sk_state = TCP_ESTABLISHED;
+
+	/* TODO: If SOCK_KEEPOPEN set, need to reset and start
+	   keepalive timer here */
+
+	if (sock_flag(sk, SOCK_DEAD))
+		return 0;
+
+	lock_sock(parent);
+#if 0
+	/* TODO: backlog */
+	if (sk_acceptq_is_full(parent)) {
+		sdp_dbg(parent, "%s ECONNREFUSED: parent accept queue full: %d > %d\n", __func__, parent->sk_ack_backlog, parent->sk_max_ack_backlog);
+		release_sock(parent);
+		return -ECONNREFUSED;
+	}
+#endif
+	sk_acceptq_added(parent);
+	sdp_dbg(parent, "%s child connection established\n", __func__);
+	list_del_init(&sdp_sk(sk)->backlog_queue);
+	list_add_tail(&sdp_sk(sk)->accept_queue, &sdp_sk(parent)->accept_queue);
+
+	parent->sk_state_change(parent);
+	sk_wake_async(parent, 0, POLL_OUT);
+	release_sock(parent);
+
+	return 0;
+}
+
+void sdp_disconnected_handler(struct sock *sk)
+{
+	sdp_dbg(sk, "%s\n", __func__);
+}
+
+int sdp_cma_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
+{
+	struct rdma_conn_param conn_param;
+	struct sock *parent = NULL;
+	struct sock *child = NULL;
+	struct sock *sk;
+	struct sdp_hah hah;
+	struct sdp_hh hh;
+	
+	int rc = 0;
+
+	sk = id->context;
+	if (!sk) {
+		sdp_dbg(NULL, "cm_id is being torn down, event %d\n",
+		       	event->event);
+		return event->event == RDMA_CM_EVENT_CONNECT_REQUEST ?
+			-EINVAL : 0;
+	}
+
+	lock_sock(sk);
+	sdp_dbg(sk, "%s event %d id %p\n", __func__, event->event, id);
+	if (!sdp_sk(sk)->id) {
+		sdp_dbg(sk, "socket is being torn down\n");
+		rc = event->event == RDMA_CM_EVENT_CONNECT_REQUEST ?
+			-EINVAL : 0;
+		release_sock(sk);
+		return rc;
+	}
+
+	switch (event->event) {
+	case RDMA_CM_EVENT_ADDR_RESOLVED:
+		sdp_dbg(sk, "RDMA_CM_EVENT_ADDR_RESOLVED\n");
+		rc = rdma_resolve_route(id, SDP_ROUTE_TIMEOUT);
+		break;
+	case RDMA_CM_EVENT_ADDR_ERROR:
+		sdp_dbg(sk, "RDMA_CM_EVENT_ADDR_ERROR\n");
+		rc = -ENETUNREACH;
+		break;
+	case RDMA_CM_EVENT_ROUTE_RESOLVED:
+		sdp_dbg(sk, "RDMA_CM_EVENT_ROUTE_RESOLVED : %p\n", id);
+		rc = sdp_init_qp(sk, id);
+		if (rc)
+			break;
+		sdp_sk(sk)->remote_credits = sdp_sk(sk)->rx_head -
+			sdp_sk(sk)->rx_tail;
+		memset(&hh, 0, sizeof hh);
+		hh.bsdh.mid = SDP_MID_HELLO;
+		hh.bsdh.bufs = htons(sdp_sk(sk)->remote_credits);
+		hh.majv_minv = SDP_MAJV_MINV;
+		hh.localrcvsz = hh.desremrcvsz = htonl(SDP_MAX_SEND_SKB_FRAGS *
+			PAGE_SIZE + sizeof(struct sdp_bsdh));
+		hh.max_adverts = 0x1;
+
+		memset(&conn_param, 0, sizeof conn_param);
+		conn_param.private_data_len = sizeof hh;
+		conn_param.private_data = &hh;
+		conn_param.responder_resources = 4 /* TODO */;
+		conn_param.initiator_depth = 4 /* TODO */;
+		conn_param.retry_count = SDP_RETRY_COUNT;
+		rc = rdma_connect(id, &conn_param);
+		break;
+	case RDMA_CM_EVENT_ROUTE_ERROR:
+		sdp_dbg(sk, "RDMA_CM_EVENT_ROUTE_ERROR : %p\n", id);
+		rc = -ETIMEDOUT;
+		break;
+	case RDMA_CM_EVENT_CONNECT_REQUEST:
+		sdp_dbg(sk, "RDMA_CM_EVENT_CONNECT_REQUEST\n");
+		rc = sdp_connect_handler(sk, id, event);
+		if (rc) {
+			rdma_reject(id, NULL, 0);
+			break;
+		}
+		child = id->context;
+		sdp_sk(child)->remote_credits = sdp_sk(child)->rx_head -
+			sdp_sk(child)->rx_tail;
+		memset(&hah, 0, sizeof hah);
+		hah.bsdh.mid = SDP_MID_HELLO_ACK;
+		hah.bsdh.bufs = htons(sdp_sk(child)->remote_credits);
+		hah.majv_minv = SDP_MAJV_MINV;
+		hah.actrcvsz = htonl(SDP_MAX_SEND_SKB_FRAGS * PAGE_SIZE +
+				     sizeof(struct sdp_bsdh));
+		memset(&conn_param, 0, sizeof conn_param);
+		conn_param.private_data_len = sizeof hah;
+		conn_param.private_data = &hah;
+		conn_param.responder_resources = 4 /* TODO */;
+		conn_param.initiator_depth = 4 /* TODO */;
+		conn_param.retry_count = SDP_RETRY_COUNT;
+		rc = rdma_accept(id, &conn_param);
+		if (rc) {
+			sdp_sk(child)->id = NULL;
+			id->qp = NULL;
+			id->context = NULL;
+			parent = sdp_sk(child)->parent; /* TODO: hold ? */
+		}
+		break;
+	case RDMA_CM_EVENT_CONNECT_RESPONSE:
+		sdp_dbg(sk, "RDMA_CM_EVENT_CONNECT_RESPONSE\n");
+		rc = sdp_response_handler(sk, event);
+		if (rc)
+			rdma_reject(id, NULL, 0);
+		else
+			rc = rdma_accept(id, NULL);
+		break;
+	case RDMA_CM_EVENT_CONNECT_ERROR:
+		sdp_dbg(sk, "RDMA_CM_EVENT_CONNECT_ERROR\n");
+		rc = -ETIMEDOUT;
+		break;
+	case RDMA_CM_EVENT_UNREACHABLE:
+		sdp_dbg(sk, "RDMA_CM_EVENT_UNREACHABLE\n");
+		rc = -ENETUNREACH;
+		break;
+	case RDMA_CM_EVENT_REJECTED:
+		sdp_dbg(sk, "RDMA_CM_EVENT_REJECTED\n");
+		rc = -ECONNREFUSED;
+		break;
+	case RDMA_CM_EVENT_ESTABLISHED:
+		sdp_dbg(sk, "RDMA_CM_EVENT_ESTABLISHED\n");
+		rc = sdp_connected_handler(sk, event);
+		break;
+	case RDMA_CM_EVENT_DISCONNECTED:
+		sdp_dbg(sk, "RDMA_CM_EVENT_DISCONNECTED\n");
+		rdma_disconnect(id);
+		sdp_disconnected_handler(sk);
+		rc = -ECONNRESET;
+		break;
+	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+		sdp_warn(sk, "RDMA_CM_EVENT_DEVICE_REMOVAL\n");
+		sdp_disconnected_handler(sk);
+		rc = -ENETRESET;
+		break;
+	default:
+		printk(KERN_ERR "SDP: Unexpected CMA event: %d\n",
+		       event->event);
+		rc = -ECONNABORTED;
+		break;
+	}
+
+	sdp_dbg(sk, "%s event %d handled\n", __func__, event->event);
+
+	if (rc && sdp_sk(sk)->id == id) {
+		child = sk;
+		sdp_sk(sk)->id = NULL;
+		id->qp = NULL;
+		id->context = NULL;
+		sdp_set_error(sk, rc);
+		parent = sdp_sk(sk)->parent;
+	}
+
+	release_sock(sk);
+
+	sdp_dbg(sk, "event %d done. status %d\n", event->event, rc);
+
+	if (parent) {
+		sdp_dbg(parent, "deleting child %d done. status %d\n", event->event, rc);
+		lock_sock(parent);
+		list_del_init(&sdp_sk(child)->backlog_queue);
+		if (!list_empty(&sdp_sk(child)->accept_queue)) {
+			list_del_init(&sdp_sk(child)->accept_queue);
+			sk_acceptq_removed(parent);
+		}
+		release_sock(parent);
+		sk_common_release(child);
+	}
+	return rc;
+}
diff -uprN linux-2.6.17.noarch.orig/drivers/infiniband/ulp/sdp/sdp.h linux-2.6.17.noarch.sdp/drivers/infiniband/ulp/sdp/sdp.h
--- linux-2.6.17.noarch.orig/drivers/infiniband/ulp/sdp/sdp.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.17.noarch.sdp/drivers/infiniband/ulp/sdp/sdp.h	2006-07-25 22:31:03.000000000 -0400
@@ -0,0 +1,165 @@
+#ifndef _SDP_H_
+#define _SDP_H_
+
+#include <linux/workqueue.h>
+#include <linux/wait.h>
+#include <net/inet_sock.h>
+#include <net/tcp.h> /* For urgent data flags */
+#include <rdma/ib_verbs.h>
+
+#define sdp_printk(level, sk, format, arg...)                \
+	printk(level "sdp_sock(%d:%d): " format,             \
+	       (sk) ? inet_sk(sk)->num : -1,                 \
+	       (sk) ? ntohs(inet_sk(sk)->dport) : -1, ## arg)
+#define sdp_warn(sk, format, arg...)                         \
+	sdp_printk(KERN_WARNING, sk, format , ## arg)
+
+#ifdef CONFIG_INFINIBAND_SDP_DEBUG
+extern int sdp_debug_level;
+
+#define sdp_dbg(sk, format, arg...)                          \
+	do {                                                 \
+		if (sdp_debug_level > 0)                     \
+		sdp_printk(KERN_DEBUG, sk, format , ## arg); \
+	} while (0)
+#else /* CONFIG_INFINIBAND_SDP_DEBUG */
+#define sdp_dbg(priv, format, arg...)                        \
+	do { (void) (priv); } while (0)
+#endif /* CONFIG_INFINIBAND_SDP_DEBUG */
+
+#define SDP_RESOLVE_TIMEOUT 1000
+#define SDP_ROUTE_TIMEOUT 1000
+#define SDP_RETRY_COUNT 5
+
+#define SDP_TX_SIZE 0x40
+#define SDP_RX_SIZE 0x40
+
+#define SDP_MAX_SEND_SKB_FRAGS (PAGE_SIZE > 0x8000 ? 1 : 0x8000 / PAGE_SIZE)
+
+#define SDP_NUM_WC 4
+
+#define SDP_OP_RECV 0x800000000LL
+
+enum sdp_mid {
+	SDP_MID_HELLO = 0x0,
+	SDP_MID_HELLO_ACK = 0x1,
+	SDP_MID_DISCONN = 0x2,
+	SDP_MID_DATA = 0xFF,
+};
+
+enum {
+	SDP_MIN_BUFS = 2
+};
+
+struct rdma_cm_id;
+struct rdma_cm_event;
+
+struct sdp_bsdh {
+	u8 mid;
+	u8 flags;
+	__u16 bufs;
+	__u32 len;
+	__u32 mseq;
+	__u32 mseq_ack;
+};
+
+struct sdp_buf {
+        struct sk_buff *skb;
+        dma_addr_t      mapping[SDP_MAX_SEND_SKB_FRAGS + 1];
+};
+
+struct sdp_sock {
+	/* sk has to be the first member of inet_sock */
+	struct inet_sock isk;
+	struct list_head accept_queue;
+	struct list_head backlog_queue;
+	struct sock *parent;
+	/* rdma specific */
+	struct rdma_cm_id *id;
+	struct ib_qp *qp;
+	struct ib_cq *cq;
+	struct ib_mr *mr;
+	struct device *dma_device;
+	/* Like tcp_sock */
+	__u16 urg_data;
+	int offset; /* like seq in tcp */
+
+	int xmit_size_goal;
+	int write_seq;
+	int pushed_seq;
+	int nonagle;
+
+	/* SDP specific */
+	wait_queue_head_t wq;
+
+	struct work_struct time_wait_work;
+	struct work_struct destroy_work;
+
+	int time_wait;
+
+	spinlock_t lock;
+	struct sdp_buf *rx_ring;
+	struct ib_recv_wr rx_wr;
+	unsigned rx_head;
+	unsigned rx_tail;
+	unsigned mseq_ack;
+	unsigned bufs;
+
+	int               remote_credits;
+
+	spinlock_t        tx_lock;
+	struct sdp_buf   *tx_ring;
+	unsigned          tx_head;
+	unsigned          tx_tail;
+	struct ib_send_wr tx_wr;
+
+	struct ib_sge ibsge[SDP_MAX_SEND_SKB_FRAGS + 1];
+	struct ib_wc  ibwc[SDP_NUM_WC];
+	struct work_struct work;
+};
+
+extern struct proto sdp_proto;
+extern struct workqueue_struct *sdp_workqueue;
+
+static inline struct sdp_sock *sdp_sk(const struct sock *sk)
+{
+	        return (struct sdp_sock *)sk;
+}
+
+static inline void sdp_set_error(struct sock *sk, int err)
+{
+	sk->sk_err = -err;
+	if (sk->sk_socket)
+		sk->sk_socket->state = SS_UNCONNECTED;
+
+	sk->sk_state = TCP_CLOSE;
+
+	if (sdp_sk(sk)->time_wait) {
+		sdp_dbg(sk, "%s: destroy in time wait state\n", __func__);
+		sdp_sk(sk)->time_wait = 0;
+		queue_work(sdp_workqueue, &sdp_sk(sk)->destroy_work);
+	}
+
+	sk->sk_error_report(sk);
+}
+
+static inline void sdp_set_state(struct sock *sk, int state)
+{
+	sk->sk_state = state;
+}
+
+extern struct workqueue_struct *sdp_workqueue;
+
+int sdp_cma_handler(struct rdma_cm_id *, struct rdma_cm_event *);
+void sdp_close_sk(struct sock *sk);
+void sdp_completion_handler(struct ib_cq *cq, void *cq_context);
+void sdp_work(void *);
+void sdp_post_send(struct sdp_sock *ssk, struct sk_buff *skb, u8 mid);
+void sdp_post_recvs(struct sdp_sock *ssk);
+void sdp_post_sends(struct sdp_sock *ssk, int nonagle);
+void sdp_destroy_work(void *data);
+void sdp_time_wait_work(void *data);
+struct sk_buff *sdp_recv_completion(struct sdp_sock *ssk, int id);
+struct sk_buff *sdp_send_completion(struct sdp_sock *ssk, int mseq);
+
+#endif
diff -uprN linux-2.6.17.noarch.orig/drivers/infiniband/ulp/sdp/sdp_main.c linux-2.6.17.noarch.sdp/drivers/infiniband/ulp/sdp/sdp_main.c
--- linux-2.6.17.noarch.orig/drivers/infiniband/ulp/sdp/sdp_main.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.17.noarch.sdp/drivers/infiniband/ulp/sdp/sdp_main.c	2006-07-25 23:13:33.000000000 -0400
@@ -0,0 +1,1429 @@
+/*
+ * Copyright (c) 2006 Mellanox Technologies Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__ia64__)
+/* csum_partial_copy_from_user is not exported on ia64.
+   We don't really need it for SDP - skb_copy_to_page happens to call it
+   but for SDP HW checksum is always set, so ... */
+
+#include <linux/errno.h>
+#include <asm/checksum.h>
+static inline
+unsigned int csum_partial_copy_from_user_new (const char *src, char *dst,
+						 int len, unsigned int sum,
+						 int *errp)
+{
+	*errp = -EINVAL;
+	return 0;
+}
+
+#define csum_partial_copy_from_user csum_partial_copy_from_user_new
+#endif
+
+#include <linux/tcp.h>
+#include <asm/ioctls.h>
+#include <linux/workqueue.h>
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <net/protocol.h>
+#include <net/inet_common.h>
+#include <rdma/rdma_cm.h>
+#include <rdma/ib_verbs.h>
+/* TODO: remove when sdp_socket.h becomes part of include/linux/socket.h */
+#include "sdp_socket.h"
+#include "sdp.h"
+#include <linux/delay.h>
+
+MODULE_AUTHOR("Michael S. Tsirkin");
+MODULE_DESCRIPTION("InfiniBand SDP module");
+MODULE_LICENSE("Dual BSD/GPL");
+
+#ifdef CONFIG_INFINIBAND_SDP_DEBUG
+int sdp_debug_level;
+
+module_param_named(debug_level, sdp_debug_level, int, 0644);
+MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0.");
+#endif
+
+struct workqueue_struct *sdp_workqueue;
+
+static int sdp_get_port(struct sock *sk, unsigned short snum)
+{
+	struct sdp_sock *ssk = sdp_sk(sk);
+	struct sockaddr_in *src_addr;
+	int rc;
+
+	struct sockaddr_in addr = {
+		.sin_family = AF_INET,
+		.sin_port = htons(snum),
+		.sin_addr.s_addr = inet_sk(sk)->rcv_saddr,
+	};
+
+	sdp_dbg(sk, "%s: %u.%u.%u.%u:%hu\n", __func__,
+		NIPQUAD(addr.sin_addr.s_addr), ntohs(addr.sin_port));
+
+	if (!ssk->id)
+		ssk->id = rdma_create_id(sdp_cma_handler, sk, RDMA_PS_SDP);
+
+	if (!ssk->id)
+	       return -ENOMEM;
+
+	/* IP core seems to bind many times to the same address */
+	/* TODO: I don't really understand why. Find out. */
+	if (!memcmp(&addr, &ssk->id->route.addr.src_addr, sizeof addr))
+		return 0;
+
+	rc = rdma_bind_addr(ssk->id, (struct sockaddr *)&addr);
+	if (rc) {
+		rdma_destroy_id(ssk->id);
+		ssk->id = NULL;
+		return rc;
+	}
+
+	src_addr = (struct sockaddr_in *)&(ssk->id->route.addr.src_addr);
+	inet_sk(sk)->num = ntohs(src_addr->sin_port);
+	return 0;
+}
+
+/* TODO: linger? */
+void sdp_close_sk(struct sock *sk)
+{
+	struct sdp_sock *ssk = sdp_sk(sk);
+	struct rdma_cm_id *id = NULL;
+	struct ib_pd *pd = NULL;
+	struct ib_cq *cq = NULL;
+
+	sdp_dbg(sk, "%s\n", __func__);
+
+	lock_sock(sk);
+
+	sk->sk_send_head = NULL;
+	skb_queue_purge(&sk->sk_write_queue);
+
+	id = ssk->id;
+	if (ssk->id) {
+		id->qp = NULL;
+		ssk->id = NULL;
+		release_sock(sk);
+		rdma_destroy_id(id);
+	} else
+		release_sock(sk);
+
+	if (ssk->qp) {
+		pd = ssk->qp->pd;
+		cq = ssk->cq;
+		sdp_sk(sk)->cq = NULL;
+		flush_scheduled_work();
+		ib_destroy_qp(ssk->qp);
+
+		while (ssk->rx_head != ssk->rx_tail) {
+			struct sk_buff *skb;
+			skb = sdp_recv_completion(ssk, ssk->rx_tail);
+			if (!skb)
+				break;
+			__kfree_skb(skb);
+		}
+		while (ssk->tx_head != ssk->tx_tail) {
+			struct sk_buff *skb;
+			skb = sdp_send_completion(ssk, ssk->tx_tail);
+			if (!skb)
+				break;
+			__kfree_skb(skb);
+		}
+	}
+
+	if (cq) {
+		ib_destroy_cq(cq);
+		flush_scheduled_work();
+	}
+
+	if (ssk->mr)
+		ib_dereg_mr(ssk->mr);
+
+	if (pd)
+		ib_dealloc_pd(pd);
+
+	skb_queue_purge(&sk->sk_receive_queue);
+
+	kfree(sdp_sk(sk)->rx_ring);
+	kfree(sdp_sk(sk)->tx_ring);
+
+	sdp_dbg(sk, "%s done; releasing sock\n", __func__);
+}
+
+static void sdp_destruct(struct sock *sk)
+{
+	struct sdp_sock *ssk = sdp_sk(sk);
+	struct sdp_sock *s, *t;
+
+	sdp_dbg(sk, "%s\n", __func__);
+
+	sdp_close_sk(sk);
+
+	list_for_each_entry_safe(s, t, &ssk->backlog_queue, backlog_queue) {
+		sk_common_release(&s->isk.sk);
+	}
+	list_for_each_entry_safe(s, t, &ssk->accept_queue, accept_queue) {
+		sk_common_release(&s->isk.sk);
+	}
+
+	sdp_dbg(sk, "%s done\n", __func__);
+}
+
+static void sdp_send_active_reset(struct sock *sk, gfp_t priority)
+{
+	sk->sk_prot->disconnect(sk, 0);
+}
+
+/*
+ *	State processing on a close.
+ *	TCP_ESTABLISHED -> TCP_FIN_WAIT1 -> TCP_FIN_WAIT2 -> TCP_CLOSE
+ */
+
+static int sdp_close_state(struct sock *sk)
+{
+	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+		return 0;
+
+	if (sk->sk_state == TCP_ESTABLISHED)
+		sk->sk_state = TCP_FIN_WAIT1;
+	else if (sk->sk_state == TCP_CLOSE_WAIT)
+		sk->sk_state = TCP_LAST_ACK;
+	else
+		return 0;
+	return 1;
+}
+
+/* Like tcp_close */
+static void sdp_close(struct sock *sk, long timeout)
+{
+	struct sk_buff *skb;
+	int data_was_unread = 0;
+
+	lock_sock(sk);
+
+	sdp_dbg(sk, "%s\n", __func__);
+
+	sk->sk_shutdown = SHUTDOWN_MASK;
+	if (sk->sk_state == TCP_LISTEN || sk->sk_state == TCP_SYN_SENT) {
+		sdp_set_state(sk, TCP_CLOSE);
+
+		/* Special case: stop listening.
+		   This is done by sdp_destruct. */
+		goto adjudge_to_death;
+	}
+
+	/*  We need to flush the recv. buffs.  We do this only on the
+	 *  descriptor close, not protocol-sourced closes, because the
+	 *  reader process may not have drained the data yet!
+	 */
+	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+		data_was_unread = 1;
+		__kfree_skb(skb);
+	}
+
+	sk_stream_mem_reclaim(sk);
+
+	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
+	 * 3.10, we send a RST here because data was lost.  To
+	 * witness the awful effects of the old behavior of always
+	 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
+	 * a bulk GET in an FTP client, suspend the process, wait
+	 * for the client to advertise a zero window, then kill -9
+	 * the FTP client, wheee...  Note: timeout is always zero
+	 * in such a case.
+	 */
+	if (data_was_unread) {
+		/* Unread data was tossed, zap the connection. */
+		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
+		sdp_set_state(sk, TCP_CLOSE);
+		sdp_send_active_reset(sk, GFP_KERNEL);
+	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
+		/* Check zero linger _after_ checking for unread data. */
+		sk->sk_prot->disconnect(sk, 0);
+		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
+	} else if (sdp_close_state(sk)) {
+		/* We FIN if the application ate all the data before
+		 * zapping the connection.
+		 */
+
+		sdp_post_sends(sdp_sk(sk), 0);
+	}
+
+	sk_stream_wait_close(sk, timeout);
+
+adjudge_to_death:
+	/* It is the last release_sock in its life. It will remove backlog. */
+	release_sock(sk);
+	/* Now socket is owned by kernel and we acquire lock
+	   to finish close. No need to check for user refs.
+	 */
+	lock_sock(sk);
+
+	sock_hold(sk);
+	sock_orphan(sk);
+
+	/*	This is a (useful) BSD violating of the RFC. There is a
+	 *	problem with TCP as specified in that the other end could
+	 *	keep a socket open forever with no application left this end.
+	 *	We use a 3 minute timeout (about the same as BSD) then kill
+	 *	our end. If they send after that then tough - BUT: long enough
+	 *	that we won't make the old 4*rto = almost no time - whoops
+	 *	reset mistake.
+	 *
+	 *	Nope, it was not mistake. It is really desired behaviour
+	 *	f.e. on http servers, when such sockets are useless, but
+	 *	consume significant resources. Let's do it with special
+	 *	linger2	option.					--ANK
+	 */
+
+	if (sk->sk_state == TCP_FIN_WAIT2 &&
+		!sk->sk_send_head &&
+		sdp_sk(sk)->tx_head == sdp_sk(sk)->tx_tail) {
+		sk->sk_state = TCP_CLOSE;
+	}
+
+	if ((1 << sk->sk_state) & (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)) {
+		sdp_sk(sk)->time_wait = 1;
+		/* TODO: linger2 unimplemented.
+		   We should wait 3.5 * rto. How do I know rto? */
+		/* TODO: tcp_fin_time to get timeout */
+		sdp_dbg(sk, "%s: entering time wait refcnt %d\n", __func__,
+			atomic_read(&sk->sk_refcnt));
+		atomic_inc(sk->sk_prot->orphan_count);
+		queue_delayed_work(sdp_workqueue, &sdp_sk(sk)->time_wait_work,
+				   TCP_FIN_TIMEOUT);
+		goto out;
+	}
+
+	/* TODO: limit number of orphaned sockets.
+	   TCP has sysctl_tcp_mem and sysctl_tcp_max_orphans */
+	sock_put(sk);
+
+	/* Otherwise, socket is reprieved until protocol close. */
+out:
+	sdp_dbg(sk, "%s: last socket put %d\n", __func__,
+		atomic_read(&sk->sk_refcnt));
+	release_sock(sk);
+	sk_common_release(sk);
+}
+
+static int sdp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	struct sdp_sock *ssk = sdp_sk(sk);
+	struct sockaddr_in src_addr = {
+		.sin_family = AF_INET,
+		.sin_port = htons(inet_sk(sk)->sport),
+		.sin_addr.s_addr = inet_sk(sk)->saddr,
+	};
+	int rc;
+
+        if (addr_len < sizeof(struct sockaddr_in))
+                return -EINVAL;
+
+        if (uaddr->sa_family != AF_INET)
+                return -EAFNOSUPPORT;
+
+	if (!ssk->id) {
+		rc = sdp_get_port(sk, 0);
+		if (rc)
+			return rc;
+	}
+
+	sdp_dbg(sk, "%s %u.%u.%u.%u:%hu -> %u.%u.%u.%u:%hu\n", __func__,
+		NIPQUAD(src_addr.sin_addr.s_addr),
+		ntohs(src_addr.sin_port),
+		NIPQUAD(((struct sockaddr_in *)uaddr)->sin_addr.s_addr),
+		ntohs(((struct sockaddr_in *)uaddr)->sin_port));
+
+	if (!ssk->id) {
+		printk("??? ssk->id == NULL. Ohh\n");
+		return -EINVAL;
+	}
+
+	rc = rdma_resolve_addr(ssk->id, (struct sockaddr *)&src_addr,
+			       uaddr, SDP_RESOLVE_TIMEOUT);
+	if (rc) {
+		sdp_warn(sk, "rdma_resolve_addr failed: %d\n", rc);
+		return rc;
+	}
+
+	sk->sk_state = TCP_SYN_SENT;
+	return 0;
+}
+
+static int sdp_disconnect(struct sock *sk, int flags)
+{
+	struct sdp_sock *ssk = sdp_sk(sk);
+	int rc = 0;
+	sdp_dbg(sk, "%s\n", __func__);
+	if (ssk->id)
+		rc = rdma_disconnect(ssk->id);
+	return rc;
+}
+
+/* Like inet_csk_wait_for_connect */
+static int sdp_wait_for_connect(struct sock *sk, long timeo)
+{
+	struct sdp_sock *ssk = sdp_sk(sk);
+	DEFINE_WAIT(wait);
+	int err;
+
+	sdp_dbg(sk, "%s\n", __func__);
+	/*
+	 * True wake-one mechanism for incoming connections: only
+	 * one process gets woken up, not the 'whole herd'.
+	 * Since we do not 'race & poll' for established sockets
+	 * anymore, the common case will execute the loop only once.
+	 *
+	 * Subtle issue: "add_wait_queue_exclusive()" will be added
+	 * after any current non-exclusive waiters, and we know that
+	 * it will always _stay_ after any new non-exclusive waiters
+	 * because all non-exclusive waiters are added at the
+	 * beginning of the wait-queue. As such, it's ok to "drop"
+	 * our exclusiveness temporarily when we get woken up without
+	 * having to remove and re-insert us on the wait queue.
+	 */
+	for (;;) {
+		prepare_to_wait_exclusive(sk->sk_sleep, &wait,
+					  TASK_INTERRUPTIBLE);
+		release_sock(sk);
+		if (list_empty(&ssk->accept_queue)) {
+			sdp_dbg(sk, "%s schedule_timeout\n", __func__);
+			timeo = schedule_timeout(timeo);
+			sdp_dbg(sk, "%s schedule_timeout done\n", __func__);
+		}
+		sdp_dbg(sk, "%s lock_sock\n", __func__);
+		lock_sock(sk);
+		sdp_dbg(sk, "%s lock_sock done\n", __func__);
+		err = 0;
+		if (!list_empty(&ssk->accept_queue))
+			break;
+		err = -EINVAL;
+		if (sk->sk_state != TCP_LISTEN)
+			break;
+		err = sock_intr_errno(timeo);
+		if (signal_pending(current))
+			break;
+		err = -EAGAIN;
+		if (!timeo)
+			break;
+	}
+	finish_wait(sk->sk_sleep, &wait);
+	sdp_dbg(sk, "%s returns %d\n", __func__, err);
+	return err;
+}
+
+/* Consider using request_sock_queue instead of duplicating all this */
+/* Like inet_csk_accept */
+static struct sock *sdp_accept(struct sock *sk, int flags, int *err)
+{
+	struct sdp_sock *newssk, *ssk;
+	struct sock *newsk;
+	int error;
+
+	sdp_dbg(sk, "%s state %d expected %d *err %d\n", __func__,
+		sk->sk_state, TCP_LISTEN, *err);
+
+	ssk = sdp_sk(sk);
+	lock_sock(sk);
+
+	/* We need to make sure that this socket is listening,
+	 * and that it has something pending.
+	 */
+	error = -EINVAL;
+	if (sk->sk_state != TCP_LISTEN)
+		goto out_err;
+
+	/* Find already established connection */
+	if (list_empty(&ssk->accept_queue)) {
+		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+
+		/* If this is a non blocking socket don't sleep */
+		error = -EAGAIN;
+		if (!timeo)
+			goto out_err;
+
+		error = sdp_wait_for_connect(sk, timeo);
+		if (error)
+			goto out_err;
+	}
+
+	newssk = list_entry(ssk->accept_queue.next, struct sdp_sock, accept_queue);
+	list_del_init(&newssk->accept_queue);
+	newssk->parent = NULL;
+	sk_acceptq_removed(sk);
+	newsk = &newssk->isk.sk;
+	sdp_dbg(sk, "%s: ib_req_notify_cq\n", __func__);
+	ib_req_notify_cq(newssk->cq, IB_CQ_NEXT_COMP);
+	/* TODO: poll cq here */
+out:
+	release_sock(sk);
+	sdp_dbg(sk, "%s: status %d sk %p newsk %p\n", __func__,
+		*err, sk, newsk);
+	return newsk;
+out_err:
+	sdp_dbg(sk, "%s: error %d\n", __func__, error);
+	newsk = NULL;
+	*err = error;
+	goto out;
+}
+
+static int sdp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+	sdp_dbg(sk, "%s\n", __func__);
+	/* TODO: Need to handle:
+	         case SIOCINQ:
+		 case SIOCOUTQ:
+		 case SIOCATMARK:
+	 */
+	return -ENOIOCTLCMD;
+}
+
+void sdp_destroy_work(void *data)
+{
+	struct sock *sk = data;
+	sdp_dbg(sk, "%s: refcnt %d\n", __func__, atomic_read(&sk->sk_refcnt));
+
+	cancel_delayed_work(&sdp_sk(sk)->time_wait_work);
+	atomic_dec(sk->sk_prot->orphan_count);
+
+	sock_put(sk);
+}
+
+void sdp_time_wait_work(void *data)
+{
+	struct sock *sk = data;
+	lock_sock(sk);
+	sdp_dbg(sk, "%s\n", __func__);
+
+	if (!sdp_sk(sk)->time_wait) {
+		release_sock(sk);
+		return;
+	}
+
+	sdp_dbg(sk, "%s: refcnt %d\n", __func__, atomic_read(&sk->sk_refcnt));
+
+	sk->sk_state = TCP_CLOSE;
+	sdp_sk(sk)->time_wait = 0;
+	release_sock(sk);
+
+	atomic_dec(sk->sk_prot->orphan_count);
+	sock_put(data);
+}
+
+static int sdp_init_sock(struct sock *sk)
+{
+	struct sdp_sock *ssk = sdp_sk(sk);
+
+	sdp_dbg(sk, "%s\n", __func__);
+
+	INIT_LIST_HEAD(&ssk->accept_queue);
+	INIT_LIST_HEAD(&ssk->backlog_queue);
+	INIT_WORK(&ssk->time_wait_work, sdp_time_wait_work, sk);
+	INIT_WORK(&ssk->destroy_work, sdp_destroy_work, sk);
+
+	ssk->tx_head = 1;
+	ssk->tx_tail = 1;
+	ssk->rx_head = 1;
+	ssk->rx_tail = 1;
+	sk->sk_route_caps |= NETIF_F_SG | NETIF_F_NO_CSUM;
+	return 0;
+}
+
+static void sdp_shutdown(struct sock *sk, int how)
+{
+	struct sdp_sock *ssk = sdp_sk(sk);
+
+	sdp_dbg(sk, "%s\n", __func__);
+	if (!(how & SEND_SHUTDOWN))
+		return;
+
+	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+		return;
+
+	if (sk->sk_state == TCP_ESTABLISHED)
+		sk->sk_state = TCP_FIN_WAIT1;
+	else if (sk->sk_state == TCP_CLOSE_WAIT)
+		sk->sk_state = TCP_LAST_ACK;
+	else
+		return;
+
+	sdp_post_sends(ssk, 0);
+}
+
+static inline void sdp_push_pending_frames(struct sock *sk)
+{
+}
+
+/* SOL_SOCKET level options are handled by sock_setsockopt */
+static int sdp_setsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, int optlen)
+{
+	struct sdp_sock *ssk = sdp_sk(sk);
+	int val;
+	int err = 0;
+
+	sdp_dbg(sk, "%s\n", __func__);
+	if (level != SOL_TCP)
+		return -ENOPROTOOPT;
+
+	if (optlen < sizeof(int))
+		return -EINVAL;
+
+	if (get_user(val, (int __user *)optval))
+		return -EFAULT;
+
+	lock_sock(sk);
+
+	switch (optname) {
+	case TCP_NODELAY:
+		if (val) {
+			/* TCP_NODELAY is weaker than TCP_CORK, so that
+			 * this option on corked socket is remembered, but
+			 * it is not activated until cork is cleared.
+			 *
+			 * However, when TCP_NODELAY is set we make
+			 * an explicit push, which overrides even TCP_CORK
+			 * for currently queued segments.
+			 */
+			ssk->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
+			sdp_push_pending_frames(sk);
+		} else {
+			ssk->nonagle &= ~TCP_NAGLE_OFF;
+		}
+		break;
+	case TCP_CORK:
+		/* When set indicates to always queue non-full frames.
+		 * Later the user clears this option and we transmit
+		 * any pending partial frames in the queue.  This is
+		 * meant to be used alongside sendfile() to get properly
+		 * filled frames when the user (for example) must write
+		 * out headers with a write() call first and then use
+		 * sendfile to send out the data parts.
+		 *
+		 * TCP_CORK can be set together with TCP_NODELAY and it is
+		 * stronger than TCP_NODELAY.
+		 */
+		if (val) {
+			ssk->nonagle |= TCP_NAGLE_CORK;
+		} else {
+			ssk->nonagle &= ~TCP_NAGLE_CORK;
+			if (ssk->nonagle&TCP_NAGLE_OFF)
+				ssk->nonagle |= TCP_NAGLE_PUSH;
+			sdp_push_pending_frames(sk);
+		}
+		break;
+	default:
+		err = -ENOPROTOOPT;
+		break;
+	}
+
+	release_sock(sk);
+	return err;
+}
+
+/* SOL_SOCKET level options are handled by sock_getsockopt */
+static int sdp_getsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, int __user *option)
+{
+	/* TODO */
+	struct sdp_sock *ssk = sdp_sk(sk);
+	int val, len;
+
+	sdp_dbg(sk, "%s\n", __func__);
+
+	if (level != SOL_TCP)
+		return -EOPNOTSUPP;
+
+	if (get_user(len, option))
+		return -EFAULT;
+
+	len = min_t(unsigned int, len, sizeof(int));
+
+	if (len < 0)
+		return -EINVAL;
+
+	switch (optname) {
+	case TCP_NODELAY:
+		val = !!(ssk->nonagle&TCP_NAGLE_OFF);
+		break;
+	case TCP_CORK:
+		val = !!(ssk->nonagle&TCP_NAGLE_CORK);
+		break;
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	if (put_user(len, option))
+		return -EFAULT;
+	if (copy_to_user(optval, &val, len))
+		return -EFAULT;
+	return 0;
+}
+
+/* Like tcp_recv_urg */
+/*
+ *	Handle reading urgent data. BSD has very simple semantics for
+ *	this, no blocking and very strange errors 8)
+ */
+
+static int sdp_recv_urg(struct sock *sk, long timeo,
+			struct msghdr *msg, int len, int flags,
+			int *addr_len)
+{
+	struct sdp_sock *ssk = sdp_sk(sk);
+
+	/* No URG data to read. */
+	if (sock_flag(sk, SOCK_URGINLINE) || !ssk->urg_data ||
+	    ssk->urg_data == TCP_URG_READ)
+		return -EINVAL;	/* Yes this is right ! */
+
+	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
+		return -ENOTCONN;
+
+	if (ssk->urg_data & TCP_URG_VALID) {
+		int err = 0;
+		char c = ssk->urg_data;
+
+		if (!(flags & MSG_PEEK))
+			ssk->urg_data = TCP_URG_READ;
+
+		/* Read urgent data. */
+		msg->msg_flags |= MSG_OOB;
+
+		if (len > 0) {
+			if (!(flags & MSG_TRUNC))
+				err = memcpy_toiovec(msg->msg_iov, &c, 1);
+			len = 1;
+		} else
+			msg->msg_flags |= MSG_TRUNC;
+
+		return err ? -EFAULT : len;
+	}
+
+	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
+		return 0;
+
+	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
+	 * the available implementations agree in this case:
+	 * this call should never block, independent of the
+	 * blocking state of the socket.
+	 * Mike <pall@rz.uni-karlsruhe.de>
+	 */
+	return -EAGAIN;
+}
+
+static inline int sdp_has_urgent_data(struct sk_buff *skb)
+{
+	/* TODO: handle inline urgent data */
+	return 0;
+}
+
+static void sdp_rcv_space_adjust(struct sock *sk)
+{
+	sdp_post_recvs(sdp_sk(sk));
+	sdp_post_sends(sdp_sk(sk), 0);
+}
+
+static unsigned int sdp_current_mss(struct sock *sk, int large_allowed)
+{
+	/* TODO */
+	return PAGE_SIZE;
+}
+
+static int forced_push(struct sdp_sock *sk)
+{
+	/* TODO */
+	return 0;
+}
+
+static void sdp_mark_push(struct sdp_sock *ssk, struct sk_buff *skb)
+{
+	TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
+	ssk->pushed_seq = ssk->write_seq;
+}
+
+static inline int select_size(struct sock *sk, struct sdp_sock *ssk)
+{
+	return 0;
+}
+
+static inline void sdp_push(struct sock *sk, struct sdp_sock *ssk, int flags,
+			    int mss_now, int nonagle)
+{
+	sdp_post_sends(ssk, nonagle);
+}
+
+static inline void skb_entail(struct sock *sk, struct sdp_sock *ssk,
+                              struct sk_buff *skb)
+{
+        skb_header_release(skb);
+        __skb_queue_tail(&sk->sk_write_queue, skb);
+        sk_charge_skb(sk, skb);
+        if (!sk->sk_send_head)
+                sk->sk_send_head = skb;
+        if (ssk->nonagle & TCP_NAGLE_PUSH)
+                ssk->nonagle &= ~TCP_NAGLE_PUSH;
+}
+
+void sdp_push_one(struct sock *sk, unsigned int mss_now)
+{
+}
+
+/* Like tcp_sendmsg */
+/* TODO: check locking */
+#define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
+#define TCP_OFF(sk)	(sk->sk_sndmsg_off)
+int sdp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+		size_t size)
+{
+	struct iovec *iov;
+	struct sdp_sock *ssk = sdp_sk(sk);
+	struct sk_buff *skb;
+	int iovlen, flags;
+	int mss_now, size_goal;
+	int err, copied;
+	long timeo;
+
+	lock_sock(sk);
+	sdp_dbg(sk, "%s\n", __func__);
+
+	flags = msg->msg_flags;
+	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+
+	/* Wait for a connection to finish. */
+	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+			goto out_err;
+
+	/* This should be in poll */
+	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+	mss_now = sdp_current_mss(sk, !(flags&MSG_OOB));
+	size_goal = ssk->xmit_size_goal;
+
+	/* Ok commence sending. */
+	iovlen = msg->msg_iovlen;
+	iov = msg->msg_iov;
+	copied = 0;
+
+	err = -EPIPE;
+	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+		goto do_error;
+
+	while (--iovlen >= 0) {
+		int seglen = iov->iov_len;
+		unsigned char __user *from = iov->iov_base;
+
+		iov++;
+
+		while (seglen > 0) {
+			int copy;
+
+			skb = sk->sk_write_queue.prev;
+
+			if (!sk->sk_send_head ||
+			    (copy = size_goal - skb->len) <= 0) {
+
+new_segment:
+				/* Allocate new segment. If the interface is SG,
+				 * allocate skb fitting to single page.
+				 */
+				if (!sk_stream_memory_free(sk))
+					goto wait_for_sndbuf;
+
+				skb = sk_stream_alloc_pskb(sk, select_size(sk, ssk),
+							   0, sk->sk_allocation);
+				if (!skb)
+					goto wait_for_memory;
+
+				/*
+				 * Check whether we can use HW checksum.
+				 */
+				if (sk->sk_route_caps &
+				    (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
+				     NETIF_F_HW_CSUM))
+					skb->ip_summed = CHECKSUM_HW;
+
+				skb_entail(sk, ssk, skb);
+				copy = size_goal;
+			}
+
+			/* Try to append data to the end of skb. */
+			if (copy > seglen)
+				copy = seglen;
+
+			/* Where to copy to? */
+			if (skb_tailroom(skb) > 0) {
+				/* We have some space in skb head. Superb! */
+				if (copy > skb_tailroom(skb))
+					copy = skb_tailroom(skb);
+				if ((err = skb_add_data(skb, from, copy)) != 0)
+					goto do_fault;
+			} else {
+				int merge = 0;
+				int i = skb_shinfo(skb)->nr_frags;
+				struct page *page = TCP_PAGE(sk);
+				int off = TCP_OFF(sk);
+
+				if (skb_can_coalesce(skb, i, page, off) &&
+				    off != PAGE_SIZE) {
+					/* We can extend the last page
+					 * fragment. */
+					merge = 1;
+				} else if (i == SDP_MAX_SEND_SKB_FRAGS ||
+					   (!i &&
+					   !(sk->sk_route_caps & NETIF_F_SG))) {
+					/* Need to add new fragment and cannot
+					 * do this because interface is non-SG,
+					 * or because all the page slots are
+					 * busy. */
+					sdp_mark_push(ssk, skb);
+					goto new_segment;
+				} else if (page) {
+					if (off == PAGE_SIZE) {
+						put_page(page);
+						TCP_PAGE(sk) = page = NULL;
+						off = 0;
+					}
+				} else
+					off = 0;
+
+				if (copy > PAGE_SIZE - off)
+					copy = PAGE_SIZE - off;
+
+				if (!sk_stream_wmem_schedule(sk, copy))
+					goto wait_for_memory;
+
+				if (!page) {
+					/* Allocate new cache page. */
+					if (!(page = sk_stream_alloc_page(sk)))
+						goto wait_for_memory;
+				}
+
+				/* Time to copy data. We are close to
+				 * the end! */
+				err = skb_copy_to_page(sk, from, skb, page,
+						       off, copy);
+				if (err) {
+					/* If this page was new, give it to the
+					 * socket so it does not get leaked.
+					 */
+					if (!TCP_PAGE(sk)) {
+						TCP_PAGE(sk) = page;
+						TCP_OFF(sk) = 0;
+					}
+					goto do_error;
+				}
+
+				/* Update the skb. */
+				if (merge) {
+					skb_shinfo(skb)->frags[i - 1].size +=
+									copy;
+				} else {
+					skb_fill_page_desc(skb, i, page, off, copy);
+					if (TCP_PAGE(sk)) {
+						get_page(page);
+					} else if (off + copy < PAGE_SIZE) {
+						get_page(page);
+						TCP_PAGE(sk) = page;
+					}
+				}
+
+				TCP_OFF(sk) = off + copy;
+			}
+
+			if (!copied)
+				TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
+
+			ssk->write_seq += copy;
+			TCP_SKB_CB(skb)->end_seq += copy;
+
+			from += copy;
+			copied += copy;
+			if ((seglen -= copy) == 0 && iovlen == 0)
+				goto out;
+
+			if (skb->len < mss_now || (flags & MSG_OOB))
+				continue;
+
+			if (forced_push(ssk)) {
+				sdp_mark_push(ssk, skb);
+				/* TODO: and push pending frames mss_now */
+				/* sdp_push_pending(sk, ssk, mss_now, TCP_NAGLE_PUSH); */
+			} else if (skb == sk->sk_send_head)
+				sdp_push_one(sk, mss_now);
+			continue;
+
+wait_for_sndbuf:
+			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+			if (copied)
+				sdp_push(sk, ssk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
+
+			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
+				goto do_error;
+
+			mss_now = sdp_current_mss(sk, !(flags&MSG_OOB));
+			size_goal = ssk->xmit_size_goal;
+		}
+	}
+
+out:
+	if (copied)
+		sdp_push(sk, ssk, flags, mss_now, ssk->nonagle);
+	release_sock(sk);
+	return copied;
+
+do_fault:
+	if (!skb->len) {
+		if (sk->sk_send_head == skb)
+			sk->sk_send_head = NULL;
+		__skb_unlink(skb, &sk->sk_write_queue);
+		sk_stream_free_skb(sk, skb);
+	}
+
+do_error:
+	if (copied)
+		goto out;
+out_err:
+	err = sk_stream_error(sk, flags, err);
+	release_sock(sk);
+	return err;
+}
+
+/* Like tcp_recvmsg */
+/* Maybe use skb_recv_datagram here? */
+/* Note this does not seem to handle vectored messages. Relevant? */
+static int sdp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+					size_t len, int noblock, int flags,
+					int *addr_len)
+{
+	struct sk_buff *skb = NULL;
+	long timeo;
+	int target;
+	unsigned long used;
+	int err;
+	int offset = sdp_sk(sk)->offset;
+	int copied = 0;
+	int urg_data = 0;
+
+	lock_sock(sk);
+	sdp_dbg(sk, "%s\n", __func__);
+
+	err = -ENOTCONN;
+	if (sk->sk_state == TCP_LISTEN)
+		goto out;
+
+	timeo = sock_rcvtimeo(sk, noblock);
+	/* Urgent data needs to be handled specially. */
+	if (flags & MSG_OOB)
+		goto recv_urg;
+
+	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+
+	do {
+
+		/* Are we at urgent data? Stop if we have read anything
+		 * or have SIGURG pending. */
+		if (urg_data) {
+			if (copied)
+				break;
+			if (signal_pending(current)) {
+				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
+				break;
+			}
+		}
+
+		skb = skb_peek(&sk->sk_receive_queue);
+		if (skb) {
+			if (skb->h.raw[0] == SDP_MID_DISCONN)
+				goto found_fin_ok;
+			goto found_ok_skb;
+		}
+
+		if (copied >= target)
+			break;
+
+		if (copied) {
+			if (sk->sk_err ||
+			    sk->sk_state == TCP_CLOSE ||
+			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
+			    !timeo ||
+			    signal_pending(current) ||
+			    (flags & MSG_PEEK))
+				break;
+		} else {
+			if (sock_flag(sk, SOCK_DONE))
+				break;
+
+			if (sk->sk_err) {
+				copied = sock_error(sk);
+				break;
+			}
+
+			if (sk->sk_shutdown & RCV_SHUTDOWN)
+				break;
+
+			if (sk->sk_state == TCP_CLOSE) {
+				if (!sock_flag(sk, SOCK_DONE)) {
+					/* This occurs when user tries to read
+					 * from never connected socket.
+					 */
+					copied = -ENOTCONN;
+					break;
+				}
+				break;
+			}
+
+			if (!timeo) {
+				copied = -EAGAIN;
+				break;
+			}
+
+			if (signal_pending(current)) {
+				copied = sock_intr_errno(timeo);
+				break;
+			}
+		}
+
+		if (copied >= target) {
+			/* Do not sleep, just process backlog. */
+			release_sock(sk);
+			lock_sock(sk);
+		} else {
+			sdp_dbg(sk, "%s: sk_wait_data %ld\n", __func__, timeo);
+			sk_wait_data(sk, &timeo);
+		}
+		continue;
+
+	found_ok_skb:
+		sdp_dbg(sk, "%s: found_ok_skb len %d\n", __func__, skb->len);
+		sdp_dbg(sk, "%s: len %Zd offset %d\n", __func__, len, offset);
+		sdp_dbg(sk, "%s: copied %d target %d\n", __func__, copied, target);
+		urg_data = sdp_has_urgent_data(skb);
+		used = skb->len - offset;
+		if (len < used)
+			used = len;
+
+		sdp_dbg(sk, "%s: used %ld\n", __func__, used);
+
+		if (!(flags & MSG_TRUNC)) {
+			int err;
+			err = skb_copy_datagram_iovec(skb, offset,
+						      /* TODO: skip header? */
+						      msg->msg_iov, used);
+			if (err) {
+				sdp_dbg(sk, "%s: skb_copy_datagram_iovec failed "
+					"offset %d size %ld status %d\n",
+					__func__, offset, used, err);
+				/* Exception. Bailout! */
+				if (!copied)
+					copied = -EFAULT;
+				break;
+			}
+		}
+
+		copied += used;
+		len -= used;
+		offset += used;
+		sdp_dbg(sk, "%s: done copied %d target %d\n", __func__, copied, target);
+
+		sdp_rcv_space_adjust(sk);
+
+		if (offset < skb->len)
+			continue; /* TODO: break? */
+
+		if (!(flags & MSG_PEEK))
+			sk_eat_skb(sk, skb, 0);
+
+		offset = 0;
+		skb = NULL;
+
+		continue;
+found_fin_ok:
+		if (!(flags & MSG_PEEK))
+			sk_eat_skb(sk, skb, 0);
+
+		offset = 0;
+		skb = NULL;
+		break;
+	} while (len > 0);
+
+	sdp_sk(sk)->offset = skb && !(flags & MSG_PEEK) ? offset : 0;
+
+	release_sock(sk);
+	return copied;
+
+out:
+	release_sock(sk);
+	return err;
+
+recv_urg:
+	err = sdp_recv_urg(sk, timeo, msg, len, flags, addr_len);
+	goto out;
+}
+
+static int sdp_listen(struct sock *sk, int backlog)
+{
+	struct sdp_sock *ssk = sdp_sk(sk);
+	int rc;
+
+	sdp_dbg(sk, "%s\n", __func__);
+
+	if (!ssk->id) {
+		rc = sdp_get_port(sk, 0);
+		if (rc)
+			return rc;
+	}
+
+	rc = rdma_listen(ssk->id, backlog);
+	if (rc) {
+		sdp_warn(sk, "rdma_listen failed: %d\n", rc);
+		sdp_set_error(sk, rc);
+	} else
+		sk->sk_state = TCP_LISTEN;
+	return rc;
+}
+
+/* We almost could use inet_listen, but that calls
+   inet_csk_listen_start. Longer term we'll want to add
+   a listen callback to struct proto, similar to bind. */
+int sdp_inet_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	unsigned char old_state;
+	int err;
+
+	lock_sock(sk);
+
+	err = -EINVAL;
+	if (sock->state != SS_UNCONNECTED)
+		goto out;
+
+	old_state = sk->sk_state;
+	if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+		goto out;
+
+	/* Really, if the socket is already in listen state
+	 * we can only allow the backlog to be adjusted.
+	 */
+	if (old_state != TCP_LISTEN) {
+		err = sdp_listen(sk, backlog);
+		if (err)
+			goto out;
+	}
+	sk->sk_max_ack_backlog = backlog;
+	err = 0;
+
+out:
+	release_sock(sk);
+	return err;
+}
+
+static void sdp_unhash(struct sock *sk)
+{
+	sdp_dbg(sk, "%s\n", __func__);
+}
+
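+/* A listening socket is readable when connections are waiting on the
+ * SDP accept queue. */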
+static inline unsigned int sdp_listen_poll(const struct sock *sk)
+{
+	return !list_empty(&sdp_sk(sk)->accept_queue) ?
+		(POLLIN | POLLRDNORM) : 0;
+}
+
+static unsigned int sdp_poll(struct file *file, struct socket *socket,
+			     struct poll_table_struct *wait)
+{
+	int mask;
+
+	sdp_dbg(socket->sk, "%s\n", __func__);
+
+	mask = datagram_poll(file, socket, wait);
+	/* TODO: Slightly ugly: it would be nicer if there was function
+	 * like datagram_poll that didn't include poll_wait,
+	 * then we could reverse the order. */
+	if (socket->sk->sk_state == TCP_LISTEN)
+		return sdp_listen_poll(socket->sk);
+
+	if (sdp_sk(socket->sk)->urg_data & TCP_URG_VALID)
+		mask |= POLLPRI;
+	return mask;
+}
+
+static void sdp_enter_memory_pressure(void)
+{
+	sdp_dbg(NULL, "%s\n", __func__);
+}
+
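+/* Accounting state exported to the socket core through struct proto;
+ * the memory limits below reuse the TCP sysctls. */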
+static atomic_t sockets_allocated;
+static atomic_t memory_allocated;
+static atomic_t orphan_count;
+static int memory_pressure;
+struct proto sdp_proto = {
+	.close       = sdp_close,
+	.connect     = sdp_connect,
+	.disconnect  = sdp_disconnect,
+	.accept      = sdp_accept,
+	.ioctl       = sdp_ioctl,
+	.init        = sdp_init_sock,
+	.shutdown    = sdp_shutdown,
+	.setsockopt  = sdp_setsockopt,
+	.getsockopt  = sdp_getsockopt,
+	.sendmsg     = sdp_sendmsg,
+	.recvmsg     = sdp_recvmsg,
+	.unhash      = sdp_unhash,
+	.get_port    = sdp_get_port,
+	/* Wish we had this: .listen   = sdp_listen */
+	.enter_memory_pressure = sdp_enter_memory_pressure,
+	.sockets_allocated = &sockets_allocated,
+	.memory_allocated = &memory_allocated,
+	.memory_pressure = &memory_pressure,
+	.orphan_count = &orphan_count,
+	.sysctl_mem  = sysctl_tcp_mem,
+	.sysctl_wmem = sysctl_tcp_wmem,
+	.sysctl_rmem = sysctl_tcp_rmem,
+	.max_header  = sizeof(struct sdp_bsdh),
+	.obj_size    = sizeof(struct sdp_sock),
+	.owner       = THIS_MODULE,
+	.name        = "SDP",
+};
+
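+/* Most entries are borrowed from the AF_INET stream implementation;
+ * only poll and listen need SDP-specific handling here. */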
+static struct proto_ops sdp_proto_ops = {
+	.family     = PF_INET,
+	.owner      = THIS_MODULE,
+	.release    = inet_release,
+	.bind       = inet_bind,
+	.connect    = inet_stream_connect, /* TODO: inet_datagram connect would
+					      autobind, but need to fix get_port
+					      with port 0 first. */
+	.socketpair = sock_no_socketpair,
+	.accept     = inet_accept,
+	.getname    = inet_getname,
+	.poll       = sdp_poll,
+	.ioctl      = inet_ioctl,
+	.listen     = sdp_inet_listen,
+	.shutdown   = inet_shutdown,
+	.setsockopt = sock_common_setsockopt,
+	.getsockopt = sock_common_getsockopt,
+	.sendmsg    = inet_sendmsg,
+	.recvmsg    = sock_common_recvmsg,
+	.mmap       = sock_no_mmap,
+	.sendpage   = sock_no_sendpage,
+};
+
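+/* socket(2) handler for AF_INET_SDP: only SOCK_STREAM with IPPROTO_TCP
+ * (or the IPPROTO_IP wildcard) is accepted. */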
+static int sdp_create_socket(struct socket *sock, int protocol)
+{
+	struct sock *sk;
+	int rc;
+
+	sdp_dbg(NULL, "%s: type %d protocol %d\n", __func__, sock->type, protocol);
+
+	if (sock->type != SOCK_STREAM) {
+		sdp_warn(NULL, "SDP: unsupported type %d.\n", sock->type);
+		return -ESOCKTNOSUPPORT;
+	}
+
+	/* IPPROTO_IP is a wildcard match */
+	if (protocol != IPPROTO_TCP && protocol != IPPROTO_IP) {
+		sdp_warn(NULL, "SDP: unsupported protocol %d.\n", protocol);
+		return -EPROTONOSUPPORT;
+	}
+
+	sk = sk_alloc(PF_INET_SDP, GFP_KERNEL, &sdp_proto, 1);
+	if (!sk) {
+		sdp_warn(NULL, "SDP: failed to allocate socket.\n");
+		return -ENOMEM;
+	}
+	sock_init_data(sock, sk);
+	sk->sk_protocol = 0x0 /* TODO: inherit tcp socket to use IPPROTO_TCP */;
+
+	rc = sdp_init_sock(sk);
+	if (rc) {
+		sdp_warn(sk, "SDP: failed to init sock.\n");
+		sk_common_release(sk);
+		return -ENOMEM;
+	}
+
+	sk->sk_destruct = sdp_destruct;
+
+	sock->ops = &sdp_proto_ops;
+	sock->state = SS_UNCONNECTED;
+	return 0;
+}
+
+static struct net_proto_family sdp_net_proto = {
+	.family = AF_INET_SDP,
+	.create = sdp_create_socket,
+	.owner  = THIS_MODULE,
+};
+
+static int __init sdp_init(void)
+{
+	int rc;
+
+	sdp_workqueue = create_singlethread_workqueue("sdp");
+	if (!sdp_workqueue)
+		return -ENOMEM;
+
+	rc = proto_register(&sdp_proto, 1);
+	if (rc) {
+		printk(KERN_WARNING "%s: proto_register failed: %d\n", __func__, rc);
+		destroy_workqueue(sdp_workqueue);
+		return rc;
+	}
+
+	rc = sock_register(&sdp_net_proto);
+	if (rc) {
+		printk(KERN_WARNING "%s: sock_register failed: %d\n", __func__, rc);
+		proto_unregister(&sdp_proto);
+		destroy_workqueue(sdp_workqueue);
+		return rc;
+	}
+
+	return 0;
+}
+
+static void __exit sdp_exit(void)
+{
+	sock_unregister(PF_INET_SDP);
+	proto_unregister(&sdp_proto);
+
+	if (atomic_read(&orphan_count))
+		printk(KERN_WARNING "%s: orphan_count %d\n", __func__,
+		       atomic_read(&orphan_count));
+	destroy_workqueue(sdp_workqueue);
+	flush_scheduled_work();
+}
+
+module_init(sdp_init);
+module_exit(sdp_exit);
diff -uprN linux-2.6.17.noarch.orig/drivers/infiniband/ulp/sdp/sdp_socket.h linux-2.6.17.noarch.sdp/drivers/infiniband/ulp/sdp/sdp_socket.h
--- linux-2.6.17.noarch.orig/drivers/infiniband/ulp/sdp/sdp_socket.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.17.noarch.sdp/drivers/infiniband/ulp/sdp/sdp_socket.h	2006-07-25 22:31:03.000000000 -0400
@@ -0,0 +1,13 @@
+/* Stuff that should go into include/linux/socket.h */
+
+#ifndef SDP_SOCKET_H
+#define SDP_SOCKET_H
+
+#ifndef AF_INET_SDP
+#define AF_INET_SDP 27
+#define PF_INET_SDP AF_INET_SDP
+#endif
+
+/* TODO: AF_INET6_SDP ? */
+
+#endif