Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > 27922b4260f65d317aabda37e42bbbff > files > 3190

kernel-2.6.18-238.el5.src.rpm

From: Herbert Xu <herbert.xu@redhat.com>
Date: Wed, 10 Sep 2008 01:09:17 +1000
Subject: [net] tun: add IFF_VNET_HDR, TUNGETFEATURES, TUNGETIFF
Message-id: 20080909150917.GA30451@gondor.apana.org.au
O-Subject: Re: [RHEL5.3 PATCH] tun: Add IFF_VNET_HDR, TUNGETFEATURES, TUNGETIFF and TUNSETOFFLOAD
Bugzilla: 459719
RH-Acked-by: Rik van Riel <riel@redhat.com>
RH-Acked-by: Neil Horman <nhorman@redhat.com>
RH-Acked-by: David Miller <davem@redhat.com>
RH-Acked-by: Mark McLoughlin <markmc@redhat.com>

On Thu, Aug 21, 2008 at 04:29:26PM +0100, Mark McLoughlin wrote:
> https://bugzilla.redhat.com/459719

Here's a respin of Mark's patch with the proto_csum_blank stuff
replaced with explicit checksumming.  Apart from the kABI issues,
proto_csum_blank doesn't fully work for virtio because, unlike
Xen, virtio supports IPv6 checksum offload, which proto_csum_blank
does not handle at all.

In any case, as we have to do a copy in the tunnel driver we can
calculate the checksum at the same time for almost no cost at all.

Unfortunately I haven't been able to test this yet because I keep
hitting a prefetch crash in udp_v4_get_port on my x86-64 laptop
with the RHEL5 kernel.

Until I track that down, I'd very much appreciate it if someone
else could run this to make sure that it still spits out the correct
checksum.

tun: Backport a number of tun/tap APIs recently added upstream.

Together these allow KVM to enable GSO in guests using the virtio_net
paravirt driver, because packets from the guest can be passed to the
host, along with metadata to describe whether packets have partial
checksums and/or need to be segmented via GSO.

This provides a significant improvement in the throughput achievable
by KVM guests.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 329d9fe..671bb39 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -58,6 +58,7 @@
 #include <linux/if_ether.h>
 #include <linux/if_tun.h>
 #include <linux/crc32.h>
+#include <linux/virtio_net.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -229,6 +230,7 @@ static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv,
 	struct tun_pi pi = { 0, __constant_htons(ETH_P_IP) };
 	struct sk_buff *skb;
 	size_t len = count, align = 0;
+	struct virtio_net_hdr gso = { 0 };
 
 	if (!(tun->flags & TUN_NO_PI)) {
 		if ((len -= sizeof(pi)) > count)
@@ -238,6 +240,17 @@ static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv,
 			return -EFAULT;
 	}
 
+	if (tun->flags & TUN_VNET_HDR) {
+		if ((len -= sizeof(gso)) > count)
+			return -EINVAL;
+
+		if (memcpy_fromiovec((void *)&gso, iv, sizeof(gso)))
+			return -EFAULT;
+
+		if (gso.hdr_len > len)
+			return -EINVAL;
+	}
+
 	if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV)
 		align = NET_IP_ALIGN;
  
@@ -248,10 +261,49 @@ static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv,
 
 	if (align)
 		skb_reserve(skb, align);
-	if (memcpy_fromiovec(skb_put(skb, len), iv, len)) {
-		tun->stats.rx_dropped++;
-		kfree_skb(skb);
-		return -EFAULT;
+
+	skb_put(skb, len);
+
+	if (gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+		unsigned int csum = 0;
+
+		if (gso.csum_start + gso.csum_offset > len - 2) {
+			if (net_ratelimit())
+				printk(KERN_WARNING
+				       "bad partial csum: csum=%u/%u len=%u\n",
+				       gso.csum_start, gso.csum_offset, len);
+			tun->stats.rx_dropped++;
+			kfree_skb(skb);
+			return -EINVAL;
+		}
+
+		if (memcpy_fromiovec(skb->data, iv, gso.csum_start)) {
+			tun->stats.rx_dropped++;
+			kfree_skb(skb);
+			return -EFAULT;
+		}
+
+		if (csum_partial_copy_fromiovecend(skb->data + gso.csum_start,
+						   iv, 0, len - gso.csum_start,
+						   &csum)) {
+			tun->stats.rx_dropped++;
+			kfree_skb(skb);
+			return -EFAULT;
+		}
+
+		*(u16 *)(skb->data + gso.csum_start + gso.csum_offset) =
+			csum_fold(csum);
+
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	} else {
+		if (memcpy_fromiovec(skb->data, iv, len)) {
+			tun->stats.rx_dropped++;
+			kfree_skb(skb);
+			return -EFAULT;
+		}
+
+		if (tun->flags & TUN_NOCHECKSUM)
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
 	}
 
 	skb->dev = tun->dev;
@@ -265,9 +317,36 @@ static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv,
 		break;
 	};
 
-	if (tun->flags & TUN_NOCHECKSUM)
-		skb->ip_summed = CHECKSUM_UNNECESSARY;
- 
+	if (gso.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+		pr_debug("GSO!\n");
+		switch (gso.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
+		case VIRTIO_NET_HDR_GSO_TCPV4:
+			skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
+			break;
+		case VIRTIO_NET_HDR_GSO_TCPV6:
+			skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
+			break;
+		default:
+			tun->stats.rx_frame_errors++;
+			kfree_skb(skb);
+			return -EINVAL;
+		}
+
+		if (gso.gso_type & VIRTIO_NET_HDR_GSO_ECN)
+			skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
+
+		skb_shinfo(skb)->gso_size = gso.gso_size;
+		if (skb_shinfo(skb)->gso_size == 0) {
+			tun->stats.rx_frame_errors++;
+			kfree_skb(skb);
+			return -EINVAL;
+		}
+
+		/* Header must be checked, and gso_segs computed. */
+		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+		skb_shinfo(skb)->gso_segs = 0;
+	}
+
 	netif_rx_ni(skb);
 	tun->dev->last_rx = jiffies;
    
@@ -332,6 +411,39 @@ static __inline__ ssize_t tun_put_user(struct tun_struct *tun,
 		total += sizeof(pi);
 	}       
 
+	if (tun->flags & TUN_VNET_HDR) {
+		struct virtio_net_hdr gso = { 0 }; /* no info leak */
+		if ((len -= sizeof(gso)) < 0)
+			return -EINVAL;
+
+		if (skb_is_gso(skb)) {
+			struct skb_shared_info *sinfo = skb_shinfo(skb);
+
+			/* This is a hint as to how much should be linear. */
+			gso.hdr_len = skb_headlen(skb);
+			gso.gso_size = sinfo->gso_size;
+			if (sinfo->gso_type & SKB_GSO_TCPV4)
+				gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+			else if (sinfo->gso_type & SKB_GSO_TCPV6)
+				gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+			else
+				BUG();
+			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
+				gso.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
+		} else
+			gso.gso_type = VIRTIO_NET_HDR_GSO_NONE;
+
+		if (skb->ip_summed == CHECKSUM_PARTIAL) {
+			gso.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+			gso.csum_start = skb->h.raw - skb->data;
+			gso.csum_offset = skb->csum;
+		} /* else everything is zero */
+
+		if (unlikely(memcpy_toiovec(iv, (void *)&gso, sizeof(gso))))
+			return -EFAULT;
+		total += sizeof(gso);
+	}
+
 	len = min_t(int, skb->len, len);
 
 	skb_copy_datagram_iovec(skb, 0, iv, len);
@@ -545,6 +657,11 @@ static int tun_set_iff(struct file *file, struct ifreq *ifr)
 	if (ifr->ifr_flags & IFF_ONE_QUEUE)
 		tun->flags |= TUN_ONE_QUEUE;
 
+	if (ifr->ifr_flags & IFF_VNET_HDR)
+		tun->flags |= TUN_VNET_HDR;
+	else
+		tun->flags &= ~TUN_VNET_HDR;
+
 	file->private_data = tun;
 	tun->attached = 1;
 
@@ -557,12 +674,83 @@ static int tun_set_iff(struct file *file, struct ifreq *ifr)
 	return err;
 }
 
+static int tun_get_iff(struct file *file, struct ifreq *ifr)
+{
+	struct tun_struct *tun = file->private_data;
+
+	if (!tun)
+		return -EBADFD;
+
+	DBG(KERN_INFO "%s: tun_get_iff\n", tun->dev->name);
+
+	/* Zero all of *ifr: the ioctl caller copies the whole struct out. */
+	memset(ifr, 0, sizeof(*ifr));
+	strcpy(ifr->ifr_name, tun->dev->name);
+	/* The device type is kept in tun->flags; ifr_flags is zero here. */
+	if (tun->flags & TUN_TUN_DEV)
+		ifr->ifr_flags |= IFF_TUN;
+	else
+		ifr->ifr_flags |= IFF_TAP;
+
+	if (tun->flags & TUN_NO_PI)
+		ifr->ifr_flags |= IFF_NO_PI;
+
+	if (tun->flags & TUN_ONE_QUEUE)
+		ifr->ifr_flags |= IFF_ONE_QUEUE;
+
+	if (tun->flags & TUN_VNET_HDR)
+		ifr->ifr_flags |= IFF_VNET_HDR;
+
+	return 0;
+}
+
+/* Reduced, ethtool-like offload control reached through the tun fd, so
+ * it needs no privileges. */
+static int set_offload(struct net_device *dev, unsigned long arg)
+{
+	const unsigned int csum_bits = NETIF_F_HW_CSUM | NETIF_F_SG |
+				       NETIF_F_FRAGLIST;
+	const unsigned int tso_bits = NETIF_F_TSO_ECN | NETIF_F_TSO |
+				      NETIF_F_TSO6;
+	unsigned int before = dev->features;
+	/* Start with every offload bit cleared; re-add what arg requests. */
+	unsigned int wanted = before & ~(csum_bits | tso_bits);
+
+	/* TSO needs TUN_F_CSUM; ECN in turn needs one of the TSO types. */
+	if (arg & TUN_F_CSUM) {
+		wanted |= csum_bits;
+		arg &= ~TUN_F_CSUM;
+
+		if (arg & (TUN_F_TSO4 | TUN_F_TSO6)) {
+			if (arg & TUN_F_TSO_ECN)
+				wanted |= NETIF_F_TSO_ECN;
+			if (arg & TUN_F_TSO4)
+				wanted |= NETIF_F_TSO;
+			if (arg & TUN_F_TSO6)
+				wanted |= NETIF_F_TSO6;
+			arg &= ~(TUN_F_TSO_ECN | TUN_F_TSO4 | TUN_F_TSO6);
+		}
+	}
+
+	/* Any bit still set was not understood (or not allowed in this
+	 * combination); rejecting it lets callers probe for new features. */
+	if (arg != 0)
+		return -EINVAL;
+
+	dev->features = wanted;
+	if (dev->features != before)
+		netdev_features_change(dev);
+
+	return 0;
+}
+
 static int tun_chr_ioctl(struct inode *inode, struct file *file, 
 			 unsigned int cmd, unsigned long arg)
 {
 	struct tun_struct *tun = file->private_data;
 	void __user* argp = (void __user*)arg;
 	struct ifreq ifr;
+	int ret;
 
 	if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89)
 		if (copy_from_user(&ifr, argp, sizeof ifr))
@@ -585,12 +773,30 @@ static int tun_chr_ioctl(struct inode *inode, struct file *file,
 		return 0;
 	}
 
+	if (cmd == TUNGETFEATURES) {
+		/* Currently this just means: "what IFF flags are valid?".
+		 * This is needed because we never checked for invalid flags on
+		 * TUNSETIFF. */
+		return put_user(IFF_TUN | IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE |
+				IFF_VNET_HDR,
+				(unsigned int __user*)argp);
+	}
+
 	if (!tun)
 		return -EBADFD;
 
 	DBG(KERN_INFO "%s: tun_chr_ioctl cmd %d\n", tun->dev->name, cmd);
 
 	switch (cmd) {
+	case TUNGETIFF:
+		ret = tun_get_iff(file, &ifr);
+		if (ret)
+			return ret;
+
+		if (copy_to_user(argp, &ifr, sizeof(ifr)))
+			return -EFAULT;
+		break;
+
 	case TUNSETNOCSUM:
 		/* Disable/Enable checksum */
 		if (arg)
@@ -638,6 +844,12 @@ static int tun_chr_ioctl(struct inode *inode, struct file *file,
 		break;
 #endif
 
+	case TUNSETOFFLOAD:
+		rtnl_lock();
+		ret = set_offload(tun->dev, arg);
+		rtnl_unlock();
+		return ret;
+
 	case SIOCGIFFLAGS:
 		ifr.ifr_flags = tun->if_flags;
 		if (copy_to_user( argp, &ifr, sizeof ifr))
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 88aef7b..6860229 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -70,6 +70,7 @@ struct tun_struct {
 #define TUN_NO_PI	0x0040
 #define TUN_ONE_QUEUE	0x0080
 #define TUN_PERSIST 	0x0100	
+#define TUN_VNET_HDR 	0x0200
 
 /* Ioctl defines */
 #define TUNSETNOCSUM  _IOW('T', 200, int) 
@@ -78,12 +79,22 @@ struct tun_struct {
 #define TUNSETPERSIST _IOW('T', 203, int) 
 #define TUNSETOWNER   _IOW('T', 204, int)
 #define TUNSETLINK    _IOW('T', 205, int)
+#define TUNGETFEATURES _IOR('T', 207, unsigned int)
+#define TUNSETOFFLOAD _IOW('T', 208, unsigned int)
+#define TUNGETIFF      _IOR('T', 210, unsigned int)
 
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001
 #define IFF_TAP		0x0002
 #define IFF_NO_PI	0x1000
 #define IFF_ONE_QUEUE	0x2000
+#define IFF_VNET_HDR	0x4000
+
+/* Features for GSO (TUNSETOFFLOAD). */
+#define TUN_F_CSUM	0x01	/* You can hand me unchecksummed packets. */
+#define TUN_F_TSO4	0x02	/* I can handle TSO for IPv4 packets */
+#define TUN_F_TSO6	0x04	/* I can handle TSO for IPv6 packets */
+#define TUN_F_TSO_ECN	0x08	/* I can handle TSO with ECN bits. */
 
 struct tun_pi {
 	unsigned short flags;