Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > media > main-src > by-pkgid > aadbe78a25743146bb784eee19f007c5 > files > 478

kvm-83-164.el5_5.9.src.rpm

From 38a1e6fb3293f47698209077a8d4907d7b8e7685 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Tue, 30 Jun 2009 11:01:17 +0100
Subject: [PATCH] net: add '-net tap,sndbuf=' option with a sensible default

https://bugzilla.redhat.com/508861

In a bridged configuration, we should avoid overflowing the physical
NIC's TX queue and dropping packets.

The 5.4 kernel adds a new TUNSETSNDBUF ioctl() which allows a send
buffer limit for the tap device to be specified. When this limit is
reached, a tap write() will return EAGAIN and poll() will indicate the fd
isn't writable. See:

  https://bugzilla.redhat.com/495863

This allows people to tune their setups so as to avoid e.g. UDP packet
loss when the sending application in the guest out-runs the NIC in the
host.

For best effect, sndbuf= should be set to just below the capacity of
the physical NIC.

Setting it higher will cause packets to be dropped before the limit
is hit. Setting it much lower will not cause any problems unless
you set it low enough such that the guest cannot queue up new packets
before the NIC has emptied its queue.

In Linux, txqueuelen=1000 by default for ethernet NICs. Given a 1500
byte MTU, 1Mb is a good choice for sndbuf.

If it turns out that txqueuelen is actually much lower than this, then
sndbuf is essentially disabled. In the event that txqueuelen is much
higher, it's unlikely that the NIC will be able to empty a 1Mb queue.

Also, note that when using a bridge with netfilter enabled, we currently
never get EAGAIN because netfilter causes the packet to be immediately
orphaned. Set /proc/sys/net/bridge/bridge nf-call-iptables to zero to
disable this behaviour.

The sndbuf= parameter is upstream as of this commit:

  http://git.savannah.gnu.org/cgit/qemu.git/commit/?id=0df0ff6de7

I've just now sent a patch upstream to set a default value.

The proposed patch for 5.4 isn't a simple cherry-pick of the code from
upstream, as upstream has an internal API for buffering which would have
been too invasive to backport.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Message-Id: <1246356077.3749.34.camel@blaa>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Bugzilla: 508861
RH-Upstream-status: backported-from-upstream
Acked-by: Herbert Xu <herbert.xu@redhat.com>
Acked-by: Juan Quintela <quintela@redhat.com>
Acked-by: john cooper <john.cooper@redhat.com>
Acked-by: "Michael S. Tsirkin" <mst@redhat.com>
---
 qemu/hw/virtio-net.c |   47 ++++++++++++++++++-
 qemu/net.c           |  126 +++++++++++++++++++++++++++++++++++++++++++------
 qemu/net.h           |    1 +
 3 files changed, 156 insertions(+), 18 deletions(-)

diff --git a/qemu/hw/virtio-net.c b/qemu/hw/virtio-net.c
index 54a965e..58e59f5 100644
--- a/qemu/hw/virtio-net.c
+++ b/qemu/hw/virtio-net.c
@@ -31,6 +31,12 @@ typedef struct VirtIONet
     VLANClientState *vc;
     QEMUTimer *tx_timer;
     int tx_timer_active;
+    struct {
+        VirtQueueElement elem;
+        ssize_t len;
+        struct iovec *out_sg;
+        unsigned int out_num;
+    } tx_queue;
     int mergeable_rx_bufs;
 } VirtIONet;
 
@@ -331,11 +337,17 @@ static void virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
     if (!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK))
         return;
 
+    if (n->tx_queue.out_num) {
+        virtio_queue_set_notification(n->tx_vq, 0);
+        return;
+    }
+
     while (virtqueue_pop(vq, &elem)) {
         ssize_t len = 0;
         unsigned int out_num = elem.out_num;
         struct iovec *out_sg = &elem.out_sg[0];
         unsigned hdr_len;
+        int ret;
 
         /* hdr_len refers to the header received from the guest */
         hdr_len = n->mergeable_rx_bufs ?
@@ -359,9 +371,17 @@ static void virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
             len += hdr_len;
         }
 
-        len += qemu_sendv_packet(n->vc, out_sg, out_num);
+        ret = qemu_sendv_packet(n->vc, out_sg, out_num);
+        if (ret == -EAGAIN) {
+            virtio_queue_set_notification(n->tx_vq, 0);
+            n->tx_queue.elem = elem;
+            n->tx_queue.len = len;
+            n->tx_queue.out_sg = out_sg;
+            n->tx_queue.out_num = out_num;
+            return;
+        }
 
-        virtqueue_push(vq, &elem, len);
+        virtqueue_push(vq, &elem, len + ret);
         virtio_notify(&n->vdev, vq);
     }
 }
@@ -397,6 +417,28 @@ static void virtio_net_tx_timer(void *opaque)
     virtio_net_flush_tx(n, n->tx_vq);
 }
 
+static void virtio_net_retry_tx(void *opaque)
+{
+    VirtIONet *n = opaque;
+
+    if (n->tx_queue.out_num) {
+        int ret;
+
+        ret = qemu_sendv_packet(n->vc, n->tx_queue.out_sg, n->tx_queue.out_num);
+        if (ret == -EAGAIN) {
+            return;
+        }
+
+        virtqueue_push(n->tx_vq, &n->tx_queue.elem, n->tx_queue.len + ret);
+        virtio_notify(&n->vdev, n->tx_vq);
+
+        n->tx_queue.out_num = 0;
+    }
+
+    virtio_queue_set_notification(n->tx_vq, 1);
+    virtio_net_flush_tx(n, n->tx_vq);
+}
+
 static void virtio_net_save(QEMUFile *f, void *opaque)
 {
     VirtIONet *n = opaque;
@@ -480,6 +522,7 @@ PCIDevice *virtio_net_init(PCIBus *bus, NICInfo *nd, int devfn)
     n->vc->cleanup = virtio_net_cleanup;
     n->vc->link_status_changed = virtio_net_set_link_status;
     n->vc->fd_read_raw = virtio_net_receive_raw;
+    n->vc->fd_writable = virtio_net_retry_tx;
 
     qemu_format_nic_info_str(n->vc, n->mac);
 
diff --git a/qemu/net.c b/qemu/net.c
index 6cc9bc5..5bfbc79 100644
--- a/qemu/net.c
+++ b/qemu/net.c
@@ -496,7 +496,7 @@ ssize_t qemu_sendv_packet(VLANClientState *vc1, const struct iovec *iov,
 {
     VLANState *vlan = vc1->vlan;
     VLANClientState *vc;
-    ssize_t max_len = 0;
+    ssize_t max_len = -EAGAIN;
 
     if (vc1->link_down)
         return calc_iov_length(iov, iovcnt);
@@ -752,8 +752,55 @@ typedef struct TAPState {
     int size;
     unsigned int has_vnet_hdr : 1;
     unsigned int using_vnet_hdr : 1;
+    unsigned int read_poll : 1;
+    unsigned int write_poll : 1;
 } TAPState;
 
+static int tap_can_send(void *opaque);
+static void tap_send(void *opaque);
+static void tap_writable(void *opaque);
+
+static void tap_update_fd_handler(TAPState *s)
+{
+    qemu_set_fd_handler2(s->fd,
+                         s->read_poll  ? tap_can_send : NULL,
+                         s->read_poll  ? tap_send     : NULL,
+                         s->write_poll ? tap_writable : NULL,
+                         s);
+}
+
+static void tap_read_poll(TAPState *s, int enable)
+{
+    s->read_poll = !!enable;
+    tap_update_fd_handler(s);
+}
+
+static void tap_write_poll(TAPState *s, int enable)
+{
+    s->write_poll = !!enable;
+    tap_update_fd_handler(s);
+}
+
+static void tap_writable(void *opaque)
+{
+    TAPState *s = opaque;
+    VLANClientState *vc;
+
+    tap_write_poll(s, 0);
+
+    for (vc = s->vc->vlan->first_client; vc; vc = vc->next) {
+        if (vc == s->vc) {
+            continue;
+        }
+
+        if (!vc->fd_writable) {
+            continue;
+        }
+
+        vc->fd_writable(vc->opaque);
+    }
+}
+
 #ifdef HAVE_IOVEC
 static ssize_t tap_receive_iov(void *opaque, const struct iovec *iov,
                                int iovcnt)
@@ -763,7 +810,12 @@ static ssize_t tap_receive_iov(void *opaque, const struct iovec *iov,
 
     do {
         len = writev(s->fd, iov, iovcnt);
-    } while (len == -1 && (errno == EINTR || errno == EAGAIN));
+    } while (len == -1 && errno == EINTR);
+
+    if (len == -1 && errno == EAGAIN) {
+        tap_write_poll(s, 1);
+        return -EAGAIN;
+    }
 
     return len;
 }
@@ -956,6 +1008,39 @@ static void tap_set_offload(VLANClientState *vc, int csum, int tso4, int tso6,
 }
 #endif /* TUNSETOFFLOAD */
 
+#ifdef TUNSETSNDBUF
+/* sndbuf should be set to a value lower than the tx queue
+ * capacity of any destination network interface.
+ * Ethernet NICs generally have txqueuelen=1000, so 1Mb is
+ * a good default, given a 1500 byte MTU.
+ */
+#define TAP_DEFAULT_SNDBUF 1024*1024
+
+static void tap_set_sndbuf(TAPState *s, const char *sndbuf_str)
+{
+    int sndbuf = TAP_DEFAULT_SNDBUF;
+
+    if (sndbuf_str) {
+        sndbuf = atoi(sndbuf_str);
+    }
+
+    if (!sndbuf) {
+        sndbuf = INT_MAX;
+    }
+
+    if (ioctl(s->fd, TUNSETSNDBUF, &sndbuf) == -1) {
+        fprintf(stderr, "TUNSETSNDBUF ioctl failed: %s\n", strerror(errno));
+    }
+}
+#else
+static void tap_set_sndbuf(TAPState *s, const char *sndbuf_str)
+{
+    if (sndbuf_str) {
+        fprintf(stderr, "No '-net tap,sndbuf=<nbytes>' support available\n");
+    }
+}
+#endif /* TUNSETSNDBUF */
+
 static int launch_script(const char *setup_script, const char *ifname, int fd);
 
 static void tap_cleanup(VLANClientState *vc)
@@ -965,7 +1050,8 @@ static void tap_cleanup(VLANClientState *vc)
     if (s->down_script[0])
         launch_script(s->down_script, s->down_script_arg, s->fd);
 
-    qemu_set_fd_handler(s->fd, NULL, NULL, NULL);
+    tap_read_poll(s, 0);
+    tap_write_poll(s, 0);
     close(s->fd);
     qemu_free(s);
 }
@@ -994,7 +1080,7 @@ static TAPState *net_tap_fd_init(VLANState *vlan,
 #ifdef TUNSETOFFLOAD
     s->vc->set_offload = tap_set_offload;
 #endif
-    qemu_set_fd_handler2(s->fd, tap_can_send, tap_send, NULL, s);
+    tap_read_poll(s, 1);
     snprintf(s->vc->info_str, sizeof(s->vc->info_str), "fd=%d", fd);
     return s;
 }
@@ -1238,9 +1324,9 @@ static int launch_script(const char *setup_script, const char *ifname, int fd)
     return 0;
 }
 
-static int net_tap_init(VLANState *vlan, const char *model,
-                        const char *name, const char *ifname1,
-                        const char *setup_script, const char *down_script)
+static TAPState *net_tap_init(VLANState *vlan, const char *model,
+                              const char *name, const char *ifname1,
+                              const char *setup_script, const char *down_script)
 {
     TAPState *s;
     int fd;
@@ -1254,17 +1340,17 @@ static int net_tap_init(VLANState *vlan, const char *model,
     vnet_hdr = 0;
     TFR(fd = tap_open(ifname, sizeof(ifname), &vnet_hdr));
     if (fd < 0)
-        return -1;
+        return NULL;
 
     if (!setup_script || !strcmp(setup_script, "no"))
         setup_script = "";
     if (setup_script[0] != '\0') {
 	if (launch_script(setup_script, ifname, fd))
-	    return -1;
+	    return NULL;
     }
     s = net_tap_fd_init(vlan, model, name, fd, vnet_hdr);
     if (!s)
-        return -1;
+        return NULL;
 
     snprintf(s->vc->info_str, sizeof(s->vc->info_str),
              "ifname=%s,script=%s,downscript=%s",
@@ -1273,7 +1359,7 @@ static int net_tap_init(VLANState *vlan, const char *model,
         snprintf(s->down_script, sizeof(s->down_script), "%s", down_script);
         snprintf(s->down_script_arg, sizeof(s->down_script_arg), "%s", ifname);
     }
-    return 0;
+    return s;
 }
 
 #endif /* !_WIN32 */
@@ -1922,15 +2008,13 @@ int net_client_init(const char *device, const char *p)
     if (!strcmp(device, "tap")) {
         char ifname[64];
         char setup_script[1024], down_script[1024];
+        TAPState *s;
         int fd;
         vlan->nb_host_devs++;
         if (get_param_value(buf, sizeof(buf), "fd", p) > 0) {
             fd = strtol(buf, NULL, 0);
             fcntl(fd, F_SETFL, O_NONBLOCK);
-            ret = -1;
-                if (net_tap_fd_init(vlan, device, name, fd,
-                                    tap_probe_vnet_hdr(fd)))
-                ret = 0;
+            s = net_tap_fd_init(vlan, device, name, fd, tap_probe_vnet_hdr(fd));
         } else {
             if (get_param_value(ifname, sizeof(ifname), "ifname", p) <= 0) {
                 ifname[0] = '\0';
@@ -1941,7 +2025,17 @@ int net_client_init(const char *device, const char *p)
             if (get_param_value(down_script, sizeof(down_script), "downscript", p) == 0) {
                 pstrcpy(down_script, sizeof(down_script), DEFAULT_NETWORK_DOWN_SCRIPT);
             }
-            ret = net_tap_init(vlan, device, name, ifname, setup_script, down_script);
+            s = net_tap_init(vlan, device, name, ifname, setup_script, down_script);
+        }
+        if (s != NULL) {
+            const char *sndbuf_str = NULL;
+            if (get_param_value(buf, sizeof(buf), "sndbuf", p)) {
+                sndbuf_str = buf;
+            }
+            tap_set_sndbuf(s, sndbuf_str);
+            ret = 0;
+        } else {
+            ret = -1;
         }
     } else
 #endif
diff --git a/qemu/net.h b/qemu/net.h
index 47a2461..f6710be 100644
--- a/qemu/net.h
+++ b/qemu/net.h
@@ -20,6 +20,7 @@ struct VLANClientState {
     /* Packets may still be sent if this returns zero.  It's used to
        rate-limit the slirp code.  */
     IOCanRWHandler *fd_can_read;
+    IOHandler *fd_writable;
     NetCleanup *cleanup;
     LinkStatusChanged *link_status_changed;
     int link_down;
-- 
1.6.3.rc4.29.g8146