From 38a1e6fb3293f47698209077a8d4907d7b8e7685 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin <markmc@redhat.com> Date: Tue, 30 Jun 2009 11:01:17 +0100 Subject: [PATCH] net: add '-net tap,sndbuf=' option with a sensible default https://bugzilla.redhat.com/508861 In a bridged configuration, we should avoid overflowing the physical NIC's TX queue and dropping packets. The 5.4 kernel adds a new TUNSETSNDBUF ioctl() which allows a send buffer limit for the tap device to be specified. When this limit is reached, a tap write() will return EAGAIN and poll() will indicate the fd isn't writable. See: https://bugzilla.redhat.com/495863 This allows people to tune their setups so as to avoid e.g. UDP packet loss when the sending application in the guest out-runs the NIC in the host. For best effect, sndbuf= should be set to just below the capacity of the physical NIC. Setting it higher will cause packets to be dropped before the limit is hit. Setting it much lower will not cause any problems unless you set it low enough such that the guest cannot queue up new packets before the NIC has emptied its queue. In Linux, txqueuelen=1000 by default for ethernet NICs. Given a 1500 byte MTU, 1Mb is a good choice for sndbuf. If it turns out that txqueuelen is actually much lower than this, then sndbuf is essentially disabled. In the event that txqueuelen is much higher, it's unlikely that the NIC will be able to empty a 1Mb queue. Also, note that when using a bridge with netfilter enabled, we currently never get EAGAIN because netfilter causes the packet to be immediately orphaned. Set /proc/sys/net/bridge/bridge nf-call-iptables to zero to disable this behaviour. The sndbuf= parameter is upstream as of this commit: http://git.savannah.gnu.org/cgit/qemu.git/commit/?id=0df0ff6de7 I've just now sent a patch upstream to set a default value. The proposed patch for 5.4 isn't a simple cherry-pick of the code from upstream, as upstream has an internal API for buffering which would have been too invasive to backport. Signed-off-by: Mark McLoughlin <markmc@redhat.com> Message-Id: <1246356077.3749.34.camel@blaa> Signed-off-by: Eduardo Habkost <ehabkost@redhat.com> Bugzilla: 508861 RH-Upstream-status: backported-from-upstream Acked-by: Herbert Xu <herbert.xu@redhat.com> Acked-by: Juan Quintela <quintela@redhat.com> Acked-by: john cooper <john.cooper@redhat.com> Acked-by: "Michael S. Tsirkin" <mst@redhat.com> --- qemu/hw/virtio-net.c | 47 ++++++++++++++++++- qemu/net.c | 126 +++++++++++++++++++++++++++++++++++++++++++------ qemu/net.h | 1 + 3 files changed, 156 insertions(+), 18 deletions(-) diff --git a/qemu/hw/virtio-net.c b/qemu/hw/virtio-net.c index 54a965e..58e59f5 100644 --- a/qemu/hw/virtio-net.c +++ b/qemu/hw/virtio-net.c @@ -31,6 +31,12 @@ typedef struct VirtIONet VLANClientState *vc; QEMUTimer *tx_timer; int tx_timer_active; + struct { + VirtQueueElement elem; + ssize_t len; + struct iovec *out_sg; + unsigned int out_num; + } tx_queue; int mergeable_rx_bufs; } VirtIONet; @@ -331,11 +337,17 @@ static void virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq) if (!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK)) return; + if (n->tx_queue.out_num) { + virtio_queue_set_notification(n->tx_vq, 0); + return; + } + while (virtqueue_pop(vq, &elem)) { ssize_t len = 0; unsigned int out_num = elem.out_num; struct iovec *out_sg = &elem.out_sg[0]; unsigned hdr_len; + int ret; /* hdr_len refers to the header received from the guest */ hdr_len = n->mergeable_rx_bufs ? @@ -359,9 +371,17 @@ static void virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq) len += hdr_len; } - len += qemu_sendv_packet(n->vc, out_sg, out_num); + ret = qemu_sendv_packet(n->vc, out_sg, out_num); + if (ret == -EAGAIN) { + virtio_queue_set_notification(n->tx_vq, 0); + n->tx_queue.elem = elem; + n->tx_queue.len = len; + n->tx_queue.out_sg = out_sg; + n->tx_queue.out_num = out_num; + return; + } - virtqueue_push(vq, &elem, len); + virtqueue_push(vq, &elem, len + ret); virtio_notify(&n->vdev, vq); } } @@ -397,6 +417,28 @@ static void virtio_net_tx_timer(void *opaque) virtio_net_flush_tx(n, n->tx_vq); } +static void virtio_net_retry_tx(void *opaque) +{ + VirtIONet *n = opaque; + + if (n->tx_queue.out_num) { + int ret; + + ret = qemu_sendv_packet(n->vc, n->tx_queue.out_sg, n->tx_queue.out_num); + if (ret == -EAGAIN) { + return; + } + + virtqueue_push(n->tx_vq, &n->tx_queue.elem, n->tx_queue.len + ret); + virtio_notify(&n->vdev, n->tx_vq); + + n->tx_queue.out_num = 0; + } + + virtio_queue_set_notification(n->tx_vq, 1); + virtio_net_flush_tx(n, n->tx_vq); +} + static void virtio_net_save(QEMUFile *f, void *opaque) { VirtIONet *n = opaque; @@ -480,6 +522,7 @@ PCIDevice *virtio_net_init(PCIBus *bus, NICInfo *nd, int devfn) n->vc->cleanup = virtio_net_cleanup; n->vc->link_status_changed = virtio_net_set_link_status; n->vc->fd_read_raw = virtio_net_receive_raw; + n->vc->fd_writable = virtio_net_retry_tx; qemu_format_nic_info_str(n->vc, n->mac); diff --git a/qemu/net.c b/qemu/net.c index 6cc9bc5..5bfbc79 100644 --- a/qemu/net.c +++ b/qemu/net.c @@ -496,7 +496,7 @@ ssize_t qemu_sendv_packet(VLANClientState *vc1, const struct iovec *iov, { VLANState *vlan = vc1->vlan; VLANClientState *vc; - ssize_t max_len = 0; + ssize_t max_len = -EAGAIN; if (vc1->link_down) return calc_iov_length(iov, iovcnt); @@ -752,8 +752,55 @@ typedef struct TAPState { int size; unsigned int has_vnet_hdr : 1; unsigned int using_vnet_hdr : 1; + unsigned int read_poll : 1; + unsigned int write_poll : 1; } TAPState; +static int tap_can_send(void *opaque); +static void tap_send(void *opaque); +static void tap_writable(void *opaque); + +static void tap_update_fd_handler(TAPState *s) +{ + qemu_set_fd_handler2(s->fd, + s->read_poll ? tap_can_send : NULL, + s->read_poll ? tap_send : NULL, + s->write_poll ? tap_writable : NULL, + s); +} + +static void tap_read_poll(TAPState *s, int enable) +{ + s->read_poll = !!enable; + tap_update_fd_handler(s); +} + +static void tap_write_poll(TAPState *s, int enable) +{ + s->write_poll = !!enable; + tap_update_fd_handler(s); +} + +static void tap_writable(void *opaque) +{ + TAPState *s = opaque; + VLANClientState *vc; + + tap_write_poll(s, 0); + + for (vc = s->vc->vlan->first_client; vc; vc = vc->next) { + if (vc == s->vc) { + continue; + } + + if (!vc->fd_writable) { + continue; + } + + vc->fd_writable(vc->opaque); + } +} + #ifdef HAVE_IOVEC static ssize_t tap_receive_iov(void *opaque, const struct iovec *iov, int iovcnt) @@ -763,7 +810,12 @@ static ssize_t tap_receive_iov(void *opaque, const struct iovec *iov, do { len = writev(s->fd, iov, iovcnt); - } while (len == -1 && (errno == EINTR || errno == EAGAIN)); + } while (len == -1 && errno == EINTR); + + if (len == -1 && errno == EAGAIN) { + tap_write_poll(s, 1); + return -EAGAIN; + } return len; } @@ -956,6 +1008,39 @@ static void tap_set_offload(VLANClientState *vc, int csum, int tso4, int tso6, } #endif /* TUNSETOFFLOAD */ +#ifdef TUNSETSNDBUF +/* sndbuf should be set to a value lower than the tx queue + * capacity of any destination network interface. + * Ethernet NICs generally have txqueuelen=1000, so 1Mb is + * a good default, given a 1500 byte MTU. + */ +#define TAP_DEFAULT_SNDBUF 1024*1024 + +static void tap_set_sndbuf(TAPState *s, const char *sndbuf_str) +{ + int sndbuf = TAP_DEFAULT_SNDBUF; + + if (sndbuf_str) { + sndbuf = atoi(sndbuf_str); + } + + if (!sndbuf) { + sndbuf = INT_MAX; + } + + if (ioctl(s->fd, TUNSETSNDBUF, &sndbuf) == -1) { + fprintf(stderr, "TUNSETSNDBUF ioctl failed: %s\n", strerror(errno)); + } +} +#else +static void tap_set_sndbuf(TAPState *s, const char *sndbuf_str) +{ + if (sndbuf_str) { + fprintf(stderr, "No '-net tap,sndbuf=<nbytes>' support available\n"); + } +} +#endif /* TUNSETSNDBUF */ + static int launch_script(const char *setup_script, const char *ifname, int fd); static void tap_cleanup(VLANClientState *vc) @@ -965,7 +1050,8 @@ static void tap_cleanup(VLANClientState *vc) if (s->down_script[0]) launch_script(s->down_script, s->down_script_arg, s->fd); - qemu_set_fd_handler(s->fd, NULL, NULL, NULL); + tap_read_poll(s, 0); + tap_write_poll(s, 0); close(s->fd); qemu_free(s); } @@ -994,7 +1080,7 @@ static TAPState *net_tap_fd_init(VLANState *vlan, #ifdef TUNSETOFFLOAD s->vc->set_offload = tap_set_offload; #endif - qemu_set_fd_handler2(s->fd, tap_can_send, tap_send, NULL, s); + tap_read_poll(s, 1); snprintf(s->vc->info_str, sizeof(s->vc->info_str), "fd=%d", fd); return s; } @@ -1238,9 +1324,9 @@ static int launch_script(const char *setup_script, const char *ifname, int fd) return 0; } -static int net_tap_init(VLANState *vlan, const char *model, - const char *name, const char *ifname1, - const char *setup_script, const char *down_script) +static TAPState *net_tap_init(VLANState *vlan, const char *model, + const char *name, const char *ifname1, + const char *setup_script, const char *down_script) { TAPState *s; int fd; @@ -1254,17 +1340,17 @@ static int net_tap_init(VLANState *vlan, const char *model, vnet_hdr = 0; TFR(fd = tap_open(ifname, sizeof(ifname), &vnet_hdr)); if (fd < 0) - return -1; + return NULL; if (!setup_script || !strcmp(setup_script, "no")) setup_script = ""; if (setup_script[0] != '\0') { if (launch_script(setup_script, ifname, fd)) - return -1; + return NULL; } s = net_tap_fd_init(vlan, model, name, fd, vnet_hdr); if (!s) - return -1; + return NULL; snprintf(s->vc->info_str, sizeof(s->vc->info_str), "ifname=%s,script=%s,downscript=%s", @@ -1273,7 +1359,7 @@ static int net_tap_init(VLANState *vlan, const char *model, snprintf(s->down_script, sizeof(s->down_script), "%s", down_script); snprintf(s->down_script_arg, sizeof(s->down_script_arg), "%s", ifname); } - return 0; + return s; } #endif /* !_WIN32 */ @@ -1922,15 +2008,13 @@ int net_client_init(const char *device, const char *p) if (!strcmp(device, "tap")) { char ifname[64]; char setup_script[1024], down_script[1024]; + TAPState *s; int fd; vlan->nb_host_devs++; if (get_param_value(buf, sizeof(buf), "fd", p) > 0) { fd = strtol(buf, NULL, 0); fcntl(fd, F_SETFL, O_NONBLOCK); - ret = -1; - if (net_tap_fd_init(vlan, device, name, fd, - tap_probe_vnet_hdr(fd))) - ret = 0; + s = net_tap_fd_init(vlan, device, name, fd, tap_probe_vnet_hdr(fd)); } else { if (get_param_value(ifname, sizeof(ifname), "ifname", p) <= 0) { ifname[0] = '\0'; @@ -1941,7 +2025,17 @@ int net_client_init(const char *device, const char *p) if (get_param_value(down_script, sizeof(down_script), "downscript", p) == 0) { pstrcpy(down_script, sizeof(down_script), DEFAULT_NETWORK_DOWN_SCRIPT); } - ret = net_tap_init(vlan, device, name, ifname, setup_script, down_script); + s = net_tap_init(vlan, device, name, ifname, setup_script, down_script); + } + if (s != NULL) { + const char *sndbuf_str = NULL; + if (get_param_value(buf, sizeof(buf), "sndbuf", p)) { + sndbuf_str = buf; + } + tap_set_sndbuf(s, sndbuf_str); + ret = 0; + } else { + ret = -1; } } else #endif diff --git a/qemu/net.h b/qemu/net.h index 47a2461..f6710be 100644 --- a/qemu/net.h +++ b/qemu/net.h @@ -20,6 +20,7 @@ struct VLANClientState { /* Packets may still be sent if this returns zero. It's used to rate-limit the slirp code. */ IOCanRWHandler *fd_can_read; + IOHandler *fd_writable; NetCleanup *cleanup; LinkStatusChanged *link_status_changed; int link_down; -- 1.6.3.rc4.29.g8146