From: Hideo AOKI <haoki@redhat.com>
Date: Fri, 4 Jan 2008 18:49:29 -0500
Subject: [net] udp: add memory accounting
Message-id: 477EC609.7030304@redhat.com
O-Subject: [RHEL 5.2 PATCH] bz#223593: [2/3] add sysctls of UDP memory limitation: add UDP memory accounting
Bugzilla: 223593

BZ#:
------
https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=223593

Description:
------------

This patch set implements one of the feature requests for RHEL 5.2. The
feature allows the maximum amount of memory used by UDP sockets to be
limited, as is already possible for TCP.

This patch adds the sysctl parameters for UDP memory limitation and the
actual receive buffer accounting to both UDP over IPv4 and UDP over IPv6.
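
Once applied, the new limits show up under /proc/sys/net/ipv4/. A minimal
user-space sketch for inspecting them (the file names come from the
ctl_table entries in this patch; everything else in the example is
illustrative only):

	#include <stdio.h>

	static void show(const char *path)
	{
		char buf[128];
		FILE *f = fopen(path, "r");

		if (!f) {
			perror(path);
			return;
		}
		if (fgets(buf, sizeof(buf), f))
			printf("%s: %s", path, buf);
		fclose(f);
	}

	int main(void)
	{
		show("/proc/sys/net/ipv4/udp_mem");      /* min pressure max, in pages */
		show("/proc/sys/net/ipv4/udp_rmem_min"); /* bytes */
		show("/proc/sys/net/ipv4/udp_wmem_min"); /* bytes */
		return 0;
	}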

To backport the original patch, I added new sysctl IDs in
include/linux/sysctl.h because the RHEL 5 kernel does not support
CTL_UNNUMBERED.
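
For comparison, a sketch of what the same udp_mem table entry looks like
upstream, where CTL_UNNUMBERED marks an entry that has no stable binary
sysctl number and is reachable only through /proc/sys (the exact upstream
field layout here is an assumption; the fixed-ID form actually used is in
the sysctl_net_ipv4.c hunk below):

	{
		.ctl_name	= CTL_UNNUMBERED,
		.procname	= "udp_mem",
		.data		= &sysctl_udp_mem,
		.maxlen		= sizeof(sysctl_udp_mem),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_minmax,
		.strategy	= &sysctl_intvec,
		.extra1		= &zero
	},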

This patch is based on 2.6.18-58.el5.

kABI Status:
------------
There are no kABI issues.

Brew:
-----
This patch set was built on all platforms. The task ID is 1094460 and the
build ID is 67754.

Upstream Status:
----------------
The original patch has been accepted into the net-2.6.25 tree.

http://git.kernel.org/?p=linux/kernel/git/davem/net-2.6.25.git;a=commitdiff;h=3f8aac37bd3883109aab7a3b4782c552b4d73535

Test Status:
------------
My colleague and I tested this patch set on i686, x86_64, and ia64.
We did network connection tests, sysctl parameter setting tests, and
stress tests using netperf.

Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: "David S. Miller" <davem@redhat.com>
Acked-by: Herbert Xu <herbert.xu@redhat.com>
Acked-by: Neil Horman <nhorman@redhat.com>
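
The hunks below repeat one receive-path idiom: sk->sk_forward_alloc is
protected by the socket lock, so while a process-context caller holds it
(e.g. around skb_free_datagram() in recvmsg), softirq delivery defers the
skb to the socket backlog and release_sock() replays it later. A
consolidated sketch of that idiom, using the kernel primitives that appear
in the patch (the wrapper name is illustrative only):

	#include <net/sock.h>
	#include <net/udp.h>

	static int example_udp_deliver(struct sock *sk, struct sk_buff *skb)
	{
		int ret = 0;

		bh_lock_sock_nested(sk);
		if (!sock_owned_by_user(sk))
			ret = udp_queue_rcv_skb(sk, skb);  /* charge rmem now */
		else
			sk_add_backlog(sk, skb);           /* defer to lock owner */
		bh_unlock_sock(sk);

		return ret;
	}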

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 307cd4e..ee2aac2 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -375,6 +375,33 @@ tcp_slow_start_after_idle - BOOLEAN
 	be timed out after an idle period.
 	Default: 1
 
+UDP variables:
+
+udp_mem - vector of 3 INTEGERs: min, pressure, max
+	Number of pages allowed for queueing by all UDP sockets.
+
+	min: Below this number of pages UDP is not bothered about its
+	memory appetite. When amount of memory allocated by UDP exceeds
+	this number, UDP starts to moderate memory usage.
+
+	pressure: This value was introduced to follow format of tcp_mem.
+
+	max: Number of pages allowed for queueing by all UDP sockets.
+
+	Default is calculated at boot time from amount of available memory.
+
+udp_rmem_min - INTEGER
+	Minimal size of receive buffer used by UDP sockets in moderation.
+	Each UDP socket is able to use the size for receiving data, even if
+	total pages of UDP sockets exceed udp_mem pressure. The unit is byte.
+	Default: 4096
+
+udp_wmem_min - INTEGER
+	Minimal size of send buffer used by UDP sockets in moderation.
+	Each UDP socket is able to use the size for sending data, even if
+	total pages of UDP sockets exceed udp_mem pressure. The unit is byte.
+	Default: 4096
+
 CIPSOv4 Variables:
 
 cipso_cache_enable - BOOLEAN
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 7cb777e..97b08e8 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -429,6 +429,9 @@ enum
 	NET_CIPSOV4_CACHE_BUCKET_SIZE=119,
 	NET_CIPSOV4_RBM_OPTFMT=120,
 	NET_CIPSOV4_RBM_STRICTVALID=121,
+	NET_UDP_MEM=122,
+	NET_UDP_RMEM_MIN=123,
+	NET_UDP_WMEM_MIN=124,
 };
 
 enum {
diff --git a/include/net/udp.h b/include/net/udp.h
index 766fba1..dd5e3b6 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -61,6 +61,13 @@ static inline int udp_lport_inuse(u16 num)
 
 extern struct proto udp_prot;
 
+extern atomic_t udp_memory_allocated;
+
+/* sysctl variables for udp */
+extern int sysctl_udp_mem[3];
+extern int sysctl_udp_rmem_min;
+extern int sysctl_udp_wmem_min;
+
 struct sk_buff;
 
 extern void	udp_err(struct sk_buff *, u32);
@@ -101,4 +108,6 @@ extern void udp_proc_unregister(struct udp_seq_afinfo *afinfo);
 extern int  udp4_proc_init(void);
 extern void udp4_proc_exit(void);
 #endif
+
+extern void udp_init(void);
 #endif	/* _UDP_H */
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 57e4dcb..af39956 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -134,6 +134,8 @@ void inet_sock_destruct(struct sock *sk)
 	__skb_queue_purge(&sk->sk_receive_queue);
 	__skb_queue_purge(&sk->sk_error_queue);
 
+	sk_mem_reclaim(sk);
+
 	if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) {
 		printk("Attempt to release TCP socket in state %d %p\n",
 		       sk->sk_state, sk);
@@ -1320,6 +1322,8 @@ static int __init inet_init(void)
 	/* Setup TCP slab cache for open requests. */
 	tcp_init();
 
+	/* Setup UDP memory threshold */
+	udp_init();
 
 	/*
 	 *	Set the ICMP layer up
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 40f1a60..db31572 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -65,7 +65,8 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
 		   fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count),
 		   tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated),
 		   atomic_read(&tcp_memory_allocated));
-	seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot));
+	seq_printf(seq, "UDP: inuse %d mem %d\n", fold_prot_inuse(&udp_prot),
+		   atomic_read(&udp_memory_allocated));
 	seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot));
 	seq_printf(seq,  "FRAG: inuse %d memory %d\n", ip_frag_nqueues,
 		   atomic_read(&ip_frag_mem));
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4e6aa6a..1b19654 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -18,6 +18,7 @@
 #include <net/ip.h>
 #include <net/route.h>
 #include <net/tcp.h>
+#include <net/udp.h>
 #include <net/cipso_ipv4.h>
 
 /* From af_inet.c */
@@ -798,6 +799,36 @@ ctl_table ipv4_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif /* CONFIG_NETLABEL */
+	{
+		.ctl_name	= NET_UDP_MEM,
+		.procname	= "udp_mem",
+		.data		= &sysctl_udp_mem,
+		.maxlen		= sizeof(sysctl_udp_mem),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero
+	},
+	{
+		.ctl_name	= NET_UDP_RMEM_MIN,
+		.procname	= "udp_rmem_min",
+		.data		= &sysctl_udp_rmem_min,
+		.maxlen		= sizeof(sysctl_udp_rmem_min),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero
+	},
+	{
+		.ctl_name	= NET_UDP_WMEM_MIN,
+		.procname	= "udp_wmem_min",
+		.data		= &sysctl_udp_wmem_min,
+		.maxlen		= sizeof(sysctl_udp_wmem_min),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero
+	},
 	{ .ctl_name = 0 }
 };
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 0bf7923..e443cff 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -81,6 +81,7 @@
 #include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
+#include <linux/bootmem.h>
 #include <linux/types.h>
 #include <linux/fcntl.h>
 #include <linux/module.h>
@@ -118,6 +119,17 @@ DEFINE_SNMP_STAT(struct udp_mib, udp_statistics) __read_mostly;
 struct hlist_head udp_hash[UDP_HTABLE_SIZE];
 DEFINE_RWLOCK(udp_hash_lock);
 
+int sysctl_udp_mem[3] __read_mostly;
+int sysctl_udp_rmem_min __read_mostly;
+int sysctl_udp_wmem_min __read_mostly;
+
+EXPORT_SYMBOL(sysctl_udp_mem);
+EXPORT_SYMBOL(sysctl_udp_rmem_min);
+EXPORT_SYMBOL(sysctl_udp_wmem_min);
+
+atomic_t udp_memory_allocated;
+EXPORT_SYMBOL(udp_memory_allocated);
+
 /* Shared by v4/v6 udp. */
 int udp_port_rover;
 
@@ -843,14 +855,18 @@ try_again:
 		err = skb->len - sizeof(struct udphdr);
   
 out_free:
+	lock_sock(sk);
   	skb_free_datagram(sk, skb);
+	release_sock(sk);
 out:
   	return err;
 
 csum_copy_err:
 	UDP_INC_STATS_BH(UDP_MIB_INERRORS);
 
+	lock_sock(sk);
 	skb_kill_datagram(sk, skb, flags);
+	release_sock(sk);
 
 	if (noblock)
 		return -EAGAIN;	
@@ -1067,7 +1083,15 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh,
 				skb1 = skb_clone(skb, GFP_ATOMIC);
 
 			if(skb1) {
-				int ret = udp_queue_rcv_skb(sk, skb1);
+				int ret = 0;
+
+				bh_lock_sock_nested(sk);
+				if (!sock_owned_by_user(sk))
+					ret = udp_queue_rcv_skb(sk, skb1);
+				else
+					sk_add_backlog(sk, skb1);
+				bh_unlock_sock(sk);
+
 				if (ret > 0)
 					/* we should probably re-process instead
 					 * of dropping packets here. */
@@ -1140,7 +1164,13 @@ int udp_rcv(struct sk_buff *skb)
 	sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex);
 
 	if (sk != NULL) {
-		int ret = udp_queue_rcv_skb(sk, skb);
+		int ret = 0;
+		bh_lock_sock_nested(sk);
+		if (!sock_owned_by_user(sk))
+			ret = udp_queue_rcv_skb(sk, skb);
+		else
+			sk_add_backlog(sk, skb);
+		bh_unlock_sock(sk);
 		sock_put(sk);
 
 		/* a return value > 0 means to resubmit the input, but
@@ -1389,6 +1419,10 @@ struct proto udp_prot = {
 	.hash		   = udp_v4_hash,
 	.unhash		   = udp_v4_unhash,
 	.get_port	   = udp_v4_get_port,
+	.memory_allocated  = &udp_memory_allocated,
+	.sysctl_mem	   = sysctl_udp_mem,
+	.sysctl_wmem	   = &sysctl_udp_wmem_min,
+	.sysctl_rmem	   = &sysctl_udp_rmem_min,
 	.obj_size	   = sizeof(struct udp_sock),
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_udp_setsockopt,
@@ -1581,6 +1615,25 @@ void udp4_proc_exit(void)
 }
 #endif /* CONFIG_PROC_FS */
 
+void __init udp_init(void)
+{
+	unsigned long limit;
+
+	/* Set the pressure threshold up by the same strategy of TCP. It is a
+	 * fraction of global memory that is up to 1/2 at 256 MB, decreasing
+	 * toward zero with the amount of memory, with a floor of 128 pages.
+	 */
+	limit = min(nr_all_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
+	limit = (limit * (nr_all_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
+	limit = max(limit, 128UL);
+	sysctl_udp_mem[0] = limit / 4 * 3;
+	sysctl_udp_mem[1] = limit;
+	sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2;
+
+	sysctl_udp_rmem_min = SK_MEM_QUANTUM;
+	sysctl_udp_wmem_min = SK_MEM_QUANTUM;
+}
+
 EXPORT_SYMBOL(udp_disconnect);
 EXPORT_SYMBOL(udp_hash);
 EXPORT_SYMBOL(udp_hash_lock);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index bf63dc1..1d3047c 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -293,12 +293,16 @@ try_again:
 		err = skb->len - sizeof(struct udphdr);
 
 out_free:
+	lock_sock(sk);
 	skb_free_datagram(sk, skb);
+	release_sock(sk);
 out:
 	return err;
 
 csum_copy_err:
+	lock_sock(sk);
 	skb_kill_datagram(sk, skb, flags);
+	release_sock(sk);
 
 	if (flags & MSG_DONTWAIT) {
 		UDP6_INC_STATS_USER(UDP_MIB_INERRORS);
@@ -424,10 +428,21 @@ static void udpv6_mcast_deliver(struct udphdr *uh,
 	while ((sk2 = udp_v6_mcast_next(sk_next(sk2), uh->dest, daddr,
 					uh->source, saddr, dif))) {
 		struct sk_buff *buff = skb_clone(skb, GFP_ATOMIC);
-		if (buff)
-			udpv6_queue_rcv_skb(sk2, buff);
+		if (buff) {
+			bh_lock_sock_nested(sk2);
+			if (!sock_owned_by_user(sk2))
+				udpv6_queue_rcv_skb(sk2, buff);
+			else
+				sk_add_backlog(sk2, buff);
+			bh_unlock_sock(sk2);
+		}
 	}
-	udpv6_queue_rcv_skb(sk, skb);
+	bh_lock_sock_nested(sk);
+	if (!sock_owned_by_user(sk))
+		udpv6_queue_rcv_skb(sk, skb);
+	else
+		sk_add_backlog(sk, skb);
+	bh_unlock_sock(sk);
 out:
 	read_unlock(&udp_hash_lock);
 }
@@ -512,7 +527,12 @@ static int udpv6_rcv(struct sk_buff **pskb)
 	
 	/* deliver */
 	
-	udpv6_queue_rcv_skb(sk, skb);
+	bh_lock_sock_nested(sk);
+	if (!sock_owned_by_user(sk))
+		udpv6_queue_rcv_skb(sk, skb);
+	else
+		sk_add_backlog(sk, skb);
+	bh_unlock_sock(sk);
 	sock_put(sk);
 	return(0);
 
@@ -1092,6 +1112,10 @@ struct proto udpv6_prot = {
 	.hash		   = udp_v6_hash,
 	.unhash		   = udp_v6_unhash,
 	.get_port	   = udp_v6_get_port,
+	.memory_allocated  = &udp_memory_allocated,
+	.sysctl_mem	   = sysctl_udp_mem,
+	.sysctl_wmem	   = &sysctl_udp_wmem_min,
+	.sysctl_rmem	   = &sysctl_udp_rmem_min,
 	.obj_size	   = sizeof(struct udp6_sock),
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_udpv6_setsockopt,