From: Neil Horman <nhorman@redhat.com>
Date: Thu, 8 Jan 2009 20:27:40 -0500
Subject: [net] deadlock in Hierarchical token bucket scheduler
Message-id: 20090109012740.GA2668@hmsendeavour.rdu.redhat.com
O-Subject: [RHEL 5.4 PATCH] net: Fix deadlock in Hierarchical token bucket scheduler (bz 474797)
Bugzilla: 474797
RH-Acked-by: David Miller <davem@redhat.com>

Hey all-
	Amazon reported seeing several instances of this deadlock on their
RHEL5 systems:
http://bugzilla.kernel.org/show_bug.cgi?id=8668
Flavio gave them this patch, a backport of these two upstream commits:
ee39e10c27ca5293c72addb95bff864095e19904
0929c2dd83317813425b937fbc0041013b8685ff

These commits fix the deadlock (which actually lives in the gen_estimator
code) and convert the HTB scheduler to use the gen_estimator fully.  Tested
by Amazon with good results.  Fixes bz 474797.

Neil

diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 3cad026..acc1ee0 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -79,27 +79,27 @@
 
 struct gen_estimator
 {
-	struct gen_estimator	*next;
+	struct list_head	list;
 	struct gnet_stats_basic	*bstats;
 	struct gnet_stats_rate_est	*rate_est;
 	spinlock_t		*stats_lock;
-	unsigned		interval;
 	int			ewma_log;
 	u64			last_bytes;
 	u32			last_packets;
 	u32			avpps;
 	u32			avbps;
+	struct rcu_head		e_rcu;
 };
 
 struct gen_estimator_head
 {
 	struct timer_list	timer;
-	struct gen_estimator	*list;
+	struct list_head	list;
 };
 
 static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
 
-/* Estimator array lock */
+/* Protects against NULL dereference */
 static DEFINE_RWLOCK(est_lock);
 
 static void est_timer(unsigned long arg)
@@ -107,13 +107,17 @@ static void est_timer(unsigned long arg)
 	int idx = (int)arg;
 	struct gen_estimator *e;
 
-	read_lock(&est_lock);
-	for (e = elist[idx].list; e; e = e->next) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(e, &elist[idx].list, list) {
 		u64 nbytes;
 		u32 npackets;
 		u32 rate;
 
 		spin_lock(e->stats_lock);
+		read_lock(&est_lock);
+		if (e->bstats == NULL)
+			goto skip;
+
 		nbytes = e->bstats->bytes;
 		npackets = e->bstats->packets;
 		rate = (nbytes - e->last_bytes)<<(7 - idx);
@@ -125,11 +129,14 @@ static void est_timer(unsigned long arg)
 		e->last_packets = npackets;
 		e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log;
 		e->rate_est->pps = (e->avpps+0x1FF)>>10;
+skip:
+		read_unlock(&est_lock);
 		spin_unlock(e->stats_lock);
 	}
 
-	mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4));
-	read_unlock(&est_lock);
+	if (!list_empty(&elist[idx].list))
+		mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4));
+	rcu_read_unlock();
 }
 
 /**
@@ -146,12 +153,17 @@ static void est_timer(unsigned long arg)
  * &rate_est with the statistics lock grabed during this period.
  * 
  * Returns 0 on success or a negative error code.
+ *
+ * NOTE: Called under rtnl_mutex
  */
 int gen_new_estimator(struct gnet_stats_basic *bstats,
-	struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock, struct rtattr *opt)
+		      struct gnet_stats_rate_est *rate_est,
+		      spinlock_t *stats_lock,
+		      struct rtattr *opt)
 {
 	struct gen_estimator *est;
 	struct gnet_estimator *parm = RTA_DATA(opt);
+	int idx;
 
 	if (RTA_PAYLOAD(opt) < sizeof(*parm))
 		return -EINVAL;
@@ -163,7 +175,7 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
 	if (est == NULL)
 		return -ENOBUFS;
 
-	est->interval = parm->interval + 2;
+	idx = parm->interval + 2;
 	est->bstats = bstats;
 	est->rate_est = rate_est;
 	est->stats_lock = stats_lock;
@@ -173,20 +185,25 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
 	est->last_packets = bstats->packets;
 	est->avpps = rate_est->pps<<10;
 
-	est->next = elist[est->interval].list;
-	if (est->next == NULL) {
-		init_timer(&elist[est->interval].timer);
-		elist[est->interval].timer.data = est->interval;
-		elist[est->interval].timer.expires = jiffies + ((HZ<<est->interval)/4);
-		elist[est->interval].timer.function = est_timer;
-		add_timer(&elist[est->interval].timer);
+	if (!elist[idx].timer.function) {
+		INIT_LIST_HEAD(&elist[idx].list);
+		setup_timer(&elist[idx].timer, est_timer, idx);
 	}
-	write_lock_bh(&est_lock);
-	elist[est->interval].list = est;
-	write_unlock_bh(&est_lock);
+
+	if (list_empty(&elist[idx].list))
+		mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4));
+
+	list_add_rcu(&est->list, &elist[idx].list);
 	return 0;
 }
 
+static void __gen_kill_estimator(struct rcu_head *head)
+{
+	struct gen_estimator *e = container_of(head,
+					struct gen_estimator, e_rcu);
+	kfree(e);
+}
+
 /**
  * gen_kill_estimator - remove a rate estimator
  * @bstats: basic statistics
@@ -194,31 +211,32 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
  *
  * Removes the rate estimator specified by &bstats and &rate_est
  * and deletes the timer.
+ *
+ * NOTE: Called under rtnl_mutex
  */
 void gen_kill_estimator(struct gnet_stats_basic *bstats,
 	struct gnet_stats_rate_est *rate_est)
 {
 	int idx;
-	struct gen_estimator *est, **pest;
+	struct gen_estimator *e, *n;
 
 	for (idx=0; idx <= EST_MAX_INTERVAL; idx++) {
-		int killed = 0;
-		pest = &elist[idx].list;
-		while ((est=*pest) != NULL) {
-			if (est->rate_est != rate_est || est->bstats != bstats) {
-				pest = &est->next;
+
+		/* Skip non initialized indexes */
+		if (!elist[idx].timer.function)
+			continue;
+
+		list_for_each_entry_safe(e, n, &elist[idx].list, list) {
+			if (e->rate_est != rate_est || e->bstats != bstats)
 				continue;
-			}
 
 			write_lock_bh(&est_lock);
-			*pest = est->next;
+			e->bstats = NULL;
 			write_unlock_bh(&est_lock);
 
-			kfree(est);
-			killed++;
+			list_del_rcu(&e->list);
+			call_rcu(&e->e_rcu, __gen_kill_estimator);
 		}
-		if (killed && elist[idx].list == NULL)
-			del_timer(&elist[idx].timer);
 	}
 }
 
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 880a339..9ca4189 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -69,9 +69,7 @@
 */
 
 #define HTB_HSIZE 16	/* classid hash size */
-#define HTB_EWMAC 2	/* rate average over HTB_EWMAC*HTB_HSIZE sec */
 #undef HTB_DEBUG	/* compile debugging support (activated by tc tool) */
-#define HTB_RATECM 1    /* whether to use rate computer */
 #define HTB_HYSTERESIS 1/* whether to use mode hysteresis for speedup */
 #define HTB_QLOCK(S) spin_lock_bh(&(S)->dev->queue_lock)
 #define HTB_QUNLOCK(S) spin_unlock_bh(&(S)->dev->queue_lock)
@@ -147,12 +145,6 @@ struct htb_class
     struct tc_htb_xstats xstats;/* our special stats */
     int refcnt;			/* usage count of this class */
 
-#ifdef HTB_RATECM
-    /* rate measurement counters */
-    unsigned long rate_bytes,sum_bytes;
-    unsigned long rate_packets,sum_packets;
-#endif
-
     /* topology */
     int level;			/* our level (see above) */
     struct htb_class *parent;	/* parent class */
@@ -247,10 +239,6 @@ struct htb_sched
     int rate2quantum;		/* quant = rate / rate2quantum */
     psched_time_t now;		/* cached dequeue time */
     struct timer_list timer;	/* send delay timer */
-#ifdef HTB_RATECM
-    struct timer_list rttim;	/* rate computer timer */
-    int recmp_bucket;		/* which hash bucket to recompute next */
-#endif
     
     /* non shaped skbs; let them go directly thru */
     struct sk_buff_head direct_queue;
@@ -783,35 +771,6 @@ static void htb_timer(unsigned long arg)
     netif_schedule(sch->dev);
 }
 
-#ifdef HTB_RATECM
-#define RT_GEN(D,R) R+=D-(R/HTB_EWMAC);D=0
-static void htb_rate_timer(unsigned long arg)
-{
-	struct Qdisc *sch = (struct Qdisc*)arg;
-	struct htb_sched *q = qdisc_priv(sch);
-	struct list_head *p;
-
-	/* lock queue so that we can muck with it */
-	HTB_QLOCK(sch);
-	HTB_DBG(10,1,"htb_rttmr j=%ld\n",jiffies);
-
-	q->rttim.expires = jiffies + HZ;
-	add_timer(&q->rttim);
-
-	/* scan and recompute one bucket at time */
-	if (++q->recmp_bucket >= HTB_HSIZE) 
-		q->recmp_bucket = 0;
-	list_for_each (p,q->hash+q->recmp_bucket) {
-		struct htb_class *cl = list_entry(p,struct htb_class,hlist);
-		HTB_DBG(10,2,"htb_rttmr_cl cl=%X sbyte=%lu spkt=%lu\n",
-				cl->classid,cl->sum_bytes,cl->sum_packets);
-		RT_GEN (cl->sum_bytes,cl->rate_bytes);
-		RT_GEN (cl->sum_packets,cl->rate_packets);
-	}
-	HTB_QUNLOCK(sch);
-}
-#endif
-
 /**
  * htb_charge_class - charges amount "bytes" to leaf and ancestors
  *
@@ -875,11 +834,6 @@ static void htb_charge_class(struct htb_sched *q,struct htb_class *cl,
 				htb_add_to_wait_tree (q,cl,diff,1);
 		}
 		
-#ifdef HTB_RATECM
-		/* update rate counters */
-		cl->sum_bytes += bytes; cl->sum_packets++;
-#endif
-
 		/* update byte stats except for leaves which are already updated */
 		if (cl->level) {
 			cl->bstats.bytes += bytes;
@@ -1272,13 +1226,6 @@ static int htb_init(struct Qdisc *sch, struct rtattr *opt)
 	q->timer.function = htb_timer;
 	q->timer.data = (unsigned long)sch;
 
-#ifdef HTB_RATECM
-	init_timer(&q->rttim);
-	q->rttim.function = htb_rate_timer;
-	q->rttim.data = (unsigned long)sch;
-	q->rttim.expires = jiffies + HZ;
-	add_timer(&q->rttim);
-#endif
 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
 		q->rate2quantum = 1;
 	q->defcls = gopt->defcls;
@@ -1360,11 +1307,6 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 {
 	struct htb_class *cl = (struct htb_class*)arg;
 
-#ifdef HTB_RATECM
-	cl->rate_est.bps = cl->rate_bytes/(HTB_EWMAC*HTB_HSIZE);
-	cl->rate_est.pps = cl->rate_packets/(HTB_EWMAC*HTB_HSIZE);
-#endif
-
 	if (!cl->level && cl->un.leaf.q)
 		cl->qstats.qlen = cl->un.leaf.q->q.qlen;
 	cl->xstats.tokens = cl->tokens;
@@ -1439,6 +1381,7 @@ static void htb_destroy_class(struct Qdisc* sch,struct htb_class *cl)
 		sch->q.qlen -= cl->un.leaf.q->q.qlen;
 		qdisc_destroy(cl->un.leaf.q);
 	}
+	gen_kill_estimator(&cl->bstats, &cl->rate_est);
 	qdisc_put_rtab(cl->rate);
 	qdisc_put_rtab(cl->ceil);
 	
@@ -1468,9 +1411,6 @@ static void htb_destroy(struct Qdisc* sch)
 	HTB_DBG(0,1,"htb_destroy q=%p\n",q);
 
 	del_timer_sync (&q->timer);
-#ifdef HTB_RATECM
-	del_timer_sync (&q->rttim);
-#endif
 	/* This line used to be after htb_destroy_class call below
 	   and surprisingly it worked in 2.4. But it must precede it 
 	   because filter need its target class alive to be able to call
@@ -1549,6 +1489,21 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 
 	if (!cl) { /* new class */
 		struct Qdisc *new_q;
+		struct {
+			struct rtattr		rta;
+			struct gnet_estimator	opt;
+		} est = {
+			.rta = {
+				.rta_len	= RTA_LENGTH(sizeof(est.opt)),
+				.rta_type	= TCA_RATE,
+			},
+			.opt = {
+				/* 4s interval, 16s averaging constant */
+				.interval	= 2,
+				.ewma_log	= 2,
+			},
+		};
+
 		/* check for valid classid */
 		if (!classid || TC_H_MAJ(classid^sch->handle) || htb_find(classid,sch))
 			goto failure;
@@ -1562,6 +1517,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 		if ((cl = kzalloc(sizeof(*cl), GFP_KERNEL)) == NULL)
 			goto failure;
 		
+		gen_new_estimator(&cl->bstats, &cl->rate_est,
+				  &sch->dev->queue_lock,
+				  tca[TCA_RATE-1] ? : &est.rta);
 		cl->refcnt = 1;
 		INIT_LIST_HEAD(&cl->sibling);
 		INIT_LIST_HEAD(&cl->hlist);
@@ -1614,7 +1572,13 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 			cl->pq_node.rb_color = -1;
 		}
 #endif
-	} else sch_tree_lock(sch);
+	} else {
+		if (tca[TCA_RATE-1])
+			gen_replace_estimator(&cl->bstats, &cl->rate_est,
+					      &sch->dev->queue_lock,
+					      tca[TCA_RATE-1]);
+ 		sch_tree_lock(sch);
+	}
 
 	/* it used to be a nasty bug here, we have to check that node
            is really leaf before changing cl->un.leaf ! */