From: Neil Horman <nhorman@redhat.com>
Date: Thu, 8 Jan 2009 20:27:40 -0500
Subject: [net] deadlock in Hierarchical token bucket scheduler
Message-id: 20090109012740.GA2668@hmsendeavour.rdu.redhat.com
O-Subject: [RHEL 5.4 PATCH] net: Fix deadlock in Hierarchical token bucket scheduler (bz 474797)
Bugzilla: 474797
RH-Acked-by: David Miller <davem@redhat.com>

Hey all-
	Amazon reported seeing several instances of this deadlock:

http://bugzilla.kernel.org/show_bug.cgi?id=8668

on their RHEL5 systems. Flavio gave them this patch, a backport of these
two upstream commits:

ee39e10c27ca5293c72addb95bff864095e19904
0929c2dd83317813425b937fbc0041013b8685ff

which fix the deadlock (it actually lives in the gen_estimator code) and
convert the HTB scheduler to use gen_estimator fully for its rate
estimation. Tested by Amazon with good results.

Fixes bz 474797

Neil

diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 3cad026..acc1ee0 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -79,27 +79,27 @@
 struct gen_estimator
 {
-	struct gen_estimator	*next;
+	struct list_head	list;
 	struct gnet_stats_basic	*bstats;
 	struct gnet_stats_rate_est	*rate_est;
 	spinlock_t		*stats_lock;
-	unsigned		interval;
 	int			ewma_log;
 	u64			last_bytes;
 	u32			last_packets;
 	u32			avpps;
 	u32			avbps;
+	struct rcu_head		e_rcu;
 };

 struct gen_estimator_head
 {
 	struct timer_list	timer;
-	struct gen_estimator	*list;
+	struct list_head	list;
 };

 static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];

-/* Estimator array lock */
+/* Protects against NULL dereference */
 static DEFINE_RWLOCK(est_lock);

 static void est_timer(unsigned long arg)
@@ -107,13 +107,17 @@ static void est_timer(unsigned long arg)
 	int idx = (int)arg;
 	struct gen_estimator *e;

-	read_lock(&est_lock);
-	for (e = elist[idx].list; e; e = e->next) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(e, &elist[idx].list, list) {
 		u64 nbytes;
 		u32 npackets;
 		u32 rate;

 		spin_lock(e->stats_lock);
+		read_lock(&est_lock);
+		if (e->bstats == NULL)
+			goto skip;
+
 		nbytes = e->bstats->bytes;
 		npackets = e->bstats->packets;
 		rate = (nbytes - e->last_bytes)<<(7 - idx);
@@ -125,11 +129,14 @@ static void est_timer(unsigned long arg)
 		e->last_packets = npackets;
 		e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log;
 		e->rate_est->pps = (e->avpps+0x1FF)>>10;
+skip:
+		read_unlock(&est_lock);
 		spin_unlock(e->stats_lock);
 	}

-	mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4));
-	read_unlock(&est_lock);
+	if (!list_empty(&elist[idx].list))
+		mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4));
+	rcu_read_unlock();
 }

 /**
@@ -146,12 +153,17 @@ static void est_timer(unsigned long arg)
  * &rate_est with the statistics lock grabed during this period.
  *
  * Returns 0 on success or a negative error code.
+ *
+ * NOTE: Called under rtnl_mutex
  */
 int gen_new_estimator(struct gnet_stats_basic *bstats,
-	struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock, struct rtattr *opt)
+		      struct gnet_stats_rate_est *rate_est,
+		      spinlock_t *stats_lock,
+		      struct rtattr *opt)
 {
 	struct gen_estimator *est;
 	struct gnet_estimator *parm = RTA_DATA(opt);
+	int idx;

 	if (RTA_PAYLOAD(opt) < sizeof(*parm))
 		return -EINVAL;
@@ -163,7 +175,7 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
 	if (est == NULL)
 		return -ENOBUFS;

-	est->interval = parm->interval + 2;
+	idx = parm->interval + 2;
 	est->bstats = bstats;
 	est->rate_est = rate_est;
 	est->stats_lock = stats_lock;
@@ -173,20 +185,25 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
 	est->last_packets = bstats->packets;
 	est->avpps = rate_est->pps<<10;

-	est->next = elist[est->interval].list;
-	if (est->next == NULL) {
-		init_timer(&elist[est->interval].timer);
-		elist[est->interval].timer.data = est->interval;
-		elist[est->interval].timer.expires = jiffies + ((HZ<<est->interval)/4);
-		elist[est->interval].timer.function = est_timer;
-		add_timer(&elist[est->interval].timer);
+	if (!elist[idx].timer.function) {
+		INIT_LIST_HEAD(&elist[idx].list);
+		setup_timer(&elist[idx].timer, est_timer, idx);
 	}
-	write_lock_bh(&est_lock);
-	elist[est->interval].list = est;
-	write_unlock_bh(&est_lock);
+
+	if (list_empty(&elist[idx].list))
+		mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4));
+
+	list_add_rcu(&est->list, &elist[idx].list);
 	return 0;
 }

+static void __gen_kill_estimator(struct rcu_head *head)
+{
+	struct gen_estimator *e = container_of(head,
+					struct gen_estimator, e_rcu);
+	kfree(e);
+}
+
 /**
  * gen_kill_estimator - remove a rate estimator
  * @bstats: basic statistics
@@ -194,31 +211,32 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
  *
  * Removes the rate estimator specified by &bstats and &rate_est
  * and deletes the timer.
+ *
+ * NOTE: Called under rtnl_mutex
  */
 void gen_kill_estimator(struct gnet_stats_basic *bstats,
 	struct gnet_stats_rate_est *rate_est)
 {
 	int idx;
-	struct gen_estimator *est, **pest;
+	struct gen_estimator *e, *n;

 	for (idx=0; idx <= EST_MAX_INTERVAL; idx++) {
-		int killed = 0;
-		pest = &elist[idx].list;
-		while ((est=*pest) != NULL) {
-			if (est->rate_est != rate_est || est->bstats != bstats) {
-				pest = &est->next;
+
+		/* Skip non initialized indexes */
+		if (!elist[idx].timer.function)
+			continue;
+
+		list_for_each_entry_safe(e, n, &elist[idx].list, list) {
+			if (e->rate_est != rate_est || e->bstats != bstats)
 				continue;
-			}

 			write_lock_bh(&est_lock);
-			*pest = est->next;
+			e->bstats = NULL;
 			write_unlock_bh(&est_lock);

-			kfree(est);
-			killed++;
+			list_del_rcu(&e->list);
+			call_rcu(&e->e_rcu, __gen_kill_estimator);
 		}
-		if (killed && elist[idx].list == NULL)
-			del_timer(&elist[idx].timer);
 	}
 }

diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 880a339..9ca4189 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -69,9 +69,7 @@
 */

 #define HTB_HSIZE 16	/* classid hash size */
-#define HTB_EWMAC 2	/* rate average over HTB_EWMAC*HTB_HSIZE sec */
 #undef HTB_DEBUG	/* compile debugging support (activated by tc tool) */
-#define HTB_RATECM 1    /* whether to use rate computer */
 #define HTB_HYSTERESIS 1/* whether to use mode hysteresis for speedup */
 #define HTB_QLOCK(S) spin_lock_bh(&(S)->dev->queue_lock)
 #define HTB_QUNLOCK(S) spin_unlock_bh(&(S)->dev->queue_lock)
@@ -147,12 +145,6 @@ struct htb_class
 	struct tc_htb_xstats xstats;/* our special stats */
 	int refcnt;		/* usage count of this class */

-#ifdef HTB_RATECM
-	/* rate measurement counters */
-	unsigned long rate_bytes,sum_bytes;
-	unsigned long rate_packets,sum_packets;
-#endif
-
 	/* topology */
 	int level;		/* our level (see above) */
 	struct htb_class *parent;	/* parent class */
@@ -247,10 +239,6 @@ struct htb_sched
 	int rate2quantum;	/* quant = rate / rate2quantum */
 	psched_time_t now;	/* cached dequeue time */
 	struct timer_list timer;	/* send delay timer */
-#ifdef HTB_RATECM
-	struct timer_list rttim;	/* rate computer timer */
-	int recmp_bucket;	/* which hash bucket to recompute next */
-#endif

 	/* non shaped skbs; let them go directly thru */
 	struct sk_buff_head direct_queue;
@@ -783,35 +771,6 @@
 	*/
 	netif_schedule(sch->dev);
 }

-#ifdef HTB_RATECM
-#define RT_GEN(D,R) R+=D-(R/HTB_EWMAC);D=0
-static void htb_rate_timer(unsigned long arg)
-{
-	struct Qdisc *sch = (struct Qdisc*)arg;
-	struct htb_sched *q = qdisc_priv(sch);
-	struct list_head *p;
-
-	/* lock queue so that we can muck with it */
-	HTB_QLOCK(sch);
-	HTB_DBG(10,1,"htb_rttmr j=%ld\n",jiffies);
-
-	q->rttim.expires = jiffies + HZ;
-	add_timer(&q->rttim);
-
-	/* scan and recompute one bucket at time */
-	if (++q->recmp_bucket >= HTB_HSIZE)
-		q->recmp_bucket = 0;
-	list_for_each (p,q->hash+q->recmp_bucket) {
-		struct htb_class *cl = list_entry(p,struct htb_class,hlist);
-		HTB_DBG(10,2,"htb_rttmr_cl cl=%X sbyte=%lu spkt=%lu\n",
-				cl->classid,cl->sum_bytes,cl->sum_packets);
-		RT_GEN (cl->sum_bytes,cl->rate_bytes);
-		RT_GEN (cl->sum_packets,cl->rate_packets);
-	}
-	HTB_QUNLOCK(sch);
-}
-#endif
-
 /**
  * htb_charge_class - charges amount "bytes" to leaf and ancestors
 *
@@ -875,11 +834,6 @@ static void htb_charge_class(struct htb_sched *q,struct htb_class *cl,
 			htb_add_to_wait_tree (q,cl,diff,1);
 	}

-#ifdef HTB_RATECM
-	/* update rate counters */
-	cl->sum_bytes += bytes; cl->sum_packets++;
-#endif
-
 	/* update byte stats except for leaves which are already updated */
 	if (cl->level) {
 		cl->bstats.bytes += bytes;
@@ -1272,13 +1226,6 @@ static int htb_init(struct Qdisc *sch, struct rtattr *opt)

 	q->timer.function = htb_timer;
 	q->timer.data = (unsigned long)sch;
-#ifdef HTB_RATECM
-	init_timer(&q->rttim);
-	q->rttim.function = htb_rate_timer;
-	q->rttim.data = (unsigned long)sch;
-	q->rttim.expires = jiffies + HZ;
-	add_timer(&q->rttim);
-#endif
 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
 		q->rate2quantum = 1;
 	q->defcls = gopt->defcls;
@@ -1360,11 +1307,6 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 {
 	struct htb_class *cl = (struct htb_class*)arg;

-#ifdef HTB_RATECM
-	cl->rate_est.bps = cl->rate_bytes/(HTB_EWMAC*HTB_HSIZE);
-	cl->rate_est.pps = cl->rate_packets/(HTB_EWMAC*HTB_HSIZE);
-#endif
-
 	if (!cl->level && cl->un.leaf.q)
 		cl->qstats.qlen = cl->un.leaf.q->q.qlen;
 	cl->xstats.tokens = cl->tokens;
@@ -1439,6 +1381,7 @@ static void htb_destroy_class(struct Qdisc* sch,struct htb_class *cl)
 		sch->q.qlen -= cl->un.leaf.q->q.qlen;
 		qdisc_destroy(cl->un.leaf.q);
 	}
+	gen_kill_estimator(&cl->bstats, &cl->rate_est);
 	qdisc_put_rtab(cl->rate);
 	qdisc_put_rtab(cl->ceil);

@@ -1468,9 +1411,6 @@ static void htb_destroy(struct Qdisc* sch)
 	HTB_DBG(0,1,"htb_destroy q=%p\n",q);

 	del_timer_sync (&q->timer);
-#ifdef HTB_RATECM
-	del_timer_sync (&q->rttim);
-#endif
 	/* This line used to be after htb_destroy_class call below
 	   and surprisingly it worked in 2.4. But it must precede it
 	   because filter need its target class alive to be able to call
@@ -1549,6 +1489,21 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 	if (!cl) { /* new class */
 		struct Qdisc *new_q;

+		struct {
+			struct rtattr		rta;
+			struct gnet_estimator	opt;
+		} est = {
+			.rta = {
+				.rta_len	= RTA_LENGTH(sizeof(est.opt)),
+				.rta_type	= TCA_RATE,
+			},
+			.opt = {
+				/* 4s interval, 16s averaging constant */
+				.interval	= 2,
+				.ewma_log	= 2,
+			},
+		};
+
 		/* check for valid classid */
 		if (!classid || TC_H_MAJ(classid^sch->handle) || htb_find(classid,sch))
 			goto failure;
@@ -1562,6 +1517,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 		if ((cl = kzalloc(sizeof(*cl), GFP_KERNEL)) == NULL)
 			goto failure;

+		gen_new_estimator(&cl->bstats, &cl->rate_est,
+				  &sch->dev->queue_lock,
+				  tca[TCA_RATE-1] ? : &est.rta);
 		cl->refcnt = 1;
 		INIT_LIST_HEAD(&cl->sibling);
 		INIT_LIST_HEAD(&cl->hlist);
@@ -1614,7 +1572,13 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 		cl->pq_node.rb_color = -1;
 	}
 #endif
-	} else sch_tree_lock(sch);
+	} else {
+		if (tca[TCA_RATE-1])
+			gen_replace_estimator(&cl->bstats, &cl->rate_est,
+					      &sch->dev->queue_lock,
+					      tca[TCA_RATE-1]);
+		sch_tree_lock(sch);
+	}
 	/* it used to be a nasty bug here, we have to check that node
 	   is really leaf before changing cl->un.leaf ! */
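
---

For reviewers who have not followed the upstream discussion: the deadlock is
an AB-BA inversion between est_lock and the qdisc queue lock, roughly as
described in the upstream commit message. The sketch below is illustrative
only; it paraphrases the pre-patch code paths and is not part of the patch:

	/* CPU0: est_timer() before the patch */
	read_lock(&est_lock);		/* A: estimator list lock       */
	spin_lock(e->stats_lock);	/* B: this is dev->queue_lock   */
	/* ... sample bstats, update bps/pps ... */
	spin_unlock(e->stats_lock);
	read_unlock(&est_lock);

	/* CPU1: qdisc teardown -- qdisc_destroy() runs under
	 * dev->queue_lock (B) and, for a class with an estimator
	 * attached, ends up in gen_kill_estimator(), which does:
	 */
	write_lock_bh(&est_lock);	/* A: spins while CPU0 holds A
					 * and spins on B -> deadlock   */

After the patch est_timer() walks the list under rcu_read_lock() and only
takes est_lock inside stats_lock (just to check e->bstats for NULL), while
gen_kill_estimator() NULLs the bstats pointer, unlinks the entry with
list_del_rcu() and defers the kfree() via call_rcu(), so the two paths no
longer take the locks in opposite orders.

On the HTB side, the built-in default estimator (interval = 2, ewma_log = 2)
gives idx = 4 in gen_new_estimator(), so the timer fires every
(HZ<<4)/4 = 4*HZ jiffies: a 4 second sampling interval with roughly a 16
second averaging constant, matching the comment in the patch.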