Sophie

Sophie

distrib > Scientific%20Linux > 5x > x86_64 > by-pkgid > 27922b4260f65d317aabda37e42bbbff > files > 356

kernel-2.6.18-238.el5.src.rpm

From: Jeff Moyer <jmoyer@redhat.com>
Date: Tue, 3 Nov 2009 11:36:36 -0500
Subject: [block] cfq-iosched: add close cooperator code
Message-id: 1257266206-24003-3-git-send-email-jmoyer@redhat.com
O-Subject: [PATCH 02/12] cfq-iosched: add close cooperator code
Bugzilla: 456181 448130 427709
RH-Acked-by: Rik van Riel <riel@redhat.com>
RH-Acked-by: Josef Bacik <josef@redhat.com>
RH-Acked-by: Vivek Goyal <vgoyal@redhat.com>
RH-Acked-by: Jeff Layton <jlayton@redhat.com>

This is a backport of commit a36e71f996e25d6213f57951f7ae1874086ec57e:

    If we have processes that are working in close proximity to each
    other on disk, we don't want to idle wait. Instead allow the close
    process to issue a request, getting better aggregate bandwidth.
    The anticipatory scheduler has similar checks, noop and deadline do
    not need it since they don't care about process <-> io mappings.

    The code for CFQ is a little more involved though, since we split
    request queues into per-process contexts.

    This fixes a performance problem with eg dump(8), since it uses
    several processes in some silly attempt to speed IO up. Even if
    dump(8) isn't really a valid case (it should be fixed by using
    CLONE_IO), there are other cases where we see close processes
    and where idling ends up hurting performance.

    Credit goes to Jeff Moyer <jmoyer@redhat.com> for writing the
    initial implementation.

    Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index f7c069e..3772eea 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -100,6 +100,14 @@ struct cfq_data {
 	struct list_head busy_rr;
 	struct list_head cur_rr;
 	struct list_head idle_rr;
+
+	/*
+	 * Each priority tree is sorted by next_request position.  These
+	 * trees are used when determining if two or more queues are
+	 * interleaving requests (see cfq_close_cooperator).
+	 */
+	struct rb_root prio_trees[CFQ_PRIO_LISTS];
+
 	unsigned int busy_queues;
 
 	/*
@@ -172,6 +180,8 @@ struct cfq_queue {
 	unsigned int key;
 	/* on either rr or empty list of cfqd */
 	struct list_head cfq_list;
+	/* prio tree member */
+	struct rb_node p_node;
 	/* sorted list of pending requests */
 	struct rb_root sort_list;
 	/* if fifo isn't expired, next request to serve */
@@ -220,6 +230,7 @@ enum cfqq_state_flags {
 	CFQ_CFQQ_FLAG_fifo_expire,
 	CFQ_CFQQ_FLAG_idle_window,
 	CFQ_CFQQ_FLAG_prio_changed,
+	CFQ_CFQQ_FLAG_coop,
 };
 
 #define CFQ_CFQQ_FNS(name)						\
@@ -244,6 +255,7 @@ CFQ_CFQQ_FNS(must_dispatch);
 CFQ_CFQQ_FNS(fifo_expire);
 CFQ_CFQQ_FNS(idle_window);
 CFQ_CFQQ_FNS(prio_changed);
+CFQ_CFQQ_FNS(coop);
 #undef CFQ_CFQQ_FNS
 
 enum cfq_rq_state_flags {
@@ -457,6 +469,71 @@ static void cfq_update_next_crq(struct cfq_rq *crq)
 		cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq);
 }
 
+
+static struct cfq_queue *
+cfq_prio_tree_lookup(struct cfq_data *cfqd, int ioprio, sector_t sector,
+		     struct rb_node **ret_parent, struct rb_node ***rb_link)
+{
+	struct rb_root *root = &cfqd->prio_trees[ioprio];
+	struct rb_node **p, *parent;
+	struct cfq_queue *cfqq = NULL;
+
+	parent = NULL;
+	p = &root->rb_node;
+	while (*p) {
+		struct rb_node **n;
+
+		parent = *p;
+		cfqq = rb_entry(parent, struct cfq_queue, p_node);
+
+		/*
+		 * Sort strictly based on sector.  Smallest to the left,
+		 * largest to the right.
+		 */
+		if (sector > cfqq->next_crq->request->sector)
+			n = &(*p)->rb_right;
+		else if (sector < cfqq->next_crq->request->sector)
+			n = &(*p)->rb_left;
+		else
+			break;
+		p = n;
+	}
+
+	*ret_parent = parent;
+	if (rb_link)
+		*rb_link = p;
+	return NULL;
+}
+
+static void rb_erase_init(struct rb_node *n, struct rb_root *root)
+{
+	rb_erase(n, root);
+	RB_CLEAR_NODE(n);
+}
+
+static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+	struct rb_root *root = &cfqd->prio_trees[cfqq->ioprio];
+	struct rb_node **p, *parent;
+	struct cfq_queue *__cfqq;
+
+	if (!RB_EMPTY_NODE(&cfqq->p_node))
+		rb_erase_init(&cfqq->p_node, root);
+
+	if (cfq_class_idle(cfqq))
+		return;
+	if (!cfqq->next_crq)
+		return;
+
+	__cfqq = cfq_prio_tree_lookup(cfqd, cfqq->ioprio,
+				      cfqq->next_crq->request->sector,
+				      &parent, &p);
+	BUG_ON(__cfqq);
+
+	rb_link_node(&cfqq->p_node, parent, p);
+	rb_insert_color(&cfqq->p_node, root);
+}
+
 static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
 {
 	struct cfq_data *cfqd = cfqq->cfqd;
@@ -510,6 +587,7 @@ static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
 	}
 
 	list_add(&cfqq->cfq_list, entry);
+	cfq_prio_tree_add(cfqd, cfqq);
 }
 
 /*
@@ -532,6 +610,8 @@ cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	BUG_ON(!cfq_cfqq_on_rr(cfqq));
 	cfq_clear_cfqq_on_rr(cfqq);
 	list_move(&cfqq->cfq_list, &cfqd->empty_list);
+	if (!RB_EMPTY_NODE(&cfqq->p_node))
+		rb_erase_init(&cfqq->p_node, &cfqd->prio_trees[cfqq->ioprio]);
 
 	BUG_ON(!cfqd->busy_queues);
 	cfqd->busy_queues--;
@@ -585,7 +665,7 @@ static void cfq_add_crq_rb(struct cfq_rq *crq)
 	struct cfq_queue *cfqq = crq->cfq_queue;
 	struct cfq_data *cfqd = cfqq->cfqd;
 	struct request *rq = crq->request;
-	struct cfq_rq *__alias;
+	struct cfq_rq *__alias, *prev;
 
 	crq->rb_key = rq_rb_key(rq);
 	cfqq->queued[cfq_crq_is_sync(crq)]++;
@@ -605,7 +685,14 @@ static void cfq_add_crq_rb(struct cfq_rq *crq)
 	/*
 	 * check if this request is a better next-serve candidate
 	 */
+	prev = cfqq->next_crq;
 	cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq);
+
+	/*
+	 * adjust priority tree position, if ->next_crq changes
+	 */
+	if (prev != cfqq->next_crq)
+		cfq_prio_tree_add(cfqd, cfqq);
 }
 
 static inline void
@@ -875,9 +962,23 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
 		return cfqd->last_position - rq->sector;
 }
 
-static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
+static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
 {
-	struct cfq_queue *cfqq = NULL;
+	struct cfq_io_context *cic = cfqd->active_cic;
+
+	if (!sample_valid(cic->seek_samples))
+		return 0;
+
+	return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean;
+}
+
+#define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024))
+
+static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
+					      struct cfq_queue *cfqq)
+{
+	if (cfqq)
+		goto set_queue;
 
 	/*
 	 * if current list is non-empty, grab first entry. if it is empty,
@@ -907,21 +1008,97 @@ static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
 			mod_timer(&cfqd->idle_class_timer, end);
 	}
 
+	if (cfqq)
+		cfq_clear_cfqq_coop(cfqq);
+
+set_queue:
 	__cfq_set_active_queue(cfqd, cfqq);
 	return cfqq;
 }
 
-static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
+static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
+				    struct cfq_queue *cur_cfqq)
 {
-	struct cfq_io_context *cic = cfqd->active_cic;
+	struct rb_root *root = &cfqd->prio_trees[cur_cfqq->ioprio];
+	struct rb_node *parent, *node;
+	struct cfq_queue *__cfqq;
+	sector_t sector = cfqd->last_position;
 
-	if (!sample_valid(cic->seek_samples))
-		return 0;
+	if (RB_EMPTY_ROOT(root))
+		return NULL;
 
-	return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean;
+	/*
+	 * First, if we find a request starting at the end of the last
+	 * request, choose it.
+	 */
+	__cfqq = cfq_prio_tree_lookup(cfqd, cur_cfqq->ioprio,
+				      sector, &parent, NULL);
+	if (__cfqq)
+		return __cfqq;
+
+	/*
+	 * If the exact sector wasn't found, the parent of the NULL leaf
+	 * will contain the closest sector.
+	 */
+	__cfqq = rb_entry(parent, struct cfq_queue, p_node);
+	if (cfq_rq_close(cfqd, __cfqq->next_crq->request))
+		return __cfqq;
+
+	if (__cfqq->next_crq->request->sector < sector)
+		node = rb_next(&__cfqq->p_node);
+	else
+		node = rb_prev(&__cfqq->p_node);
+	if (!node)
+		return NULL;
+
+	__cfqq = rb_entry(node, struct cfq_queue, p_node);
+	if (cfq_rq_close(cfqd, __cfqq->next_crq->request))
+		return __cfqq;
+
+	return NULL;
 }
 
-#define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024))
+/*
+ * cfqd - obvious
+ * cur_cfqq - passed in so that we don't decide that the current queue is
+ *            closely cooperating with itself.
+ *
+ * So, basically we're assuming that the cur_cfqq has dispatched at least
+ * one request, and that cfqd->last_position reflects a position on the disk
+ * associated with the I/O issued by cur_cfqq.  I'm not sure this is a valid
+ * assumption.
+ */
+static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
+					      struct cfq_queue *cur_cfqq,
+					      int probe)
+{
+	struct cfq_queue *cfqq;
+
+	/*
+	 * A valid cfq_io_context is necessary to compare requests against
+	 * the seek_mean of the current cfqq.
+	 */
+	if (!cfqd->active_cic)
+		return NULL;
+
+	/*
+	 * We should notice if some of the queues are cooperating, eg
+	 * working closely on the same area of the disk.  In that case,
+	 * we can group them together and don't waste time idling.
+	 */
+	cfqq = cfqq_close(cfqd, cur_cfqq);
+	if (!cfqq)
+		return NULL;
+
+	if (cfq_cfqq_coop(cfqq))
+		return NULL;
+
+	if (!probe)
+		cfq_mark_cfqq_coop(cfqq);
+	return cfqq;
+}
+
+
 
 static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 
@@ -1040,7 +1217,7 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 {
 	unsigned long now = jiffies;
-	struct cfq_queue *cfqq;
+	struct cfq_queue *cfqq, *new_cfqq = NULL;
 
 	cfqq = cfqd->active_queue;
 	if (!cfqq)
@@ -1058,6 +1235,14 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 	 */
 	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
 		goto keep_queue;
+	/*
+	 * If another queue has a request waiting within our mean seek
+	 * distance, let it run.  The expire code will check for close
+	 * cooperators and put the close queue at the front of the service
+	 * tree.
+	 */
+	else if ((new_cfqq = cfq_close_cooperator(cfqd, cfqq, 0)))
+		goto expire;
 	else if (cfq_cfqq_dispatched(cfqq)) {
 		cfqq = NULL;
 		goto keep_queue;
@@ -1069,7 +1254,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 expire:
 	cfq_slice_expired(cfqd, 0);
 new_queue:
-	cfqq = cfq_set_active_queue(cfqd);
+	cfqq = cfq_set_active_queue(cfqd, new_cfqq);
 keep_queue:
 	return cfqq;
 }
@@ -1495,6 +1680,7 @@ retry:
 		INIT_HLIST_NODE(&cfqq->cfq_hash);
 		INIT_LIST_HEAD(&cfqq->cfq_list);
 		INIT_LIST_HEAD(&cfqq->fifo);
+		RB_CLEAR_NODE(&cfqq->p_node);
 
 		cfqq->key = key;
 		hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
@@ -1887,9 +2073,12 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq)
 	 * or if we want to idle in case it has no pending requests.
 	 */
 	if (cfqd->active_queue == cfqq) {
+		const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
+
 		if (time_after(now, cfqq->slice_end))
 			cfq_slice_expired(cfqd, 0);
-		else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list))
+		else if (cfqq_empty &&
+			 !cfq_close_cooperator(cfqd, cfqq, 1) && sync)
 			cfq_arm_slice_timer(cfqd, cfqq);
 	}