kernel-2.6.18-238.el5.src.rpm

Date: Thu, 21 Sep 2006 16:50:35 +0100
From: Alasdair G Kergon <agk@redhat.com>
Subject: [RHEL5 PATCH 24/30] dm crypt: restructure for workqueue change

Restructure part of the dm-crypt code in preparation
for workqueue changes.

Use 'base_bio' or 'clone' variable names consistently throughout.
No functional changes are included in this patch.

Index: linux-2.6.18.noarch/drivers/md/dm-crypt.c
===================================================================
--- linux-2.6.18.noarch.orig/drivers/md/dm-crypt.c
+++ linux-2.6.18.noarch/drivers/md/dm-crypt.c
@@ -29,7 +29,7 @@
  */
 struct crypt_io {
 	struct dm_target *target;
-	struct bio *bio;
+	struct bio *base_bio;
 	struct bio *first_clone;
 	struct work_struct work;
 	atomic_t pending;
@@ -315,7 +315,7 @@ static struct bio *
 crypt_alloc_buffer(struct crypt_config *cc, unsigned int size,
                    struct bio *base_bio, unsigned int *bio_vec_idx)
 {
-	struct bio *bio;
+	struct bio *clone;
 	unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM;
 	unsigned int i;
@@ -326,23 +326,23 @@ crypt_alloc_buffer(struct crypt_config *
 	 * FIXME: Is this really intelligent?
 	 */
 	if (base_bio)
-		bio = bio_clone(base_bio, GFP_NOIO|__GFP_NOMEMALLOC);
+		clone = bio_clone(base_bio, GFP_NOIO|__GFP_NOMEMALLOC);
 	else
-		bio = bio_alloc(GFP_NOIO|__GFP_NOMEMALLOC, nr_iovecs);
-	if (!bio)
+		clone = bio_alloc(GFP_NOIO|__GFP_NOMEMALLOC, nr_iovecs);
+	if (!clone)
 		return NULL;
 
 	/* if the last bio was not complete, continue where that one ended */
-	bio->bi_idx = *bio_vec_idx;
-	bio->bi_vcnt = *bio_vec_idx;
-	bio->bi_size = 0;
-	bio->bi_flags &= ~(1 << BIO_SEG_VALID);
+	clone->bi_idx = *bio_vec_idx;
+	clone->bi_vcnt = *bio_vec_idx;
+	clone->bi_size = 0;
+	clone->bi_flags &= ~(1 << BIO_SEG_VALID);
 
-	/* bio->bi_idx pages have already been allocated */
-	size -= bio->bi_idx * PAGE_SIZE;
+	/* clone->bi_idx pages have already been allocated */
+	size -= clone->bi_idx * PAGE_SIZE;
 
-	for(i = bio->bi_idx; i < nr_iovecs; i++) {
-		struct bio_vec *bv = bio_iovec_idx(bio, i);
+	for (i = clone->bi_idx; i < nr_iovecs; i++) {
+		struct bio_vec *bv = bio_iovec_idx(clone, i);
 
 		bv->bv_page = mempool_alloc(cc->page_pool, gfp_mask);
 		if (!bv->bv_page)
@@ -353,7 +353,7 @@ crypt_alloc_buffer(struct crypt_config *
 		 * return a partially allocated bio, the caller will then try
 		 * to allocate additional bios while submitting this partial bio
 		 */
-		if ((i - bio->bi_idx) == (MIN_BIO_PAGES - 1))
+		if ((i - clone->bi_idx) == (MIN_BIO_PAGES - 1))
 			gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
 
 		bv->bv_offset = 0;
@@ -362,13 +362,13 @@ crypt_alloc_buffer(struct crypt_config *
 		else
 			bv->bv_len = size;
 
-		bio->bi_size += bv->bv_len;
-		bio->bi_vcnt++;
+		clone->bi_size += bv->bv_len;
+		clone->bi_vcnt++;
 		size -= bv->bv_len;
 	}
 
-	if (!bio->bi_size) {
-		bio_put(bio);
+	if (!clone->bi_size) {
+		bio_put(clone);
 		return NULL;
 	}
 
@@ -376,13 +376,13 @@ crypt_alloc_buffer(struct crypt_config *
 	 * Remember the last bio_vec allocated to be able
 	 * to correctly continue after the splitting.
 	 */
-	*bio_vec_idx = bio->bi_vcnt;
+	*bio_vec_idx = clone->bi_vcnt;
 
-	return bio;
+	return clone;
 }
 
 static void crypt_free_buffer_pages(struct crypt_config *cc,
-                                    struct bio *bio, unsigned int bytes)
+                                    struct bio *clone, unsigned int bytes)
 {
 	unsigned int i, start, end;
 	struct bio_vec *bv;
@@ -396,19 +396,19 @@ static void crypt_free_buffer_pages(stru
 	 * A fix to the bi_idx issue in the kernel is in the works, so
 	 * we will hopefully be able to revert to the cleaner solution soon.
 	 */
-	i = bio->bi_vcnt - 1;
-	bv = bio_iovec_idx(bio, i);
-	end = (i << PAGE_SHIFT) + (bv->bv_offset + bv->bv_len) - bio->bi_size;
+	i = clone->bi_vcnt - 1;
+	bv = bio_iovec_idx(clone, i);
+	end = (i << PAGE_SHIFT) + (bv->bv_offset + bv->bv_len) - clone->bi_size;
 	start = end - bytes;
 
 	start >>= PAGE_SHIFT;
-	if (!bio->bi_size)
-		end = bio->bi_vcnt;
+	if (!clone->bi_size)
+		end = clone->bi_vcnt;
 	else
 		end >>= PAGE_SHIFT;
 
-	for(i = start; i < end; i++) {
-		bv = bio_iovec_idx(bio, i);
+	for (i = start; i < end; i++) {
+		bv = bio_iovec_idx(clone, i);
 		BUG_ON(!bv->bv_page);
 		mempool_free(bv->bv_page, cc->page_pool);
 		bv->bv_page = NULL;
@@ -432,7 +432,7 @@ static void dec_pending(struct crypt_io 
 	if (io->first_clone)
 		bio_put(io->first_clone);
 
-	bio_endio(io->bio, io->bio->bi_size, io->error);
+	bio_endio(io->base_bio, io->base_bio->bi_size, io->error);
 
 	mempool_free(io, cc->io_pool);
 }
@@ -445,25 +445,133 @@ static void dec_pending(struct crypt_io 
  * queued here.
  */
 static struct workqueue_struct *_kcryptd_workqueue;
+static void kcryptd_do_work(void *data);
 
-static void kcryptd_do_work(void *data)
+static void kcryptd_queue_io(struct crypt_io *io)
 {
-	struct crypt_io *io = (struct crypt_io *) data;
-	struct crypt_config *cc = (struct crypt_config *) io->target->private;
+	INIT_WORK(&io->work, kcryptd_do_work, io);
+	queue_work(_kcryptd_workqueue, &io->work);
+}
+
+static int crypt_endio(struct bio *clone, unsigned int done, int error)
+{
+	struct crypt_io *io = clone->bi_private;
+	struct crypt_config *cc = io->target->private;
+	unsigned read_io = bio_data_dir(clone) == READ;
+
+	/*
+	 * free the processed pages, even if
+	 * it's only a partially completed write
+	 */
+	if (!read_io)
+		crypt_free_buffer_pages(cc, clone, done);
+
+	if (unlikely(clone->bi_size))
+		return 1;
+
+	/*
+	 * successful reads are decrypted by the worker thread
+	 */
+	if (!read_io)
+		goto out;
+
+	if (unlikely(!bio_flagged(clone, BIO_UPTODATE))) {
+		error = -EIO;
+		goto out;
+	}
+
+	bio_put(clone);
+	kcryptd_queue_io(io);
+	return 0;
+
+out:
+	bio_put(clone);
+	dec_pending(io, error);
+	return error;
+}
+
+static void clone_init(struct crypt_io *io, struct bio *clone)
+{
+	struct crypt_config *cc = io->target->private;
+
+	clone->bi_private = io;
+	clone->bi_end_io  = crypt_endio;
+	clone->bi_bdev    = cc->dev->bdev;
+	clone->bi_rw      = io->base_bio->bi_rw;
+}
+
+static struct bio *clone_read(struct crypt_io *io,
+			      sector_t sector)
+{
+	struct crypt_config *cc = io->target->private;
+	struct bio *base_bio = io->base_bio;
+	struct bio *clone;
+
+	/*
+	 * The block layer might modify the bvec array, so always
+	 * copy the required bvecs because we need the original
+	 * one in order to decrypt the whole bio data *afterwards*.
+	 */
+	clone = bio_alloc(GFP_NOIO, bio_segments(base_bio));
+	if (unlikely(!clone))
+		return NULL;
+
+	clone_init(io, clone);
+	clone->bi_idx = 0;
+	clone->bi_vcnt = bio_segments(base_bio);
+	clone->bi_size = base_bio->bi_size;
+	memcpy(clone->bi_io_vec, bio_iovec(base_bio),
+	       sizeof(struct bio_vec) * clone->bi_vcnt);
+	clone->bi_sector = cc->start + sector;
+
+	return clone;
+}
+
+static struct bio *clone_write(struct crypt_io *io,
+			       sector_t sector,
+			       unsigned *bvec_idx,
+			       struct convert_context *ctx)
+{
+	struct crypt_config *cc = io->target->private;
+	struct bio *base_bio = io->base_bio;
+	struct bio *clone;
+
+	clone = crypt_alloc_buffer(cc, base_bio->bi_size,
+				   io->first_clone, bvec_idx);
+	if (!clone)
+		return NULL;
+
+	ctx->bio_out = clone;
+
+	if (unlikely(crypt_convert(cc, ctx) < 0)) {
+		crypt_free_buffer_pages(cc, clone,
+		                        clone->bi_size);
+		bio_put(clone);
+		return NULL;
+	}
+
+	clone_init(io, clone);
+	clone->bi_sector = cc->start + sector;
+
+	return clone;
+}
+
+static void process_read_endio(struct crypt_io *io)
+{
+	struct crypt_config *cc = io->target->private;
 	struct convert_context ctx;
-	int r;
 
-	crypt_convert_init(cc, &ctx, io->bio, io->bio,
-	                   io->bio->bi_sector - io->target->begin, 0);
-	r = crypt_convert(cc, &ctx);
+	crypt_convert_init(cc, &ctx, io->base_bio, io->base_bio,
+			   io->base_bio->bi_sector - io->target->begin, 0);
 
-	dec_pending(io, r);
+	dec_pending(io, crypt_convert(cc, &ctx));
 }
 
-static void kcryptd_queue_io(struct crypt_io *io)
+static void kcryptd_do_work(void *data)
 {
-	INIT_WORK(&io->work, kcryptd_do_work, io);
-	queue_work(_kcryptd_workqueue, &io->work);
+	struct crypt_io *io = data;
+
+	process_read_endio(io);
 }
 
 /*
@@ -477,7 +585,7 @@ static int crypt_decode_key(u8 *key, cha
 
 	buffer[2] = '\0';
 
-	for(i = 0; i < size; i++) {
+	for (i = 0; i < size; i++) {
 		buffer[0] = *hex++;
 		buffer[1] = *hex++;
 
@@ -500,7 +608,7 @@ static void crypt_encode_key(char *hex, 
 {
 	unsigned int i;
 
-	for(i = 0; i < size; i++) {
+	for (i = 0; i < size; i++) {
 		sprintf(hex, "%02x", *key);
 		hex += 2;
 		key++;
@@ -728,88 +836,10 @@ static void crypt_dtr(struct dm_target *
 	kfree(cc);
 }
 
-static int crypt_endio(struct bio *bio, unsigned int done, int error)
-{
-	struct crypt_io *io = (struct crypt_io *) bio->bi_private;
-	struct crypt_config *cc = (struct crypt_config *) io->target->private;
-
-	if (bio_data_dir(bio) == WRITE) {
-		/*
-		 * free the processed pages, even if
-		 * it's only a partially completed write
-		 */
-		crypt_free_buffer_pages(cc, bio, done);
-	}
-
-	if (bio->bi_size)
-		return 1;
-
-	bio_put(bio);
-
-	/*
-	 * successful reads are decrypted by the worker thread
-	 */
-	if ((bio_data_dir(bio) == READ)
-	    && bio_flagged(bio, BIO_UPTODATE)) {
-		kcryptd_queue_io(io);
-		return 0;
-	}
-
-	dec_pending(io, error);
-	return error;
-}
-
-static inline struct bio *
-crypt_clone(struct crypt_config *cc, struct crypt_io *io, struct bio *bio,
-            sector_t sector, unsigned int *bvec_idx,
-            struct convert_context *ctx)
-{
-	struct bio *clone;
-
-	if (bio_data_dir(bio) == WRITE) {
-		clone = crypt_alloc_buffer(cc, bio->bi_size,
-                                 io->first_clone, bvec_idx);
-		if (clone) {
-			ctx->bio_out = clone;
-			if (crypt_convert(cc, ctx) < 0) {
-				crypt_free_buffer_pages(cc, clone,
-				                        clone->bi_size);
-				bio_put(clone);
-				return NULL;
-			}
-		}
-	} else {
-		/*
-		 * The block layer might modify the bvec array, so always
-		 * copy the required bvecs because we need the original
-		 * one in order to decrypt the whole bio data *afterwards*.
-		 */
-		clone = bio_alloc(GFP_NOIO, bio_segments(bio));
-		if (clone) {
-			clone->bi_idx = 0;
-			clone->bi_vcnt = bio_segments(bio);
-			clone->bi_size = bio->bi_size;
-			memcpy(clone->bi_io_vec, bio_iovec(bio),
-			       sizeof(struct bio_vec) * clone->bi_vcnt);
-		}
-	}
-
-	if (!clone)
-		return NULL;
-
-	clone->bi_private = io;
-	clone->bi_end_io = crypt_endio;
-	clone->bi_bdev = cc->dev->bdev;
-	clone->bi_sector = cc->start + sector;
-	clone->bi_rw = bio->bi_rw;
-
-	return clone;
-}
-
 static int crypt_map(struct dm_target *ti, struct bio *bio,
 		     union map_info *map_context)
 {
-	struct crypt_config *cc = (struct crypt_config *) ti->private;
+	struct crypt_config *cc = ti->private;
 	struct crypt_io *io;
 	struct convert_context ctx;
 	struct bio *clone;
@@ -819,7 +849,7 @@ static int crypt_map(struct dm_target *t
 
 	io = mempool_alloc(cc->io_pool, GFP_NOIO);
 	io->target = ti;
-	io->bio = bio;
+	io->base_bio = bio;
 	io->first_clone = NULL;
 	io->error = 0;
 	atomic_set(&io->pending, 1); /* hold a reference */
@@ -832,7 +862,10 @@ static int crypt_map(struct dm_target *t
 	 * so repeat the whole process until all the data can be handled.
 	 */
 	while (remaining) {
-		clone = crypt_clone(cc, io, bio, sector, &bvec_idx, &ctx);
+		if (bio_data_dir(bio) == WRITE)
+			clone = clone_write(io, sector, &bvec_idx, &ctx);
+		else
+			clone = clone_read(io, sector);
 		if (!clone)
 			goto cleanup;
 

Date: Thu, 21 Sep 2006 16:50:50 +0100
From: Alasdair G Kergon <agk@redhat.com>
Subject: [RHEL5 PATCH 25/30] dm crypt: restructure write processing

Restructure dm-crypt's write processing in preparation for the
workqueue changes in the next patches.

Index: linux-2.6.18.noarch/drivers/md/dm-crypt.c
===================================================================
--- linux-2.6.18.noarch.orig/drivers/md/dm-crypt.c
+++ linux-2.6.18.noarch/drivers/md/dm-crypt.c
@@ -500,12 +500,14 @@ static void clone_init(struct crypt_io *
 	clone->bi_rw      = io->base_bio->bi_rw;
 }
 
-static struct bio *clone_read(struct crypt_io *io,
-			      sector_t sector)
+static int process_read(struct crypt_io *io)
 {
 	struct crypt_config *cc = io->target->private;
 	struct bio *base_bio = io->base_bio;
 	struct bio *clone;
+	sector_t sector = base_bio->bi_sector - io->target->begin;
+
+	atomic_inc(&io->pending);
 
 	/*
 	 * The block layer might modify the bvec array, so always
@@ -513,47 +515,94 @@ static struct bio *clone_read(struct cry
 	 * one in order to decrypt the whole bio data *afterwards*.
 	 */
 	clone = bio_alloc(GFP_NOIO, bio_segments(base_bio));
-	if (unlikely(!clone))
-		return NULL;
+	if (unlikely(!clone)) {
+		dec_pending(io, -ENOMEM);
+		return 0;
+	}
 
 	clone_init(io, clone);
 	clone->bi_idx = 0;
 	clone->bi_vcnt = bio_segments(base_bio);
 	clone->bi_size = base_bio->bi_size;
+	clone->bi_sector = cc->start + sector;
 	memcpy(clone->bi_io_vec, bio_iovec(base_bio),
 	       sizeof(struct bio_vec) * clone->bi_vcnt);
-	clone->bi_sector = cc->start + sector;
 
-	return clone;
+	generic_make_request(clone);
+
+	return 0;
 }
 
-static struct bio *clone_write(struct crypt_io *io,
-			       sector_t sector,
-			       unsigned *bvec_idx,
-			       struct convert_context *ctx)
+static int process_write(struct crypt_io *io)
 {
 	struct crypt_config *cc = io->target->private;
 	struct bio *base_bio = io->base_bio;
 	struct bio *clone;
+	struct convert_context ctx;
+	unsigned remaining = base_bio->bi_size;
+	sector_t sector = base_bio->bi_sector - io->target->begin;
+	unsigned bvec_idx = 0;
+
+	atomic_inc(&io->pending);
+
+	crypt_convert_init(cc, &ctx, NULL, base_bio, sector, 1);
+
+	/*
+	 * The allocated buffers can be smaller than the whole bio,
+	 * so repeat the whole process until all the data can be handled.
+	 */
+	while (remaining) {
+		clone = crypt_alloc_buffer(cc, base_bio->bi_size,
+					   io->first_clone, &bvec_idx);
+		if (unlikely(!clone))
+			goto cleanup;
+
+		ctx.bio_out = clone;
+
+		if (unlikely(crypt_convert(cc, &ctx) < 0)) {
+			crypt_free_buffer_pages(cc, clone, clone->bi_size);
+			bio_put(clone);
+			goto cleanup;
+		}
+
+		clone_init(io, clone);
+		clone->bi_sector = cc->start + sector;
+
+		if (!io->first_clone) {
+			/*
+			 * hold a reference to the first clone, because it
+			 * holds the bio_vec array and that can't be freed
+			 * before all other clones are released
+			 */
+			bio_get(clone);
+			io->first_clone = clone;
+		}
+
+		atomic_inc(&io->pending);
+
+		remaining -= clone->bi_size;
+		sector += bio_sectors(clone);
 
-	clone = crypt_alloc_buffer(cc, base_bio->bi_size,
-				   io->first_clone, bvec_idx);
-	if (!clone)
-		return NULL;
-
-	ctx->bio_out = clone;
-
-	if (unlikely(crypt_convert(cc, ctx) < 0)) {
-		crypt_free_buffer_pages(cc, clone,
-		                        clone->bi_size);
-		bio_put(clone);
-		return NULL;
+		generic_make_request(clone);
+
+		/* out of memory -> run queues */
+		if (remaining)
+			blk_congestion_wait(bio_data_dir(clone), HZ/100);
 	}
 
-	clone_init(io, clone);
-	clone->bi_sector = cc->start + sector;
+	/* drop reference, clones could have returned before we reach this */
+	dec_pending(io, 0);
+	return 0;
+
+cleanup:
+	if (io->first_clone) {
+		dec_pending(io, -ENOMEM);
+		return 0;
+	}
 
-	return clone;
+	 /* if no bio has been dispatched yet, we can directly return the error */
+	mempool_free(io, cc->io_pool);
+	return -ENOMEM;
 }
 
 static void process_read_endio(struct crypt_io *io)
@@ -841,68 +890,19 @@ static int crypt_map(struct dm_target *t
 {
 	struct crypt_config *cc = ti->private;
 	struct crypt_io *io;
-	struct convert_context ctx;
-	struct bio *clone;
-	unsigned int remaining = bio->bi_size;
-	sector_t sector = bio->bi_sector - ti->begin;
-	unsigned int bvec_idx = 0;
 
 	io = mempool_alloc(cc->io_pool, GFP_NOIO);
+
 	io->target = ti;
 	io->base_bio = bio;
 	io->first_clone = NULL;
 	io->error = 0;
-	atomic_set(&io->pending, 1); /* hold a reference */
+	atomic_set(&io->pending, 0);
 
 	if (bio_data_dir(bio) == WRITE)
-		crypt_convert_init(cc, &ctx, NULL, bio, sector, 1);
-
-	/*
-	 * The allocated buffers can be smaller than the whole bio,
-	 * so repeat the whole process until all the data can be handled.
-	 */
-	while (remaining) {
-		if (bio_data_dir(bio) == WRITE)
-			clone = clone_write(io, sector, &bvec_idx, &ctx);
-		else
-			clone = clone_read(io, sector);
-		if (!clone)
-			goto cleanup;
-
-		if (!io->first_clone) {
-			/*
-			 * hold a reference to the first clone, because it
-			 * holds the bio_vec array and that can't be freed
-			 * before all other clones are released
-			 */
-			bio_get(clone);
-			io->first_clone = clone;
-		}
-		atomic_inc(&io->pending);
-
-		remaining -= clone->bi_size;
-		sector += bio_sectors(clone);
-
-		generic_make_request(clone);
-
-		/* out of memory -> run queues */
-		if (remaining)
-			blk_congestion_wait(bio_data_dir(clone), HZ/100);
-	}
+		return process_write(io);
 
-	/* drop reference, clones could have returned before we reach this */
-	dec_pending(io, 0);
-	return 0;
-
-cleanup:
-	if (io->first_clone) {
-		dec_pending(io, -ENOMEM);
-		return 0;
-	}
-
-	/* if no bio has been dispatched yet, we can directly return the error */
-	mempool_free(io, cc->io_pool);
-	return -ENOMEM;
+	return process_read(io);
 }
 
 static int crypt_status(struct dm_target *ti, status_type_t type,

Date: Thu, 21 Sep 2006 16:51:25 +0100
From: Alasdair G Kergon <agk@redhat.com>
Subject: [RHEL5 PATCH 26/30] dm crypt: move io to workqueue

This patch is designed to help dm-crypt comply with the
new constraints imposed by the following patch in -mm:
  md-dm-reduce-stack-usage-with-stacked-block-devices.patch

Under low memory, the existing implementation relies on I/O that was
submitted recursively to generic_make_request() completing before the
original generic_make_request() call can return.

This patch moves the I/O submission to a workqueue so that the original
generic_make_request() call can return immediately.
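
For orientation, here is a condensed sketch of the flow the series converges on. All identifiers come from the diffs themselves; the bodies are trimmed down to the control flow, so read it as an illustration of the pattern rather than the complete driver code:

static void kcryptd_queue_io(struct crypt_io *io)
{
	/* 2.6.18-era workqueue API: INIT_WORK() still takes a data pointer */
	INIT_WORK(&io->work, kcryptd_do_work, io);
	queue_work(_kcryptd_workqueue, &io->work);
}

static void kcryptd_do_work(void *data)
{
	struct crypt_io *io = data;

	if (io->post_process)
		process_read_endio(io);		/* decrypt a completed read */
	else if (bio_data_dir(io->base_bio) == READ)
		process_read(io);		/* clone the bio and submit it */
	else
		process_write(io);		/* encrypt into clones and submit */
}

static int crypt_map(struct dm_target *ti, struct bio *bio,
		     union map_info *map_context)
{
	struct crypt_config *cc = ti->private;
	struct crypt_io *io = mempool_alloc(cc->io_pool, GFP_NOIO);

	/* ... fill in io->target, io->base_bio, io->post_process, ... */
	kcryptd_queue_io(io);	/* submission happens from the worker thread */
	return 0;
}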

Index: linux-2.6.18.noarch/drivers/md/dm-crypt.c
===================================================================
--- linux-2.6.18.noarch.orig/drivers/md/dm-crypt.c
+++ linux-2.6.18.noarch/drivers/md/dm-crypt.c
@@ -34,6 +34,7 @@ struct crypt_io {
 	struct work_struct work;
 	atomic_t pending;
 	int error;
+	int post_process;
 };
 
 /*
@@ -441,8 +442,7 @@ static void dec_pending(struct crypt_io 
  * kcryptd:
  *
  * Needed because it would be very unwise to do decryption in an
- * interrupt context, so bios returning from read requests get
- * queued here.
+ * interrupt context.
  */
 static struct workqueue_struct *_kcryptd_workqueue;
 static void kcryptd_do_work(void *data);
@@ -466,12 +466,10 @@ static int crypt_endio(struct bio *clone
 	if (!read_io)
 		crypt_free_buffer_pages(cc, clone, done);
 
+	/* keep going - not finished yet */
 	if (unlikely(clone->bi_size))
 		return 1;
 
-	/*
-	 * successful reads are decrypted by the worker thread
-	 */
 	if (!read_io)
 		goto out;
 
@@ -481,6 +479,7 @@ static int crypt_endio(struct bio *clone
 	}
 
 	bio_put(clone);
+	io->post_process = 1;
 	kcryptd_queue_io(io);
 	return 0;
 
@@ -500,7 +499,7 @@ static void clone_init(struct crypt_io *
 	clone->bi_rw      = io->base_bio->bi_rw;
 }
 
-static int process_read(struct crypt_io *io)
+static void process_read(struct crypt_io *io)
 {
 	struct crypt_config *cc = io->target->private;
 	struct bio *base_bio = io->base_bio;
@@ -517,7 +516,7 @@ static int process_read(struct crypt_io 
 	clone = bio_alloc(GFP_NOIO, bio_segments(base_bio));
 	if (unlikely(!clone)) {
 		dec_pending(io, -ENOMEM);
-		return 0;
+		return;
 	}
 
 	clone_init(io, clone);
@@ -529,11 +528,9 @@ static int process_read(struct crypt_io 
 	       sizeof(struct bio_vec) * clone->bi_vcnt);
 
 	generic_make_request(clone);
-
-	return 0;
 }
 
-static int process_write(struct crypt_io *io)
+static void process_write(struct crypt_io *io)
 {
 	struct crypt_config *cc = io->target->private;
 	struct bio *base_bio = io->base_bio;
@@ -554,15 +551,18 @@ static int process_write(struct crypt_io
 	while (remaining) {
 		clone = crypt_alloc_buffer(cc, base_bio->bi_size,
 					   io->first_clone, &bvec_idx);
-		if (unlikely(!clone))
-			goto cleanup;
+		if (unlikely(!clone)) {
+			dec_pending(io, -ENOMEM);
+			return;
+		}
 
 		ctx.bio_out = clone;
 
 		if (unlikely(crypt_convert(cc, &ctx) < 0)) {
 			crypt_free_buffer_pages(cc, clone, clone->bi_size);
 			bio_put(clone);
-			goto cleanup;
+			dec_pending(io, -EIO);
+			return;
 		}
 
 		clone_init(io, clone);
@@ -578,31 +578,20 @@ static int process_write(struct crypt_io
 			io->first_clone = clone;
 		}
 
-		atomic_inc(&io->pending);
-
 		remaining -= clone->bi_size;
 		sector += bio_sectors(clone);
 
+		/* prevent bio_put of first_clone */
+		if (remaining)
+			atomic_inc(&io->pending);
+
 		generic_make_request(clone);
 
 		/* out of memory -> run queues */
 		if (remaining)
 			blk_congestion_wait(bio_data_dir(clone), HZ/100);
-	}
 
-	/* drop reference, clones could have returned before we reach this */
-	dec_pending(io, 0);
-	return 0;
-
-cleanup:
-	if (io->first_clone) {
-		dec_pending(io, -ENOMEM);
-		return 0;
 	}
-
-	 /* if no bio has been dispatched yet, we can directly return the error */
-	mempool_free(io, cc->io_pool);
-	return -ENOMEM;
 }
 
 static void process_read_endio(struct crypt_io *io)
@@ -620,7 +609,12 @@ static void kcryptd_do_work(void *data)
 {
 	struct crypt_io *io = data;
 
-	process_read_endio(io);
+	if (io->post_process)
+		process_read_endio(io);
+	else if (bio_data_dir(io->base_bio) == READ)
+		process_read(io);
+	else
+		process_write(io);
 }
 
 /*
@@ -892,17 +886,14 @@ static int crypt_map(struct dm_target *t
 	struct crypt_io *io;
 
 	io = mempool_alloc(cc->io_pool, GFP_NOIO);
-
 	io->target = ti;
 	io->base_bio = bio;
 	io->first_clone = NULL;
-	io->error = 0;
+	io->error = io->post_process = 0;
 	atomic_set(&io->pending, 0);
+	kcryptd_queue_io(io);
 
-	if (bio_data_dir(bio) == WRITE)
-		return process_write(io);
-
-	return process_read(io);
+	return 0;
 }
 
 static int crypt_status(struct dm_target *ti, status_type_t type,
@@ -1011,7 +1002,7 @@ error:
 
 static struct target_type crypt_target = {
 	.name   = "crypt",
-	.version= {1, 2, 0},
+	.version= {1, 3, 0},
 	.module = THIS_MODULE,
 	.ctr    = crypt_ctr,
 	.dtr    = crypt_dtr,

Date: Thu, 21 Sep 2006 16:51:37 +0100
From: Alasdair G Kergon <agk@redhat.com>
Subject: [RHEL5 PATCH 27/30] dm crypt: use private biosets

In low-memory situations dm-crypt needs to use a private
mempool of bios to avoid blocking.
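
To make the resulting flow explicit, here is a compact sketch of the lifecycle the patch introduces (the identifiers and the three-argument bioset_create() signature are the ones used in the diff below; the comment lists the call sites, and the function body is the destructor added by the patch):

/*
 * Private bioset lifecycle (sketch):
 *
 *   crypt_ctr():    cc->bs = bioset_create(MIN_IOS, MIN_IOS, 4);
 *   allocation:     clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
 *                   clone->bi_destructor = dm_crypt_bio_destructor;
 *   last bio_put(): dm_crypt_bio_destructor() frees the bio back into cc->bs
 *   crypt_dtr():    bioset_free(cc->bs);
 */
static void dm_crypt_bio_destructor(struct bio *bio)
{
	struct crypt_io *io = bio->bi_private;
	struct crypt_config *cc = io->target->private;

	bio_free(bio, cc->bs);
}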

Index: linux-2.6.18.noarch/drivers/md/dm-crypt.c
===================================================================
--- linux-2.6.18.noarch.orig/drivers/md/dm-crypt.c
+++ linux-2.6.18.noarch/drivers/md/dm-crypt.c
@@ -76,6 +76,7 @@ struct crypt_config {
 	 */
 	mempool_t *io_pool;
 	mempool_t *page_pool;
+	struct bio_set *bs;
 
 	/*
 	 * crypto related data
@@ -92,7 +93,7 @@ struct crypt_config {
 	u8 key[0];
 };
 
-#define MIN_IOS        256
+#define MIN_IOS        16
 #define MIN_POOL_PAGES 32
 #define MIN_BIO_PAGES  8
 
@@ -307,6 +308,14 @@ static int crypt_convert(struct crypt_co
 	return r;
 }
 
+ static void dm_crypt_bio_destructor(struct bio *bio)
+ {
+	struct crypt_io *io = bio->bi_private;
+	struct crypt_config *cc = io->target->private;
+
+	bio_free(bio, cc->bs);
+ }
+
 /*
  * Generate a new unfragmented bio with the given size
  * This should never violate the device limitations
@@ -321,18 +330,17 @@ crypt_alloc_buffer(struct crypt_config *
 	gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM;
 	unsigned int i;
 
-	/*
-	 * Use __GFP_NOMEMALLOC to tell the VM to act less aggressively and
-	 * to fail earlier.  This is not necessary but increases throughput.
-	 * FIXME: Is this really intelligent?
-	 */
-	if (base_bio)
-		clone = bio_clone(base_bio, GFP_NOIO|__GFP_NOMEMALLOC);
-	else
-		clone = bio_alloc(GFP_NOIO|__GFP_NOMEMALLOC, nr_iovecs);
+	if (base_bio) {
+		clone = bio_alloc_bioset(GFP_NOIO, base_bio->bi_max_vecs, cc->bs);
+		__bio_clone(clone, base_bio);
+	} else
+		clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
+
 	if (!clone)
 		return NULL;
 
+	clone->bi_destructor = dm_crypt_bio_destructor;
+
 	/* if the last bio was not complete, continue where that one ended */
 	clone->bi_idx = *bio_vec_idx;
 	clone->bi_vcnt = *bio_vec_idx;
@@ -513,13 +521,14 @@ static void process_read(struct crypt_io
 	 * copy the required bvecs because we need the original
 	 * one in order to decrypt the whole bio data *afterwards*.
 	 */
-	clone = bio_alloc(GFP_NOIO, bio_segments(base_bio));
+	clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs);
 	if (unlikely(!clone)) {
 		dec_pending(io, -ENOMEM);
 		return;
 	}
 
 	clone_init(io, clone);
+	clone->bi_destructor = dm_crypt_bio_destructor;
 	clone->bi_idx = 0;
 	clone->bi_vcnt = bio_segments(base_bio);
 	clone->bi_size = base_bio->bi_size;
@@ -590,7 +599,6 @@ static void process_write(struct crypt_i
 		/* out of memory -> run queues */
 		if (remaining)
 			blk_congestion_wait(bio_data_dir(clone), HZ/100);
-
 	}
 }
 
@@ -807,6 +815,12 @@ static int crypt_ctr(struct dm_target *t
 		goto bad4;
 	}
 
+	cc->bs = bioset_create(MIN_IOS, MIN_IOS, 4);
+	if (!cc->bs) {
+		ti->error = "Cannot allocate crypt bioset";
+		goto bad_bs;
+	}
+
 	if (tfm->crt_cipher.cit_setkey(tfm, cc->key, key_size) < 0) {
 		ti->error = "Error setting key";
 		goto bad5;
@@ -846,6 +860,8 @@ static int crypt_ctr(struct dm_target *t
 	return 0;
 
 bad5:
+	bioset_free(cc->bs);
+bad_bs:
 	mempool_destroy(cc->page_pool);
 bad4:
 	mempool_destroy(cc->io_pool);
@@ -865,6 +881,7 @@ static void crypt_dtr(struct dm_target *
 {
 	struct crypt_config *cc = (struct crypt_config *) ti->private;
 
+	bioset_free(cc->bs);
 	mempool_destroy(cc->page_pool);
 	mempool_destroy(cc->io_pool);
 

Date: Thu, 21 Sep 2006 17:05:04 +0100
From: Alasdair G Kergon <agk@redhat.com>
Subject: [RHEL5 PATCH 28/30] dm: use private biosets

I found a problem within device-mapper that occurs in low-memory
situations. It was found using a mirror target, but in theory it would
hit any setup that stacks device-mapper devices (such as LVM on top of
multipath).

Since the device-mapper core uses the common fs_bioset in clone_bio(),
and a private but still global bio_set in split_bvec(), it is possible
that the filesystem and the first-level target successfully get bios
while a lower-level target does not, because there is no memory left and
the pool has been drained by the upper layers. The remapping is then
stuck forever. To solve this, the device-mapper core needs to use a
private bio_set for each device.
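
The same idea, reduced to its essentials for dm.c (identifiers from the diff; the comment summarises the call sites, and the destructor works because the end-io and error paths store md->bs in bi_private just before the final bio_put(), as the patch does):

/*
 * Per-device bioset lifecycle (sketch):
 *
 *   alloc_dev():              md->bs = bioset_create(16, 16, 4);
 *   clone_bio()/split_bvec(): clone = bio_alloc_bioset(GFP_NOIO, ..., ci->md->bs);
 *                             clone->bi_destructor = dm_bio_destructor;
 *   end-io / error paths:     bio->bi_private = md->bs;  then bio_put(bio);
 *   free_dev():               bioset_free(md->bs);
 */
static void dm_bio_destructor(struct bio *bio)
{
	struct bio_set *bs = bio->bi_private;

	bio_free(bio, bs);
}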

Index: linux-2.6.18.noarch/drivers/md/dm.c
===================================================================
--- linux-2.6.18.noarch.orig/drivers/md/dm.c
+++ linux-2.6.18.noarch/drivers/md/dm.c
@@ -102,6 +102,8 @@ struct mapped_device {
 	mempool_t *io_pool;
 	mempool_t *tio_pool;
 
+	struct bio_set *bs;
+
 	/*
 	 * Event handling.
 	 */
@@ -122,16 +124,10 @@ struct mapped_device {
 static kmem_cache_t *_io_cache;
 static kmem_cache_t *_tio_cache;
 
-static struct bio_set *dm_set;
-
 static int __init local_init(void)
 {
 	int r;
 
-	dm_set = bioset_create(16, 16, 4);
-	if (!dm_set)
-		return -ENOMEM;
-
 	/* allocate a slab for the dm_ios */
 	_io_cache = kmem_cache_create("dm_io",
 				      sizeof(struct dm_io), 0, 0, NULL, NULL);
@@ -165,8 +161,6 @@ static void local_exit(void)
 	kmem_cache_destroy(_tio_cache);
 	kmem_cache_destroy(_io_cache);
 
-	bioset_free(dm_set);
-
 	if (unregister_blkdev(_major, _name) < 0)
 		DMERR("unregister_blkdev failed");
 
@@ -475,7 +469,7 @@ static int clone_endio(struct bio *bio, 
 {
 	int r = 0;
 	struct target_io *tio = bio->bi_private;
-	struct dm_io *io = tio->io;
+	struct mapped_device *md = tio->io->md;
 	dm_endio_fn endio = tio->ti->type->end_io;
 
 	if (bio->bi_size)
@@ -494,9 +488,15 @@ static int clone_endio(struct bio *bio, 
 			return 1;
 	}
 
-	free_tio(io->md, tio);
-	dec_pending(io, error);
+	dec_pending(tio->io, error);
+
+	/*
+	 * Store md for cleanup instead of tio which is about to get freed.
+	 */
+	bio->bi_private = md->bs;
+
 	bio_put(bio);
+	free_tio(md, tio);
 	return r;
 }
 
@@ -525,6 +525,7 @@ static void __map_bio(struct dm_target *
 {
 	int r;
 	sector_t sector;
+	struct mapped_device *md;
 
 	/*
 	 * Sanity checks.
@@ -554,10 +555,14 @@ static void __map_bio(struct dm_target *
 
 	else if (r < 0) {
 		/* error the io and bail out */
-		struct dm_io *io = tio->io;
-		free_tio(tio->io->md, tio);
-		dec_pending(io, r);
+		md = tio->io->md;
+		dec_pending(tio->io, r);
+		/*
+		 * Store bio_set for cleanup.
+		 */
+		clone->bi_private = md->bs;
 		bio_put(clone);
+		free_tio(md, tio);
 	}
 }
 
@@ -573,7 +578,9 @@ struct clone_info {
 
 static void dm_bio_destructor(struct bio *bio)
 {
-	bio_free(bio, dm_set);
+	struct bio_set *bs = bio->bi_private;
+
+	bio_free(bio, bs);
 }
 
 /*
@@ -581,12 +588,12 @@ static void dm_bio_destructor(struct bio
  */
 static struct bio *split_bvec(struct bio *bio, sector_t sector,
 			      unsigned short idx, unsigned int offset,
-			      unsigned int len)
+			      unsigned int len, struct bio_set *bs)
 {
 	struct bio *clone;
 	struct bio_vec *bv = bio->bi_io_vec + idx;
 
-	clone = bio_alloc_bioset(GFP_NOIO, 1, dm_set);
+	clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
 	clone->bi_destructor = dm_bio_destructor;
 	*clone->bi_io_vec = *bv;
 
@@ -606,11 +613,13 @@ static struct bio *split_bvec(struct bio
  */
 static struct bio *clone_bio(struct bio *bio, sector_t sector,
 			     unsigned short idx, unsigned short bv_count,
-			     unsigned int len)
+			     unsigned int len, struct bio_set *bs)
 {
 	struct bio *clone;
 
-	clone = bio_clone(bio, GFP_NOIO);
+	clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
+	__bio_clone(clone, bio);
+	clone->bi_destructor = dm_bio_destructor;
 	clone->bi_sector = sector;
 	clone->bi_idx = idx;
 	clone->bi_vcnt = idx + bv_count;
@@ -641,7 +650,8 @@ static void __clone_and_map(struct clone
 		 * the remaining io with a single clone.
 		 */
 		clone = clone_bio(bio, ci->sector, ci->idx,
-				  bio->bi_vcnt - ci->idx, ci->sector_count);
+				  bio->bi_vcnt - ci->idx, ci->sector_count,
+				  ci->md->bs);
 		__map_bio(ti, clone, tio);
 		ci->sector_count = 0;
 
@@ -664,7 +674,8 @@ static void __clone_and_map(struct clone
 			len += bv_len;
 		}
 
-		clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len);
+		clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
+				  ci->md->bs);
 		__map_bio(ti, clone, tio);
 
 		ci->sector += len;
@@ -693,7 +704,8 @@ static void __clone_and_map(struct clone
 			len = min(remaining, max);
 
 			clone = split_bvec(bio, ci->sector, ci->idx,
-					   bv->bv_offset + offset, len);
+					   bv->bv_offset + offset, len,
+					   ci->md->bs);
 
 			__map_bio(ti, clone, tio);
 
@@ -961,6 +973,10 @@ static struct mapped_device *alloc_dev(i
 	if (!md->tio_pool)
 		goto bad3;
 
+	md->bs = bioset_create(16, 16, 4);
+	if (!md->bs)
+		goto bad_no_bioset;
+
 	md->disk = alloc_disk(1);
 	if (!md->disk)
 		goto bad4;
@@ -988,6 +1004,8 @@ static struct mapped_device *alloc_dev(i
 	return md;
 
  bad4:
+	bioset_free(md->bs);
+ bad_no_bioset:
 	mempool_destroy(md->tio_pool);
  bad3:
 	mempool_destroy(md->io_pool);
@@ -1012,6 +1030,7 @@ static void free_dev(struct mapped_devic
 	}
 	mempool_destroy(md->tio_pool);
 	mempool_destroy(md->io_pool);
+	bioset_free(md->bs);
 	del_gendisk(md->disk);
 	free_minor(minor);