Date: Thu, 21 Sep 2006 16:47:20 +0100 From: Alasdair G Kergon <agk@redhat.com> Subject: [RHEL5 PATCH 12/30] dm snapshot: tidy snapshot_map This patch rearranges the snapshot_map code so that the functional changes in subsequent patches are clearer. The only functional change is to replace the existing read lock with a write lock which the next patch needs. Index: linux-2.6.18.noarch/drivers/md/dm-snap.c =================================================================== --- linux-2.6.18.noarch.orig/drivers/md/dm-snap.c +++ linux-2.6.18.noarch/drivers/md/dm-snap.c @@ -851,7 +851,6 @@ static int snapshot_map(struct dm_target { struct exception *e; struct dm_snapshot *s = (struct dm_snapshot *) ti->private; - int copy_needed = 0; int r = 1; chunk_t chunk; struct pending_exception *pe = NULL; @@ -866,29 +865,28 @@ static int snapshot_map(struct dm_target if (unlikely(bio_barrier(bio))) return -EOPNOTSUPP; + /* FIXME: should only take write lock if we need + * to copy an exception */ + down_write(&s->lock); + + if (!s->valid) { + r = -EIO; + goto out_unlock; + } + + /* If the block is already remapped - use that, else remap it */ + e = lookup_exception(&s->complete, chunk); + if (e) { + remap_exception(s, e, bio); + goto out_unlock; + } + /* * Write to snapshot - higher level takes care of RW/RO * flags so we should only get this if we are * writeable. */ if (bio_rw(bio) == WRITE) { - - /* FIXME: should only take write lock if we need - * to copy an exception */ - down_write(&s->lock); - - if (!s->valid) { - r = -EIO; - goto out_unlock; - } - - /* If the block is already remapped - use that, else remap it */ - e = lookup_exception(&s->complete, chunk); - if (e) { - remap_exception(s, e, bio); - goto out_unlock; - } - pe = __find_pending_exception(s, bio); if (!pe) { __invalidate_snapshot(s, pe, -ENOMEM); @@ -899,45 +897,27 @@ static int snapshot_map(struct dm_target remap_exception(s, &pe->e, bio); bio_list_add(&pe->snapshot_bios, bio); + r = 0; + if (!pe->started) { /* this is protected by snap->lock */ pe->started = 1; - copy_needed = 1; - } - - r = 0; - - out_unlock: - up_write(&s->lock); - - if (copy_needed) + up_write(&s->lock); start_copy(pe); - } else { + goto out; + } + } else /* * FIXME: this read path scares me because we * always use the origin when we have a pending * exception. However I can't think of a * situation where this is wrong - ejt. */ + bio->bi_bdev = s->origin->bdev; - /* Do reads */ - down_read(&s->lock); - - if (!s->valid) { - up_read(&s->lock); - return -EIO; - } - - /* See if it it has been remapped */ - e = lookup_exception(&s->complete, chunk); - if (e) - remap_exception(s, e, bio); - else - bio->bi_bdev = s->origin->bdev; - - up_read(&s->lock); - } - + out_unlock: + up_write(&s->lock); + out: return r; } Date: Thu, 21 Sep 2006 16:47:40 +0100 From: Alasdair G Kergon <agk@redhat.com> Subject: [RHEL5 PATCH 13/30] dm snapshot: tidy pending_complete This patch rearranges the pending_complete() code so that the functional changes in subsequent patches are clearer. By consolidating the error and the non-error paths, we can move error_snapshot_bios() and __flush_bios() in line. Index: linux-2.6.18.noarch/drivers/md/dm-snap.c =================================================================== --- linux-2.6.18.noarch.orig/drivers/md/dm-snap.c +++ linux-2.6.18.noarch/drivers/md/dm-snap.c @@ -609,26 +609,6 @@ static void error_bios(struct bio *bio) } } -static inline void error_snapshot_bios(struct pending_exception *pe) -{ - error_bios(bio_list_get(&pe->snapshot_bios)); -} - -static struct bio *__flush_bios(struct pending_exception *pe) -{ - /* - * If this pe is involved in a write to the origin and - * it is the last sibling to complete then release - * the bios for the original write to the origin. - */ - - if (pe->primary_pe && - atomic_dec_and_test(&pe->primary_pe->sibling_count)) - return bio_list_get(&pe->primary_pe->origin_bios); - - return NULL; -} - static void __invalidate_snapshot(struct dm_snapshot *s, struct pending_exception *pe, int err) { @@ -656,16 +636,15 @@ static void pending_complete(struct pend struct exception *e; struct pending_exception *primary_pe; struct dm_snapshot *s = pe->snap; - struct bio *flush = NULL; + struct bio *origin_bios = NULL; + struct bio *snapshot_bios = NULL; + int error = 0; if (!success) { /* Read/write error - snapshot is unusable */ down_write(&s->lock); __invalidate_snapshot(s, pe, -EIO); - flush = __flush_bios(pe); - up_write(&s->lock); - - error_snapshot_bios(pe); + error = 1; goto out; } @@ -673,42 +652,40 @@ static void pending_complete(struct pend if (!e) { down_write(&s->lock); __invalidate_snapshot(s, pe, -ENOMEM); - flush = __flush_bios(pe); - up_write(&s->lock); - - error_snapshot_bios(pe); + error = 1; goto out; } *e = pe->e; - /* - * Add a proper exception, and remove the - * in-flight exception from the list. - */ down_write(&s->lock); if (!s->valid) { - flush = __flush_bios(pe); - up_write(&s->lock); - free_exception(e); - - error_snapshot_bios(pe); + error = 1; goto out; } + /* + * Add a proper exception, and remove the + * in-flight exception from the list. + */ insert_exception(&s->complete, e); remove_exception(&pe->e); - flush = __flush_bios(pe); - - up_write(&s->lock); - - /* Submit any pending write bios */ - flush_bios(bio_list_get(&pe->snapshot_bios)); out: + snapshot_bios = bio_list_get(&pe->snapshot_bios); + primary_pe = pe->primary_pe; /* + * If this pe is involved in a write to the origin and + * it is the last sibling to complete then release + * the bios for the original write to the origin. + */ + if (primary_pe && + atomic_dec_and_test(&primary_pe->sibling_count)) + origin_bios = bio_list_get(&primary_pe->origin_bios); + + /* * Free the pe if it's not linked to an origin write or if * it's not itself a primary pe. */ @@ -721,8 +698,15 @@ static void pending_complete(struct pend if (primary_pe && !atomic_read(&primary_pe->sibling_count)) free_pending_exception(primary_pe); - if (flush) - flush_bios(flush); + up_write(&s->lock); + + /* Submit any pending write bios */ + if (error) + error_bios(snapshot_bios); + else + flush_bios(snapshot_bios); + + flush_bios(origin_bios); } static void commit_callback(void *context, int success) Date: Thu, 21 Sep 2006 16:47:55 +0100 From: Alasdair G Kergon <agk@redhat.com> Subject: [RHEL5 PATCH 14/30] dm snapshot: add workqueue Add a workqueue so that I/O can be queued up to be flushed from a separate thread (e.g. if local interrupts are disabled). A new per-snapshot spinlock pe_lock is introduced to protect queued_bios. Index: linux-2.6.18.noarch/drivers/md/dm-snap.c =================================================================== --- linux-2.6.18.noarch.orig/drivers/md/dm-snap.c +++ linux-2.6.18.noarch/drivers/md/dm-snap.c @@ -39,6 +39,9 @@ */ #define SNAPSHOT_PAGES 256 +struct workqueue_struct *ksnapd; +static void flush_queued_bios(void *data); + struct pending_exception { struct exception e; @@ -488,6 +491,7 @@ static int snapshot_ctr(struct dm_target s->active = 0; s->last_percent = 0; init_rwsem(&s->lock); + spin_lock_init(&s->pe_lock); s->table = ti->table; /* Allocate hash table for COW data */ @@ -523,6 +527,9 @@ static int snapshot_ctr(struct dm_target goto bad6; } + bio_list_init(&s->queued_bios); + INIT_WORK(&s->queued_bios_work, flush_queued_bios, s); + /* Add snapshot to the list of snapshots for this origin */ /* Exceptions aren't triggered till snapshot_resume() is called */ if (register_snapshot(s)) { @@ -561,6 +568,8 @@ static void snapshot_dtr(struct dm_targe { struct dm_snapshot *s = (struct dm_snapshot *) ti->private; + flush_workqueue(ksnapd); + /* Prevent further origin writes from using this snapshot. */ /* After this returns there can be no new kcopyd jobs. */ unregister_snapshot(s); @@ -594,6 +603,19 @@ static void flush_bios(struct bio *bio) } } +static void flush_queued_bios(void *data) +{ + struct dm_snapshot *s = (struct dm_snapshot *) data; + struct bio *queued_bios; + unsigned long flags; + + spin_lock_irqsave(&s->pe_lock, flags); + queued_bios = bio_list_get(&s->queued_bios); + spin_unlock_irqrestore(&s->pe_lock, flags); + + flush_bios(queued_bios); +} + /* * Error a list of buffers. */ @@ -1240,8 +1262,17 @@ static int __init dm_snapshot_init(void) goto bad5; } + ksnapd = create_singlethread_workqueue("ksnapd"); + if (!ksnapd) { + DMERR("Failed to create ksnapd workqueue."); + r = -ENOMEM; + goto bad6; + } + return 0; + bad6: + mempool_destroy(pending_pool); bad5: kmem_cache_destroy(pending_cache); bad4: @@ -1259,6 +1290,8 @@ static void __exit dm_snapshot_exit(void { int r; + destroy_workqueue(ksnapd); + r = dm_unregister_target(&snapshot_target); if (r) DMERR("snapshot unregister failed %d", r); Index: linux-2.6.18.noarch/drivers/md/dm-snap.h =================================================================== --- linux-2.6.18.noarch.orig/drivers/md/dm-snap.h +++ linux-2.6.18.noarch/drivers/md/dm-snap.h @@ -10,7 +10,9 @@ #define DM_SNAPSHOT_H #include "dm.h" +#include "dm-bio-list.h" #include <linux/blkdev.h> +#include <linux/workqueue.h> struct exception_table { uint32_t hash_mask; @@ -112,10 +114,20 @@ struct dm_snapshot { struct exception_table pending; struct exception_table complete; + /* + * pe_lock protects all pending_exception operations and access + * as well as the snapshot_bios list. + */ + spinlock_t pe_lock; + /* The on disk metadata handler */ struct exception_store store; struct kcopyd_client *kcopyd_client; + + /* Queue of snapshot writes for ksnapd to flush */ + struct bio_list queued_bios; + struct work_struct queued_bios_work; }; /* Date: Thu, 21 Sep 2006 16:48:18 +0100 From: Alasdair G Kergon <agk@redhat.com> Subject: [RHEL5 PATCH 15/30] dm snapshot: tidy pe ref counting Rename sibling_count to ref_count and introduce get and put functions. Index: linux-2.6.18.noarch/drivers/md/dm-snap.c =================================================================== --- linux-2.6.18.noarch.orig/drivers/md/dm-snap.c +++ linux-2.6.18.noarch/drivers/md/dm-snap.c @@ -59,7 +59,7 @@ struct pending_exception { /* * The primary pending_exception is the one that holds - * the sibling_count and the list of origin_bios for a + * the ref_count and the list of origin_bios for a * group of pending_exceptions. It is always last to get freed. * These fields get set up when writing to the origin. */ @@ -72,7 +72,7 @@ struct pending_exception { * the sibling concerned and not pe->primary_pe->snap->lock unless * they are the same. */ - atomic_t sibling_count; + atomic_t ref_count; /* Pointer back to snapshot context */ struct dm_snapshot *snap; @@ -653,10 +653,46 @@ static void __invalidate_snapshot(struct dm_table_event(s->table); } +static void get_pending_exception(struct pending_exception *pe) +{ + atomic_inc(&pe->ref_count); +} + +static struct bio *put_pending_exception(struct pending_exception *pe) +{ + struct pending_exception *primary_pe; + struct bio *origin_bios = NULL; + + primary_pe = pe->primary_pe; + + /* + * If this pe is involved in a write to the origin and + * it is the last sibling to complete then release + * the bios for the original write to the origin. + */ + if (primary_pe && + atomic_dec_and_test(&primary_pe->ref_count)) + origin_bios = bio_list_get(&primary_pe->origin_bios); + + /* + * Free the pe if it's not linked to an origin write or if + * it's not itself a primary pe. + */ + if (!primary_pe || primary_pe != pe) + free_pending_exception(pe); + + /* + * Free the primary pe if nothing references it. + */ + if (primary_pe && !atomic_read(&primary_pe->ref_count)) + free_pending_exception(primary_pe); + + return origin_bios; +} + static void pending_complete(struct pending_exception *pe, int success) { struct exception *e; - struct pending_exception *primary_pe; struct dm_snapshot *s = pe->snap; struct bio *origin_bios = NULL; struct bio *snapshot_bios = NULL; @@ -695,30 +731,7 @@ static void pending_complete(struct pend out: snapshot_bios = bio_list_get(&pe->snapshot_bios); - - primary_pe = pe->primary_pe; - - /* - * If this pe is involved in a write to the origin and - * it is the last sibling to complete then release - * the bios for the original write to the origin. - */ - if (primary_pe && - atomic_dec_and_test(&primary_pe->sibling_count)) - origin_bios = bio_list_get(&primary_pe->origin_bios); - - /* - * Free the pe if it's not linked to an origin write or if - * it's not itself a primary pe. - */ - if (!primary_pe || primary_pe != pe) - free_pending_exception(pe); - - /* - * Free the primary pe if nothing references it. - */ - if (primary_pe && !atomic_read(&primary_pe->sibling_count)) - free_pending_exception(primary_pe); + origin_bios = put_pending_exception(pe); up_write(&s->lock); @@ -829,7 +842,7 @@ __find_pending_exception(struct dm_snaps bio_list_init(&pe->origin_bios); bio_list_init(&pe->snapshot_bios); pe->primary_pe = NULL; - atomic_set(&pe->sibling_count, 1); + atomic_set(&pe->ref_count, 0); pe->snap = s; pe->started = 0; @@ -838,6 +851,7 @@ __find_pending_exception(struct dm_snaps return NULL; } + get_pending_exception(pe); insert_exception(&s->pending, &pe->e); out: @@ -1012,7 +1026,7 @@ static int __origin_write(struct list_he * is already remapped in this snapshot * and trigger an exception if not. * - * sibling_count is initialised to 1 so pending_complete() + * ref_count is initialised to 1 so pending_complete() * won't destroy the primary_pe while we're inside this loop. */ e = lookup_exception(&snap->complete, chunk); @@ -1043,8 +1057,8 @@ static int __origin_write(struct list_he } if (!pe->primary_pe) { - atomic_inc(&primary_pe->sibling_count); pe->primary_pe = primary_pe; + get_pending_exception(primary_pe); } if (!pe->started) { @@ -1057,20 +1071,20 @@ static int __origin_write(struct list_he } if (!primary_pe) - goto out; + return r; /* * If this is the first time we're processing this chunk and - * sibling_count is now 1 it means all the pending exceptions + * ref_count is now 1 it means all the pending exceptions * got completed while we were in the loop above, so it falls to * us here to remove the primary_pe and submit any origin_bios. */ - if (first && atomic_dec_and_test(&primary_pe->sibling_count)) { + if (first && atomic_dec_and_test(&primary_pe->ref_count)) { flush_bios(bio_list_get(&primary_pe->origin_bios)); free_pending_exception(primary_pe); /* If we got here, pe_queue is necessarily empty. */ - goto out; + return r; } /* @@ -1079,7 +1093,6 @@ static int __origin_write(struct list_he list_for_each_entry_safe(pe, next_pe, &pe_queue, list) start_copy(pe); - out: return r; } Date: Thu, 21 Sep 2006 16:48:35 +0100 From: Alasdair G Kergon <agk@redhat.com> Subject: [RHEL5 PATCH 16/30] dm snapshot: fix freeing pending exception If a snapshot became invalid while there are outstanding pending_exceptions, when pending_complete() processes each one it forgets to remove the corresponding exception from its exception table before freeing it. Fix this by moving the 'out:' label up one statement so that remove_exception() is always called. Then __invalidate_exception() no longer needs to call it and its 'pe' argument become superfluous. Index: linux-2.6.18.noarch/drivers/md/dm-snap.c =================================================================== --- linux-2.6.18.noarch.orig/drivers/md/dm-snap.c +++ linux-2.6.18.noarch/drivers/md/dm-snap.c @@ -631,8 +631,7 @@ static void error_bios(struct bio *bio) } } -static void __invalidate_snapshot(struct dm_snapshot *s, - struct pending_exception *pe, int err) +static void __invalidate_snapshot(struct dm_snapshot *s, int err) { if (!s->valid) return; @@ -642,9 +641,6 @@ static void __invalidate_snapshot(struct else if (err == -ENOMEM) DMERR("Invalidating snapshot: Unable to allocate exception."); - if (pe) - remove_exception(&pe->e); - if (s->store.drop_snapshot) s->store.drop_snapshot(&s->store); @@ -701,7 +697,7 @@ static void pending_complete(struct pend if (!success) { /* Read/write error - snapshot is unusable */ down_write(&s->lock); - __invalidate_snapshot(s, pe, -EIO); + __invalidate_snapshot(s, -EIO); error = 1; goto out; } @@ -709,7 +705,7 @@ static void pending_complete(struct pend e = alloc_exception(); if (!e) { down_write(&s->lock); - __invalidate_snapshot(s, pe, -ENOMEM); + __invalidate_snapshot(s, -ENOMEM); error = 1; goto out; } @@ -727,9 +723,9 @@ static void pending_complete(struct pend * in-flight exception from the list. */ insert_exception(&s->complete, e); - remove_exception(&pe->e); out: + remove_exception(&pe->e); snapshot_bios = bio_list_get(&pe->snapshot_bios); origin_bios = put_pending_exception(pe); @@ -909,7 +905,7 @@ static int snapshot_map(struct dm_target if (bio_rw(bio) == WRITE) { pe = __find_pending_exception(s, bio); if (!pe) { - __invalidate_snapshot(s, pe, -ENOMEM); + __invalidate_snapshot(s, -ENOMEM); r = -EIO; goto out_unlock; } @@ -1035,7 +1031,7 @@ static int __origin_write(struct list_he pe = __find_pending_exception(snap, bio); if (!pe) { - __invalidate_snapshot(snap, pe, -ENOMEM); + __invalidate_snapshot(snap, -ENOMEM); goto next_snapshot; }