From: heinzm@redhat.com <heinzm@redhat.com> Date: Wed, 27 Aug 2008 14:16:42 +0200 Subject: [md] add device-mapper dirty region hash file Message-id: 20080827121637.900954252@redhat.com O-Subject: [PATCH RHEL5.3 437180 06/11] dm-region_hash.c.patch Bugzilla: 437180 This patch introduces the device-mapper dirty region hash, factored out of dm-mirror to allow for sharing it. Heinz Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com> diff --git a/drivers/md/dm-region_hash.c b/drivers/md/dm-region_hash.c new file mode 100644 index 0000000..7c1ab1f --- /dev/null +++ b/drivers/md/dm-region_hash.c @@ -0,0 +1,644 @@ +/* + * Copyright (C) 2003 Sistina Software Limited. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include "dm.h" +#include "dm-region_hash.h" + +#include <linux/dm-dirty-log.h> +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/vmalloc.h> + +#define DM_MSG_PREFIX "region hash" + +/*----------------------------------------------------------------- + * Region hash + * + * The set splits itself up into discrete regions. + * Each region can be in one of three states: + * + * o clean + * o dirty, + * o nosync. + * + * There is no need to put clean regions in the hash. + * + * + * In addition to being present in the hash table a region _may_ + * be present on one of three lists. + * + * clean_regions: Regions on this list have no io pending to + * them, they are in sync, we are no longer interested in them, + * they are dull. rh_update_states() will remove them from the + * hash table. + * + * quiesced_regions: These regions have been spun down, ready + * for recovery. rh_recovery_start() will remove regions from + * this list and hand them to the caller, which will schedule the + * recovery io. + * + * recovered_regions: Regions that the caller has successfully + * recovered. rh_update_states() will now schedule any delayed + * io, up the recovery_count, and remove the region from the hash. + * + * There are 2 locks: + * A rw spin lock 'hash_lock' protects just the hash table, + * this is never held in write mode from interrupt context, + * which I believe means that we only have to disable irqs when + * doing a write lock. + * + * An ordinary spin lock 'region_lock' that protects the three + * lists in the region_hash, with the 'state', 'list' and + * 'delayed_bios' fields of the regions. This is used from irq + * context, so all other uses will have to suspend local irqs. + *---------------------------------------------------------------*/ +enum region_hash_flags { + RECOVERY, +}; + +struct region_hash { + unsigned int max_recovery; /* Max # of regions to recover in parallel */ + unsigned long flags; + + /* Callback function to dispatch queued writes on recovered regions. */ + void (*dispatch)(void *context, struct bio_list *bios); + void *dispatch_context; + + /* Callback function to wakeup callers worker thread. 
*/ + void (*wake)(void *context); + void *wake_context; + + uint32_t region_size; + unsigned int region_shift; + + /* holds persistent region state */ + struct dm_dirty_log *log; + + /* hash table */ + rwlock_t hash_lock; + mempool_t *region_pool; + unsigned int mask; + unsigned int nr_buckets; + unsigned int prime; + unsigned int shift; + struct list_head *buckets; + + spinlock_t region_lock; + struct semaphore recovery_count; + struct list_head clean_regions; + struct list_head quiesced_regions; + struct list_head recovered_regions; +}; + +struct region { + struct region_hash *rh; /* FIXME: can we get rid of this ? */ + region_t key; + int state; + void *context; /* Caller context. */ + + struct list_head hash_list; + struct list_head list; + + atomic_t pending; + struct bio_list delayed_bios; +}; + +/* + * Conversion fns + */ +region_t rh_sector_to_region(void *rh, sector_t sector) +{ + return sector >> ((struct region_hash*) rh)->region_shift; +} + +region_t rh_bio_to_region(void *rh, struct bio *bio) +{ + return rh_sector_to_region(rh, bio->bi_sector); +} + +sector_t rh_region_to_sector(void *rh, region_t region) +{ + return region << ((struct region_hash*) rh)->region_shift; +} + +/* + * Retrival fns. + */ +region_t rh_get_region_key(void *reg) +{ + return ((struct region *)reg)->key; +} + +sector_t rh_get_region_size(void *rh) +{ + return ((struct region_hash *)rh)->region_size; +} + +/* Squirrel a context with a region. */ +void *rh_reg_get_context(void *reg) +{ + return ((struct region*) reg)->context; +} + +void rh_reg_set_context(void *reg, void *context) +{ + ((struct region*) reg)->context = context; +} + +/* + * Region struct allocation/free. + */ +static void *region_alloc(unsigned int gfp_mask, void *pool_data) +{ + return kmalloc(sizeof(struct region), gfp_mask); +} + +static void region_free(void *element, void *pool_data) +{ + kfree(element); +} + +#define MIN_REGIONS 64 +int rh_init(void **region_hash, + unsigned int max_recovery, + + void (*dispatch)(void *dispatch_context, struct bio_list *bios), + void *dispatch_context, + void (*wake)(void *wake_context), + void *wake_context, + struct dm_dirty_log *log, uint32_t region_size, + region_t nr_regions) +{ + unsigned int nr_buckets, max_buckets; + unsigned hash_primes[] = { + /* Table of primes for rh_hash/table size optimization. */ + 3, 7, 13, 27, 53, 97, 193, 389, 769, + 1543, 3079, 6151, 12289, 24593, + }; + size_t i; + struct region_hash *rh; + + if (region_size & (region_size - 1)) { + DMERR("region size must be 2^^n"); + return -EINVAL; + } + + rh = kmalloc(sizeof(*rh), GFP_KERNEL); + if (!rh) { + DMERR("unable to allocate region hash memory"); + return -ENOMEM; + } + + rh->max_recovery = max_recovery; + rh->dispatch = dispatch; + rh->dispatch_context = dispatch_context; + rh->wake = wake; + rh->wake_context = wake_context; + rh->log = log; + rh->region_size = region_size; + rh->region_shift = ffs(region_size) - 1; + rwlock_init(&rh->hash_lock); + + /* Calculate a suitable number of buckets for our hash table. 
*/ + max_buckets = nr_regions >> 6; + for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1); + nr_buckets >>= 1; + rh->mask = rh->nr_buckets = nr_buckets; + rh->mask--; + rh->shift = ffs(nr_buckets); + rh->prime = hash_primes[rh->shift - 1]; + if (rh->prime > ARRAY_SIZE(hash_primes) - 2) + rh->prime = ARRAY_SIZE(hash_primes) - 1; + + rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets)); + if (!rh->buckets) { + DMERR("unable to allocate region hash bucket memory"); + vfree(rh); + return -ENOMEM; + } + + for (i = 0; i < nr_buckets; i++) + INIT_LIST_HEAD(rh->buckets + i); + + spin_lock_init(&rh->region_lock); + sema_init(&rh->recovery_count, 0); + INIT_LIST_HEAD(&rh->clean_regions); + INIT_LIST_HEAD(&rh->quiesced_regions); + INIT_LIST_HEAD(&rh->recovered_regions); + + rh->region_pool = mempool_create(MIN_REGIONS, region_alloc, + region_free, NULL); + if (!rh->region_pool) { + vfree(rh->buckets); + vfree(rh); + return -ENOMEM; + } + + *region_hash = rh; + + return 0; +} + +void rh_exit(void *v) +{ + unsigned int h; + struct region *reg, *tmp; + struct region_hash *rh = v; + + BUG_ON(!list_empty(&rh->quiesced_regions)); + + for (h = 0; h < rh->nr_buckets; h++) { + list_for_each_entry_safe(reg, tmp, rh->buckets + h, hash_list) { + BUG_ON(atomic_read(®->pending)); + mempool_free(reg, rh->region_pool); + } + } + + dm_dirty_log_destroy(rh->log); + + if (rh->region_pool) + mempool_destroy(rh->region_pool); + + vfree(rh->buckets); + kfree(rh); +} + +static inline unsigned int rh_hash(struct region_hash *rh, region_t region) +{ + return (unsigned int) ((region * rh->prime) >> rh->shift) & rh->mask; +} + +static struct region *__rh_lookup(struct region_hash *rh, region_t region) +{ + struct region *reg; + struct list_head *bucket = rh->buckets + rh_hash(rh, region); + + list_for_each_entry(reg, bucket, hash_list) { + if (reg->key == region) + return reg; + } + + return NULL; +} + +static void __rh_insert(struct region_hash *rh, struct region *reg) +{ + unsigned int h = rh_hash(rh, reg->key); + list_add(®->hash_list, rh->buckets + h); +} + +static struct region *__rh_alloc(struct region_hash *rh, region_t region) +{ + struct region *reg, *nreg; + + read_unlock(&rh->hash_lock); + + nreg = mempool_alloc(rh->region_pool, GFP_NOIO); + nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? + RH_CLEAN : RH_NOSYNC; + nreg->rh = rh; + nreg->key = region; + + INIT_LIST_HEAD(&nreg->list); + + atomic_set(&nreg->pending, 0); + bio_list_init(&nreg->delayed_bios); + + write_lock_irq(&rh->hash_lock); + + reg = __rh_lookup(rh, region); + if (reg) + /* we lost the race */ + mempool_free(nreg, rh->region_pool); + else { + __rh_insert(rh, nreg); + if (nreg->state == RH_CLEAN) { + spin_lock(&rh->region_lock); + list_add(&nreg->list, &rh->clean_regions); + spin_unlock(&rh->region_lock); + } + reg = nreg; + } + + write_unlock_irq(&rh->hash_lock); + read_lock(&rh->hash_lock); + + return reg; +} + +static inline struct region *__rh_find(struct region_hash *rh, region_t region) +{ + struct region *reg; + + reg = __rh_lookup(rh, region); + if (!reg) + reg = __rh_alloc(rh, region); + + return reg; +} + +int rh_state(void *v, region_t region, int may_block) +{ + int r = 0; + struct region *reg; + struct region_hash *rh = v; + + read_lock(&rh->hash_lock); + reg = __rh_lookup(rh, region); + if (reg) + r = reg->state; + read_unlock(&rh->hash_lock); + + if (r) + return r; + + /* + * The region wasn't in the hash, so we fall back to the dirty log. 
+ */ + r = rh->log->type->in_sync(rh->log, region, may_block); + + /* + * Any error from the dirty log (eg. -EWOULDBLOCK) gets + * taken as a RH_NOSYNC + */ + return r == 1 ? RH_CLEAN : RH_NOSYNC; +} + +void rh_update_states(void *v) +{ + struct region *reg, *next; + struct region_hash *rh = v; + LIST_HEAD(clean); + LIST_HEAD(recovered); + + /* + * Quickly grab the lists. + */ + write_lock_irq(&rh->hash_lock); + spin_lock(&rh->region_lock); + if (!list_empty(&rh->clean_regions)) { + list_splice(&rh->clean_regions, &clean); + INIT_LIST_HEAD(&rh->clean_regions); + + list_for_each_entry(reg, &clean, list) + list_del(®->hash_list); + } + + if (!list_empty(&rh->recovered_regions)) { + list_splice(&rh->recovered_regions, &recovered); + INIT_LIST_HEAD(&rh->recovered_regions); + + list_for_each_entry(reg, &recovered, list) + list_del(®->hash_list); + } + + spin_unlock(&rh->region_lock); + write_unlock_irq(&rh->hash_lock); + + /* + * All the regions on the recovered and clean lists have + * now been pulled out of the system, so no need to do + * any more locking. + */ + list_for_each_entry_safe (reg, next, &recovered, list) { + if (reg->state != RH_ERROR) + rh->log->type->clear_region(rh->log, reg->key); + + rh->log->type->set_region_sync(rh->log, reg->key, + reg->state != RH_ERROR); + up(&rh->recovery_count); + if (reg->delayed_bios.head) + rh->dispatch(rh->dispatch_context, ®->delayed_bios); + + mempool_free(reg, rh->region_pool); + } + + list_for_each_entry_safe(reg, next, &clean, list) { + rh->log->type->clear_region(rh->log, reg->key); + mempool_free(reg, rh->region_pool); + } + + rh_flush(rh); +} + +void rh_inc(void *v, region_t region) +{ + struct region *reg; + struct region_hash *rh = v; + + read_lock(&rh->hash_lock); + reg = __rh_find(rh, region); + if (reg->state == RH_CLEAN) { + rh->log->type->mark_region(rh->log, reg->key); + + spin_lock_irq(&rh->region_lock); + reg->state = RH_DIRTY; + list_del_init(®->list); /* Take off the clean list. */ + spin_unlock_irq(&rh->region_lock); + } + + atomic_inc(®->pending); + read_unlock(&rh->hash_lock); +} + +void rh_inc_pending(void *v, struct bio_list *bios) +{ + struct bio *bio; + struct region_hash *rh = v; + + for (bio = bios->head; bio; bio = bio->bi_next) + rh_inc(rh, rh_bio_to_region(rh, bio)); +} + +void rh_dec(void *v, region_t region) +{ + unsigned long flags; + struct region *reg; + struct region_hash *rh = v; + + read_lock(&rh->hash_lock); + reg = __rh_lookup(rh, region); + read_unlock(&rh->hash_lock); + + BUG_ON(!reg); + + if (atomic_dec_and_test(®->pending)) { + spin_lock_irqsave(&rh->region_lock, flags); + if (reg->state == RH_RECOVERING) { + list_add_tail(®->list, &rh->quiesced_regions); + } else { + reg->state = RH_CLEAN; + list_add(®->list, &rh->clean_regions); + } + spin_unlock_irqrestore(&rh->region_lock, flags); + } +} + +/* + * Starts quiescing a region in preparation for recovery. + */ +static int __rh_recovery_prepare(struct region_hash *rh) +{ + int r; + struct region *reg; + region_t region; + + /* + * Ask the dirty log what's next. + */ + r = rh->log->type->get_resync_work(rh->log, ®ion); + if (r <= 0) + return r; + + /* + * Get this region, and start it quiescing + * by setting the recovering flag. + */ + read_lock(&rh->hash_lock); + reg = __rh_find(rh, region); + read_unlock(&rh->hash_lock); + + spin_lock_irq(&rh->region_lock); + + reg->state = RH_RECOVERING; + + /* Already quiesced ? 
*/ + list_del_init(®->list); + if (!atomic_read(®->pending)) + list_add(®->list, &rh->quiesced_regions); + + spin_unlock_irq(&rh->region_lock); + + return 1; +} + +int rh_recovery_prepare(void *v) +{ + struct region_hash *rh = v; + + if (test_bit(RECOVERY, &rh->flags)) { + while (!down_trylock(&rh->recovery_count)) { + if (__rh_recovery_prepare(rh) <= 0) { + up(&rh->recovery_count); + return -ENOENT; + } + } + } + + return 0; +} + +/* + * Returns any quiesced regions. + */ +void *rh_recovery_start(void *v) +{ + struct region *reg = NULL; + struct region_hash *rh = v; + + spin_lock_irq(&rh->region_lock); + if (!list_empty(&rh->quiesced_regions)) { + reg = list_entry(rh->quiesced_regions.next, + struct region, list); + list_del_init(®->list); /* Remove from the quiesced list. */ + } + spin_unlock_irq(&rh->region_lock); + + return (void*) reg; +} + +/* + * Put region on list of recovered ones. + */ +void rh_recovery_end(void *v, int error) +{ + struct region *reg = v; + struct region_hash *rh = reg->rh; + + if (error) + reg->state = RH_ERROR; + + spin_lock_irq(&rh->region_lock); + list_add(®->list, &rh->recovered_regions); + spin_unlock_irq(&rh->region_lock); +} + +void rh_flush(void *v) +{ + struct region_hash *rh = v; + + rh->log->type->flush(rh->log); +} + +void rh_delay_by_region(void *v, struct bio *bio, region_t region) +{ + struct region_hash *rh = v; + struct region *reg; + + read_lock(&rh->hash_lock); + reg = __rh_find(rh, region); + bio_list_add(®->delayed_bios, bio); + read_unlock(&rh->hash_lock); +} + +void rh_delay(void *v, struct bio *bio) +{ + return rh_delay_by_region(v, bio, rh_bio_to_region(v, bio)); +} + +void rh_stop_recovery(void *v) +{ + int i; + struct region_hash *rh = v; + + clear_bit(RECOVERY, &rh->flags); + rh->wake(rh->wake_context); + + /* wait for any recovering regions */ + for (i = 0; i < rh->max_recovery; i++) + down(&rh->recovery_count); +} + +void rh_start_recovery(void *v) +{ + int i; + struct region_hash *rh = v; + + set_bit(RECOVERY, &rh->flags); + for (i = 0; i < rh->max_recovery; i++) + up(&rh->recovery_count); + + rh->wake(rh->wake_context); +} + +EXPORT_SYMBOL(rh_bio_to_region); +EXPORT_SYMBOL(rh_sector_to_region); +EXPORT_SYMBOL(rh_region_to_sector); +EXPORT_SYMBOL(rh_init); +EXPORT_SYMBOL(rh_exit); +EXPORT_SYMBOL(rh_state); +EXPORT_SYMBOL(rh_update_states); +EXPORT_SYMBOL(rh_flush); +EXPORT_SYMBOL(rh_inc); +EXPORT_SYMBOL(rh_inc_pending); +EXPORT_SYMBOL(rh_dec); +EXPORT_SYMBOL(rh_delay); +EXPORT_SYMBOL(rh_delay_by_region); +EXPORT_SYMBOL(rh_recovery_prepare); +EXPORT_SYMBOL(rh_recovery_start); +EXPORT_SYMBOL(rh_recovery_end); +EXPORT_SYMBOL(rh_stop_recovery); +EXPORT_SYMBOL(rh_start_recovery); +EXPORT_SYMBOL(rh_reg_get_context); +EXPORT_SYMBOL(rh_reg_set_context); +EXPORT_SYMBOL(rh_get_region_key); +EXPORT_SYMBOL(rh_get_region_size); + +MODULE_DESCRIPTION(DM_NAME " region hash"); +MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com>"); +MODULE_LICENSE("GPL");
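
For reviewers, a rough sketch of how a client such as dm-mirror is expected to
drive this interface follows.  It is illustrative only and not part of the
patch: "struct example_ms", its workqueue, MAX_RECOVERY and
example_copy_region() are placeholders standing in for the real mirror_set
plumbing and the kcopyd resync work, and error handling is omitted.

        /* Assumed headers: dm-region_hash.h, dm-bio-list.h, <linux/workqueue.h>. */

        struct example_ms {                     /* stand-in for dm-mirror's mirror_set */
                void *rh;                       /* opaque region hash handle */
                struct workqueue_struct *wq;
                struct work_struct work;
        };

        /* Writes queued against a recovering region come back through here. */
        static void example_dispatch(void *context, struct bio_list *bios)
        {
                struct bio *bio;

                while ((bio = bio_list_pop(bios)))
                        generic_make_request(bio);
        }

        /* Called whenever the hash wants the owner's worker to run again. */
        static void example_wake(void *context)
        {
                struct example_ms *ms = context;

                queue_work(ms->wq, &ms->work);
        }

        static int example_ctr(struct example_ms *ms, struct dm_dirty_log *log,
                               uint32_t region_size, region_t nr_regions)
        {
                return rh_init(&ms->rh, MAX_RECOVERY, example_dispatch, ms,
                               example_wake, ms, log, region_size, nr_regions);
        }

        /* Write path, run from the worker thread. */
        static void example_do_writes(struct example_ms *ms, struct bio_list *writes)
        {
                struct bio *bio;

                rh_inc_pending(ms->rh, writes); /* mark the regions dirty */
                rh_flush(ms->rh);               /* commit dirty log state to disk */

                while ((bio = bio_list_pop(writes)))
                        generic_make_request(bio);
        }

        /* End_io path: drop the per-region reference taken by rh_inc_pending(). */
        static void example_write_endio(struct example_ms *ms, struct bio *bio)
        {
                rh_dec(ms->rh, rh_bio_to_region(ms->rh, bio));
        }

        /* Resynchronization, also run from the worker thread. */
        static void example_do_recovery(struct example_ms *ms)
        {
                void *reg;

                rh_recovery_prepare(ms->rh);    /* quiesce regions needing resync */

                while ((reg = rh_recovery_start(ms->rh))) {
                        /*
                         * Copy rh_get_region_size(ms->rh) sectors starting at
                         * rh_region_to_sector(ms->rh, rh_get_region_key(reg)),
                         * then report the outcome.
                         */
                        int error = example_copy_region(ms, reg);

                        rh_recovery_end(reg, error);
                }

                rh_update_states(ms->rh);       /* retire clean/recovered regions */
        }

The dispatch and wake callbacks passed to rh_init() are what decouple the
region hash from dm-mirror's worker thread, which is what allows other
targets to share this code as the patch description states.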