Experimental version of the snapshot and mirror targets for 2.6.
--- diff/drivers/md/Kconfig	2003-11-26 10:09:05.000000000 +0000
+++ source/drivers/md/Kconfig	2003-11-26 10:18:32.000000000 +0000
@@ -143,5 +143,18 @@
 	  Recent tools use a new version of the ioctl interface, only
           select this option if you intend using such tools.
 
+config DM_SNAPSHOT
+	tristate "Snapshot target (EXPERIMENTAL)"
+	depends on BLK_DEV_DM && EXPERIMENTAL
+	---help---
+	  Allow volume managers to take writeable snapshots of a device.
+
+config DM_MIRROR
+	tristate "Mirror target (EXPERIMENTAL)"
+	depends on BLK_DEV_DM && EXPERIMENTAL
+	---help---
+	  Allow volume managers to mirror logical volumes.  This is
+	  also needed for live data migration tools such as 'pvmove'.
+
 endmenu
 
--- diff/drivers/md/Makefile	2003-02-13 11:46:52.000000000 +0000
+++ source/drivers/md/Makefile	2003-11-26 10:18:32.000000000 +0000
@@ -3,7 +3,9 @@
 #
 
 dm-mod-objs	:= dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
-		   dm-ioctl.o
+		   dm-ioctl.o dm-io.o kcopyd.o dm-daemon.o
+
+dm-mirror-objs	:= dm-log.o dm-raid1.o
 
 # Note: link order is important.  All raid personalities
 # and xor.o must come before md.o, as they each initialise 
@@ -17,3 +19,5 @@
 obj-$(CONFIG_MD_MULTIPATH)	+= multipath.o
 obj-$(CONFIG_BLK_DEV_MD)	+= md.o
 obj-$(CONFIG_BLK_DEV_DM)	+= dm-mod.o
+obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o dm-exception-store.o
+obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o
--- diff/drivers/md/dm-linear.c	2003-09-30 15:46:14.000000000 +0100
+++ source/drivers/md/dm-linear.c	2003-11-26 10:18:32.000000000 +0000
@@ -65,7 +65,8 @@
 	kfree(lc);
 }
 
-static int linear_map(struct dm_target *ti, struct bio *bio)
+static int linear_map(struct dm_target *ti, struct bio *bio,
+		      union map_info *map_context)
 {
 	struct linear_c *lc = (struct linear_c *) ti->private;
 
--- diff/drivers/md/dm-stripe.c	2003-09-30 15:46:14.000000000 +0100
+++ source/drivers/md/dm-stripe.c	2003-11-26 10:18:32.000000000 +0000
@@ -166,7 +166,8 @@
 	kfree(sc);
 }
 
-static int stripe_map(struct dm_target *ti, struct bio *bio)
+static int stripe_map(struct dm_target *ti, struct bio *bio,
+		      union map_info *map_context)
 {
 	struct stripe_c *sc = (struct stripe_c *) ti->private;
 
--- diff/drivers/md/dm-table.c	2003-11-26 10:09:05.000000000 +0000
+++ source/drivers/md/dm-table.c	2003-11-26 10:18:32.000000000 +0000
@@ -149,7 +149,7 @@
 	return 0;
 }
 
-static void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size)
+void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size)
 {
 	unsigned long size;
 	void *addr;
--- diff/drivers/md/dm-target.c	2003-06-30 10:07:21.000000000 +0100
+++ source/drivers/md/dm-target.c	2003-11-26 10:18:32.000000000 +0000
@@ -157,7 +157,8 @@
 	/* empty */
 }
 
-static int io_err_map(struct dm_target *ti, struct bio *bio)
+static int io_err_map(struct dm_target *ti, struct bio *bio,
+		      union map_info *map_context)
 {
 	return -EIO;
 }
--- diff/drivers/md/dm.c	2003-11-26 10:09:05.000000000 +0000
+++ source/drivers/md/dm.c	2003-11-26 10:18:32.000000000 +0000
@@ -6,6 +6,9 @@
 
 #include "dm.h"
 
+/* FIXME: remove this */
+#include "dm-log.h"
+
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
@@ -20,6 +23,9 @@
 static unsigned int major = 0;
 static unsigned int _major = 0;
 
+/*
+ * One of these is allocated per bio.
+ */
 struct dm_io {
 	struct mapped_device *md;
 	int error;
@@ -27,6 +33,16 @@
 	atomic_t io_count;
 };
 
+/*
+ * One of these is allocated per target within a bio.  Hopefully
+ * this will be simplified out one day.
+ */
+struct target_io {
+	struct dm_io *io;
+	struct dm_target *ti;
+	union map_info info;
+};
+
 struct deferred_io {
 	struct bio *bio;
 	struct deferred_io *next;
@@ -63,6 +79,7 @@
 	 * io objects are allocated from here.
 	 */
 	mempool_t *io_pool;
+	mempool_t *tio_pool;
 
 	/*
 	 * Event handling.
@@ -73,6 +90,7 @@
 
 #define MIN_IOS 256
 static kmem_cache_t *_io_cache;
+static kmem_cache_t *_tio_cache;
 
 static __init int local_init(void)
 {
@@ -84,9 +102,18 @@
 	if (!_io_cache)
 		return -ENOMEM;
 
+	/* allocate a slab for the target ios */
+	_tio_cache = kmem_cache_create("dm tio", sizeof(struct target_io),
+				       0, 0, NULL, NULL);
+	if (!_tio_cache) {
+		kmem_cache_destroy(_io_cache);
+		return -ENOMEM;
+	}
+
 	_major = major;
 	r = register_blkdev(_major, _name);
 	if (r < 0) {
+		kmem_cache_destroy(_tio_cache);
 		kmem_cache_destroy(_io_cache);
 		return r;
 	}
@@ -99,6 +126,7 @@
 
 static void local_exit(void)
 {
+	kmem_cache_destroy(_tio_cache);
 	kmem_cache_destroy(_io_cache);
 
 	if (unregister_blkdev(_major, _name) < 0)
@@ -124,6 +152,7 @@
 	xx(dm_target)
 	xx(dm_linear)
 	xx(dm_stripe)
+	xx(kcopyd)
 	xx(dm_interface)
 #undef xx
 };
@@ -184,6 +213,16 @@
 	mempool_free(io, md->io_pool);
 }
 
+static inline struct target_io *alloc_tio(struct mapped_device *md)
+{
+	return mempool_alloc(md->tio_pool, GFP_NOIO);
+}
+
+static inline void free_tio(struct mapped_device *md, struct target_io *tio)
+{
+	mempool_free(tio, md->tio_pool);
+}
+
 static inline struct deferred_io *alloc_deferred(void)
 {
 	return kmalloc(sizeof(struct deferred_io), GFP_NOIO);
@@ -229,15 +268,6 @@
  *   interests of getting something for people to use I give
  *   you this clearly demarcated crap.
  *---------------------------------------------------------------*/
-static inline sector_t to_sector(unsigned int bytes)
-{
-	return bytes >> SECTOR_SHIFT;
-}
-
-static inline unsigned int to_bytes(sector_t sector)
-{
-	return sector << SECTOR_SHIFT;
-}
 
 /*
  * Decrements the number of outstanding ios that a bio has been
@@ -266,17 +296,30 @@
 
 static int clone_endio(struct bio *bio, unsigned int done, int error)
 {
-	struct dm_io *io = bio->bi_private;
+	int r = 0;
+	struct target_io *tio = bio->bi_private;
+	struct dm_io *io = tio->io;
+	dm_endio_fn endio = tio->ti->type->end_io;
 
 	if (bio->bi_size)
 		return 1;
 
+	if (endio) {
+		r = endio(tio->ti, bio, error, &tio->info);
+		if (r < 0)
+			error = r;
+
+		else if (r > 0)
+			/* the target wants another shot at the io */
+			return 1; /* FIXME: do we need to reset bio at all ? */
+	}
+
+	free_tio(io->md, tio);
 	dec_pending(io, error);
 	bio_put(bio);
-	return 0;
+	return r;
 }
 
-
 static sector_t max_io_len(struct mapped_device *md,
 			   sector_t sector, struct dm_target *ti)
 {
@@ -297,7 +340,8 @@
 	return len;
 }
 
-static void __map_bio(struct dm_target *ti, struct bio *clone, struct dm_io *io)
+static void __map_bio(struct dm_target *ti, struct bio *clone,
+		      struct target_io *tio)
 {
 	int r;
 
@@ -307,22 +351,24 @@
 	BUG_ON(!clone->bi_size);
 
 	clone->bi_end_io = clone_endio;
-	clone->bi_private = io;
+	clone->bi_private = tio;
 
 	/*
 	 * Map the clone.  If r == 0 we don't need to do
 	 * anything, the target has assumed ownership of
 	 * this io.
 	 */
-	atomic_inc(&io->io_count);
-	r = ti->type->map(ti, clone);
+	atomic_inc(&tio->io->io_count);
+	r = ti->type->map(ti, clone, &tio->info);
 	if (r > 0)
 		/* the bio has been remapped so dispatch it */
 		generic_make_request(clone);
 
-	else if (r < 0)
+	else if (r < 0) {
 		/* error the io and bail out */
-		dec_pending(io, -EIO);
+		free_tio(tio->io->md, tio);
+		dec_pending(tio->io, -EIO);
+	}
 }
 
 struct clone_info {
@@ -381,6 +427,15 @@
 	struct bio *clone, *bio = ci->bio;
 	struct dm_target *ti = dm_table_find_target(ci->md->map, ci->sector);
 	sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti);
+	struct target_io *tio;
+
+	/*
+	 * Allocate a target io object.
+	 */
+	tio = alloc_tio(ci->md);
+	tio->io = ci->io;
+	tio->ti = ti;
+	memset(&tio->info, 0, sizeof(tio->info));
 
 	if (ci->sector_count <= max) {
 		/*
@@ -389,7 +444,7 @@
 		 */
 		clone = clone_bio(bio, ci->sector, ci->idx,
 				  bio->bi_vcnt - ci->idx, ci->sector_count);
-		__map_bio(ti, clone, ci->io);
+		__map_bio(ti, clone, tio);
 		ci->sector_count = 0;
 
 	} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
@@ -412,7 +467,7 @@
 		}
 
 		clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len);
-		__map_bio(ti, clone, ci->io);
+		__map_bio(ti, clone, tio);
 
 		ci->sector += len;
 		ci->sector_count -= len;
@@ -427,7 +482,7 @@
 
 		clone = split_bvec(bio, ci->sector, ci->idx,
 				   bv->bv_offset, max);
-		__map_bio(ti, clone, ci->io);
+		__map_bio(ti, clone, tio);
 
 		ci->sector += max;
 		ci->sector_count -= max;
@@ -436,7 +491,7 @@
 		len = to_sector(bv->bv_len) - max;
 		clone = split_bvec(bio, ci->sector, ci->idx,
 				   bv->bv_offset + to_bytes(max), len);
-		__map_bio(ti, clone, ci->io);
+		__map_bio(ti, clone, tio);
 
 		ci->sector += len;
 		ci->sector_count -= len;
@@ -588,41 +643,33 @@
 
 	/* get a minor number for the dev */
 	r = persistent ? specific_minor(minor) : next_free_minor(&minor);
-	if (r < 0) {
-		kfree(md);
-		return NULL;
-	}
+	if (r < 0)
+		goto bad1;
 
 	memset(md, 0, sizeof(*md));
 	init_rwsem(&md->lock);
 	atomic_set(&md->holders, 1);
 
 	md->queue = blk_alloc_queue(GFP_KERNEL);
-	if (!md->queue) {
-		kfree(md);
-		return NULL;
-	}
+	if (!md->queue)
+		goto bad1;
 
 	md->queue->queuedata = md;
 	blk_queue_make_request(md->queue, dm_request);
 
 	md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
 				     mempool_free_slab, _io_cache);
-	if (!md->io_pool) {
-		free_minor(minor);
-		blk_put_queue(md->queue);
-		kfree(md);
-		return NULL;
-	}
+	if (!md->io_pool)
+		goto bad2;
+
+	md->tio_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
+				      mempool_free_slab, _tio_cache);
+	if (!md->tio_pool)
+		goto bad3;
 
 	md->disk = alloc_disk(1);
-	if (!md->disk) {
-		mempool_destroy(md->io_pool);
-		free_minor(minor);
-		blk_put_queue(md->queue);
-		kfree(md);
-		return NULL;
-	}
+	if (!md->disk)
+		goto bad4;
 
 	md->disk->major = _major;
 	md->disk->first_minor = minor;
@@ -637,11 +684,22 @@
 	init_waitqueue_head(&md->eventq);
 
 	return md;
+
+ bad4:
+	mempool_destroy(md->tio_pool);
+ bad3:
+	mempool_destroy(md->io_pool);
+ bad2:
+	free_minor(minor);
+ bad1:
+	kfree(md);
+	return NULL;
 }
 
 static void free_dev(struct mapped_device *md)
 {
 	free_minor(md->disk->first_minor);
+	mempool_destroy(md->tio_pool);
 	mempool_destroy(md->io_pool);
 	del_gendisk(md->disk);
 	put_disk(md->disk);
--- diff/drivers/md/dm.h	2003-11-26 10:09:05.000000000 +0000
+++ source/drivers/md/dm.h	2003-11-26 10:18:32.000000000 +0000
@@ -151,6 +151,16 @@
 	return dm_round_up(n, size) / size;
 }
 
+static inline sector_t to_sector(unsigned long n)
+{
+	return (n >> SECTOR_SHIFT);
+}
+
+static inline unsigned long to_bytes(sector_t n)
+{
+	return (n << SECTOR_SHIFT);
+}
+
 /*
  * The device-mapper can be driven through one of two interfaces;
  * ioctl or filesystem, depending which patch you have applied.
@@ -167,4 +177,9 @@
 int dm_stripe_init(void);
 void dm_stripe_exit(void);
 
+int kcopyd_init(void);
+void kcopyd_exit(void);
+
+void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size);
+
 #endif
--- diff/include/linux/device-mapper.h	2003-06-30 10:07:24.000000000 +0100
+++ source/include/linux/device-mapper.h	2003-11-26 10:18:32.000000000 +0000
@@ -13,6 +13,11 @@
 
 typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
 
+union map_info {
+	void *ptr;
+	unsigned long long ll;
+};
+
 /*
  * In the constructor the target parameter will already have the
  * table, type, begin and len fields filled in.
@@ -32,7 +37,19 @@
  * = 0: The target will handle the io by resubmitting it later
  * > 0: simple remap complete
  */
-typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio);
+typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio,
+			  union map_info *map_context);
+
+/*
+ * Returns:
+ * < 0 : error (currently ignored)
+ * 0   : ended successfully
+ * 1   : for some reason the io has still not completed (eg,
+ *       multipath target might want to requeue a failed io).
+ */
+typedef int (*dm_endio_fn) (struct dm_target *ti,
+			    struct bio *bio, int error,
+			    union map_info *map_context);
 
 typedef void (*dm_suspend_fn) (struct dm_target *ti);
 typedef void (*dm_resume_fn) (struct dm_target *ti);
@@ -60,6 +77,7 @@
 	dm_ctr_fn ctr;
 	dm_dtr_fn dtr;
 	dm_map_fn map;
+	dm_endio_fn end_io;
 	dm_suspend_fn suspend;
 	dm_resume_fn resume;
 	dm_status_fn status;
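
Not part of the patch: a minimal sketch of a target using the new
three-argument map hook and the optional end_io hook.  The example_*
names are made up, and the ctr/dtr pair plus the dm_register_target()
call are assumed to follow the existing dm-linear conventions.

	/* assumes #include "dm.h" and <linux/module.h> as in dm-linear.c */

	static int example_map(struct dm_target *ti, struct bio *bio,
			       union map_info *map_context)
	{
		struct dm_dev *dev = (struct dm_dev *) ti->private;

		/* stash per-io context for the end_io hook */
		map_context->ll = jiffies;

		bio->bi_bdev = dev->bdev;
		return 1;	/* simple remap complete, dm dispatches it */
	}

	static int example_end_io(struct dm_target *ti, struct bio *bio,
				  int error, union map_info *map_context)
	{
		if (error)
			DMWARN("io issued at jiffy %llu failed",
			       map_context->ll);

		return 0;	/* done; returning 1 asks for another shot */
	}

	static struct target_type example_target = {
		.name    = "example",
		.module  = THIS_MODULE,
		.map     = example_map,
		.end_io  = example_end_io,
		/* .ctr and .dtr as in dm-linear */
	};
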
--- diff/mm/mempool.c	2003-01-13 14:18:16.000000000 +0000
+++ source/mm/mempool.c	2003-11-26 10:18:32.000000000 +0000
@@ -89,11 +89,6 @@
 }
 EXPORT_SYMBOL(mempool_create);
 
-/*
- * mempool_resize is disabled for now, because it has no callers.  Feel free
- * to turn it back on if needed.
- */
-#if 0
 /**
  * mempool_resize - resize an existing memory pool
  * @pool:       pointer to the memory pool which was allocated via
@@ -163,7 +158,6 @@
 	return 0;
 }
 EXPORT_SYMBOL(mempool_resize);
-#endif
 
 /**
  * mempool_destroy - deallocate a memory pool
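
The mempool_resize() revival is needed because dm-io sizes a global
pool of io structs to however many pages its clients expect to have in
flight: every dm_io_get()/dm_io_put() call resizes the pool.  A sketch
of the call pattern (not in the patch; names made up):

	static mempool_t *_pool;

	int pool_setup(kmem_cache_t *cache)
	{
		_pool = mempool_create(256, mempool_alloc_slab,
				       mempool_free_slab, cache);
		return _pool ? 0 : -ENOMEM;
	}

	int pool_grow(void)
	{
		/* the call this hunk re-enables */
		return mempool_resize(_pool, 512, GFP_KERNEL);
	}
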
--- diff/drivers/md/dm-daemon.c	1970-01-01 01:00:00.000000000 +0100
+++ source/drivers/md/dm-daemon.c	2003-11-26 10:18:32.000000000 +0000
@@ -0,0 +1,110 @@
+/*
+ * Copyright (C) 2003 Sistina Software
+ *
+ * This file is released under the LGPL.
+ */
+
+#include "dm.h"
+#include "dm-daemon.h"
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/suspend.h>
+
+static int daemon(void *arg)
+{
+	struct dm_daemon *dd = (struct dm_daemon *) arg;
+	DECLARE_WAITQUEUE(wq, current);
+
+	daemonize("%s", dd->name);
+
+	atomic_set(&dd->please_die, 0);
+
+	add_wait_queue(&dd->job_queue, &wq);
+
+	down(&dd->run_lock);
+	up(&dd->start_lock);
+
+	/*
+	 * dd->fn() could do anything, very likely it will
+	 * suspend.  So we can't set the state to
+	 * TASK_INTERRUPTIBLE before calling it.  In order to
+	 * prevent a race with a waking thread we do this little
+	 * dance with the dd->woken variable.
+	 */
+	while (1) {
+		if (atomic_read(&dd->please_die))
+			goto out;
+
+#if 0
+		/* FIXME: not convinced by this */
+		if (current->flags & PF_FREEZE)
+			refrigerator(PF_IOTHREAD);
+#endif
+
+		do {
+			set_current_state(TASK_RUNNING);
+			atomic_set(&dd->woken, 0);
+			dd->fn();
+			set_current_state(TASK_INTERRUPTIBLE);
+
+		} while (atomic_read(&dd->woken));
+
+		schedule();
+	}
+
+ out:
+	remove_wait_queue(&dd->job_queue, &wq);
+	up(&dd->run_lock);
+	return 0;
+}
+
+int dm_daemon_start(struct dm_daemon *dd, const char *name, void (*fn)(void))
+{
+	pid_t pid = 0;
+
+	/*
+	 * Initialise the dm_daemon.
+	 */
+	dd->fn = fn;
+	snprintf(dd->name, sizeof(dd->name), "%s", name);
+	sema_init(&dd->start_lock, 1);
+	sema_init(&dd->run_lock, 1);
+	init_waitqueue_head(&dd->job_queue);
+
+	/*
+	 * Start the new thread.
+	 */
+	down(&dd->start_lock);
+	pid = kernel_thread(daemon, dd, CLONE_KERNEL);
+	if (pid <= 0) {
+		DMERR("Failed to start %s thread", name);
+		return -EAGAIN;
+	}
+
+	/*
+	 * Wait for the daemon to up this mutex.
+	 */
+	down(&dd->start_lock);
+	up(&dd->start_lock);
+
+	return 0;
+}
+
+void dm_daemon_stop(struct dm_daemon *dd)
+{
+	atomic_set(&dd->please_die, 1);
+	dm_daemon_wake(dd);
+	down(&dd->run_lock);
+	up(&dd->run_lock);
+}
+
+void dm_daemon_wake(struct dm_daemon *dd)
+{
+	atomic_set(&dd->woken, 1);
+	wake_up_interruptible(&dd->job_queue);
+}
+
+EXPORT_SYMBOL(dm_daemon_start);
+EXPORT_SYMBOL(dm_daemon_stop);
+EXPORT_SYMBOL(dm_daemon_wake);
--- diff/drivers/md/dm-daemon.h	1970-01-01 01:00:00.000000000 +0100
+++ source/drivers/md/dm-daemon.h	2003-11-26 10:18:32.000000000 +0000
@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) 2003 Sistina Software
+ *
+ * This file is released under the LGPL.
+ */
+
+#ifndef DM_DAEMON_H
+#define DM_DAEMON_H
+
+#include <asm/atomic.h>
+#include <asm/semaphore.h>
+
+struct dm_daemon {
+	void (*fn)(void);
+	char name[16];
+	atomic_t please_die;
+	struct semaphore start_lock;
+	struct semaphore run_lock;
+
+	atomic_t woken;
+	wait_queue_head_t job_queue;
+};
+
+int dm_daemon_start(struct dm_daemon *dd, const char *name, void (*fn)(void));
+void dm_daemon_stop(struct dm_daemon *dd);
+void dm_daemon_wake(struct dm_daemon *dd);
+int dm_daemon_running(struct dm_daemon *dd);
+
+#endif
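
A usage sketch for the daemon helper (not in the patch; dm-raid1.c
below is the real client with its kmirrord).  Because multiple wakeups
before the thread runs collapse into a single run, the worker function
must drain all outstanding work each time it is called:

	static struct dm_daemon _worker;

	static void do_work(void)
	{
		/* runs in TASK_RUNNING state and may block; must
		 * process every queued job before returning */
	}

	static int __init worker_start(void)
	{
		return dm_daemon_start(&_worker, "exampled", do_work);
	}

	/* producers queue a job and then call dm_daemon_wake(&_worker);
	 * teardown is dm_daemon_stop(&_worker) */
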
--- diff/drivers/md/dm-exception-store.c	1970-01-01 01:00:00.000000000 +0100
+++ source/drivers/md/dm-exception-store.c	2003-11-26 10:18:32.000000000 +0000
@@ -0,0 +1,675 @@
+/*
+ * dm-snapshot.c
+ *
+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+#include "dm-snapshot.h"
+#include "dm-io.h"
+#include "kcopyd.h"
+
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+/*-----------------------------------------------------------------
+ * Persistent snapshots, by persistent we mean that the snapshot
+ * will survive a reboot.
+ *---------------------------------------------------------------*/
+
+/*
+ * We need to store a record of which parts of the origin have
+ * been copied to the snapshot device.  The snapshot code
+ * requires that we copy exception chunks to chunk aligned areas
+ * of the COW store.  It makes sense therefore, to store the
+ * metadata in chunk size blocks.
+ *
+ * There is no backward or forward compatibility implemented,
+ * snapshots with different disk versions than the kernel will
+ * not be usable.  It is expected that "lvcreate" will blank out
+ * the start of a fresh COW device before calling the snapshot
+ * constructor.
+ *
+ * The first chunk of the COW device just contains the header.
+ * After this there is a chunk filled with exception metadata,
+ * followed by as many exception chunks as can fit in the
+ * metadata areas.
+ *
+ * All on disk structures are in little-endian format.  The end
+ * of the exceptions info is indicated by an exception with a
+ * new_chunk of 0, which is invalid since it would point to the
+ * header chunk.
+ */
+
+/*
+ * Magic for persistent snapshots: "SnAp" - feeble, isn't it?
+ */
+#define SNAP_MAGIC 0x70416e53
+
+/*
+ * The on-disk version of the metadata.
+ */
+#define SNAPSHOT_DISK_VERSION 1
+
+struct disk_header {
+	uint32_t magic;
+
+	/*
+	 * Is this snapshot valid?  There is no way of recovering
+	 * an invalid snapshot.
+	 */
+	uint32_t valid;
+
+	/*
+	 * Simple, incrementing version.  No backward
+	 * compatibility.
+	 */
+	uint32_t version;
+
+	/* In sectors */
+	uint32_t chunk_size;
+};
+
+struct disk_exception {
+	uint64_t old_chunk;
+	uint64_t new_chunk;
+};
+
+struct commit_callback {
+	void (*callback)(void *, int success);
+	void *context;
+};
+
+/*
+ * The top level structure for a persistent exception store.
+ */
+struct pstore {
+	struct dm_snapshot *snap;	/* up pointer to my snapshot */
+	int version;
+	int valid;
+	uint32_t chunk_size;
+	uint32_t exceptions_per_area;
+
+	/*
+	 * Now that we have an asynchronous kcopyd there is no
+	 * need for large chunk sizes, so it won't hurt to have a
+	 * whole chunk's worth of metadata in memory at once.
+	 */
+	void *area;
+
+	/*
+	 * Used to keep track of which metadata area the data in
+	 * 'chunk' refers to.
+	 */
+	uint32_t current_area;
+
+	/*
+	 * The next free chunk for an exception.
+	 */
+	uint32_t next_free;
+
+	/*
+	 * The index of next free exception in the current
+	 * metadata area.
+	 */
+	uint32_t current_committed;
+
+	atomic_t pending_count;
+	uint32_t callback_count;
+	struct commit_callback *callbacks;
+};
+
+static inline unsigned int sectors_to_pages(unsigned int sectors)
+{
+	return sectors / (PAGE_SIZE >> 9);
+}
+
+static int alloc_area(struct pstore *ps)
+{
+	int r = -ENOMEM;
+	size_t i, len, nr_pages;
+	struct page *page, *last = NULL;
+
+	len = ps->chunk_size << SECTOR_SHIFT;
+
+	/*
+	 * Allocate the chunk_size block of memory that will hold
+	 * a single metadata area.
+	 */
+	ps->area = vmalloc(len);
+	if (!ps->area)
+		return r;
+
+	nr_pages = sectors_to_pages(ps->chunk_size);
+
+	/*
+	 * We lock the pages for ps->area into memory since
+	 * they'll be doing a lot of io.  We also chain them
+	 * together ready for dm-io.
+	 */
+	for (i = 0; i < nr_pages; i++) {
+		page = vmalloc_to_page(ps->area + (i * PAGE_SIZE));
+		SetPageLocked(page);
+		if (last)
+			last->list.next = &page->list;
+		last = page;
+	}
+
+	return 0;
+}
+
+static void free_area(struct pstore *ps)
+{
+	size_t i, nr_pages;
+	struct page *page;
+
+	nr_pages = sectors_to_pages(ps->chunk_size);
+	for (i = 0; i < nr_pages; i++) {
+		page = vmalloc_to_page(ps->area + (i * PAGE_SIZE));
+		page->list.next = NULL;
+		ClearPageLocked(page);
+	}
+
+	vfree(ps->area);
+}
+
+/*
+ * Read or write a chunk aligned and sized block of data from a device.
+ */
+static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
+{
+	struct io_region where;
+	unsigned long bits;
+
+	where.bdev = ps->snap->cow->bdev;
+	where.sector = ps->chunk_size * chunk;
+	where.count = ps->chunk_size;
+
+	return dm_io_sync(1, &where, rw, vmalloc_to_page(ps->area), 0, &bits);
+}
+
+/*
+ * Read or write a metadata area.  Remembering to skip the first
+ * chunk which holds the header.
+ */
+static int area_io(struct pstore *ps, uint32_t area, int rw)
+{
+	int r;
+	uint32_t chunk;
+
+	/* convert a metadata area index to a chunk index */
+	chunk = 1 + ((ps->exceptions_per_area + 1) * area);
+
+	r = chunk_io(ps, chunk, rw);
+	if (r)
+		return r;
+
+	ps->current_area = area;
+	return 0;
+}
+
+static int zero_area(struct pstore *ps, uint32_t area)
+{
+	memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT);
+	return area_io(ps, area, WRITE);
+}
+
+static int read_header(struct pstore *ps, int *new_snapshot)
+{
+	int r;
+	struct disk_header *dh;
+
+	r = chunk_io(ps, 0, READ);
+	if (r)
+		return r;
+
+	dh = (struct disk_header *) ps->area;
+
+	if (le32_to_cpu(dh->magic) == 0) {
+		*new_snapshot = 1;
+
+	} else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) {
+		*new_snapshot = 0;
+		ps->valid = le32_to_cpu(dh->valid);
+		ps->version = le32_to_cpu(dh->version);
+		ps->chunk_size = le32_to_cpu(dh->chunk_size);
+
+	} else {
+		DMWARN("Invalid/corrupt snapshot");
+		r = -ENXIO;
+	}
+
+	return r;
+}
+
+static int write_header(struct pstore *ps)
+{
+	struct disk_header *dh;
+
+	memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT);
+
+	dh = (struct disk_header *) ps->area;
+	dh->magic = cpu_to_le32(SNAP_MAGIC);
+	dh->valid = cpu_to_le32(ps->valid);
+	dh->version = cpu_to_le32(ps->version);
+	dh->chunk_size = cpu_to_le32(ps->chunk_size);
+
+	return chunk_io(ps, 0, WRITE);
+}
+
+/*
+ * Access functions for the disk exceptions, these do the endian conversions.
+ */
+static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
+{
+	if (index >= ps->exceptions_per_area)
+		return NULL;
+
+	return ((struct disk_exception *) ps->area) + index;
+}
+
+static int read_exception(struct pstore *ps,
+			  uint32_t index, struct disk_exception *result)
+{
+	struct disk_exception *e;
+
+	e = get_exception(ps, index);
+	if (!e)
+		return -EINVAL;
+
+	/* copy it */
+	result->old_chunk = le64_to_cpu(e->old_chunk);
+	result->new_chunk = le64_to_cpu(e->new_chunk);
+
+	return 0;
+}
+
+static int write_exception(struct pstore *ps,
+			   uint32_t index, struct disk_exception *de)
+{
+	struct disk_exception *e;
+
+	e = get_exception(ps, index);
+	if (!e)
+		return -EINVAL;
+
+	/* copy it */
+	e->old_chunk = cpu_to_le64(de->old_chunk);
+	e->new_chunk = cpu_to_le64(de->new_chunk);
+
+	return 0;
+}
+
+/*
+ * Registers the exceptions that are present in the current area.
+ * 'full' is filled in to indicate if the area has been
+ * filled.
+ */
+static int insert_exceptions(struct pstore *ps, int *full)
+{
+	int r;
+	unsigned int i;
+	struct disk_exception de;
+
+	/* presume the area is full */
+	*full = 1;
+
+	for (i = 0; i < ps->exceptions_per_area; i++) {
+		r = read_exception(ps, i, &de);
+
+		if (r)
+			return r;
+
+		/*
+		 * If the new_chunk is pointing at the start of
+		 * the COW device, where the first metadata area
+		 * is we know that we've hit the end of the
+		 * exceptions.  Therefore the area is not full.
+		 */
+		if (de.new_chunk == 0LL) {
+			ps->current_committed = i;
+			*full = 0;
+			break;
+		}
+
+		/*
+		 * Keep track of the start of the free chunks.
+		 */
+		if (ps->next_free <= de.new_chunk)
+			ps->next_free = de.new_chunk + 1;
+
+		/*
+		 * Otherwise we add the exception to the snapshot.
+		 */
+		r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+
+static int read_exceptions(struct pstore *ps)
+{
+	uint32_t area;
+	int r, full = 1;
+
+	/*
+	 * Keep reading chunks and inserting exceptions until
+	 * we find a partially full area.
+	 */
+	for (area = 0; full; area++) {
+		r = area_io(ps, area, READ);
+		if (r)
+			return r;
+
+		r = insert_exceptions(ps, &full);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+
+static inline struct pstore *get_info(struct exception_store *store)
+{
+	return (struct pstore *) store->context;
+}
+
+static void persistent_fraction_full(struct exception_store *store,
+				     sector_t *numerator, sector_t *denominator)
+{
+	*numerator = get_info(store)->next_free * store->snap->chunk_size;
+	*denominator = get_dev_size(store->snap->cow->bdev);
+}
+
+static void persistent_destroy(struct exception_store *store)
+{
+	struct pstore *ps = get_info(store);
+
+	dm_io_put(sectors_to_pages(ps->chunk_size));
+	vfree(ps->callbacks);
+	free_area(ps);
+	kfree(ps);
+}
+
+static int persistent_read_metadata(struct exception_store *store)
+{
+	int r, new_snapshot;
+	struct pstore *ps = get_info(store);
+
+	/*
+	 * Read the snapshot header.
+	 */
+	r = read_header(ps, &new_snapshot);
+	if (r)
+		return r;
+
+	/*
+	 * Do we need to setup a new snapshot ?
+	 */
+	if (new_snapshot) {
+		r = write_header(ps);
+		if (r) {
+			DMWARN("write_header failed");
+			return r;
+		}
+
+		r = zero_area(ps, 0);
+		if (r) {
+			DMWARN("zero_area(0) failed");
+			return r;
+		}
+
+	} else {
+		/*
+		 * Sanity checks.
+		 */
+		if (!ps->valid) {
+			DMWARN("snapshot is marked invalid");
+			return -EINVAL;
+		}
+
+		if (ps->version != SNAPSHOT_DISK_VERSION) {
+			DMWARN("unable to handle snapshot disk version %d",
+			       ps->version);
+			return -EINVAL;
+		}
+
+		/*
+		 * Read the metadata.
+		 */
+		r = read_exceptions(ps);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+
+static int persistent_prepare(struct exception_store *store,
+			      struct exception *e)
+{
+	struct pstore *ps = get_info(store);
+	uint32_t stride;
+	sector_t size = get_dev_size(store->snap->cow->bdev);
+
+	/* Is there enough room ? */
+	if (size < ((ps->next_free + 1) * store->snap->chunk_size))
+		return -ENOSPC;
+
+	e->new_chunk = ps->next_free;
+
+	/*
+	 * Move onto the next free pending, making sure to take
+	 * into account the location of the metadata chunks.
+	 */
+	stride = (ps->exceptions_per_area + 1);
+	if ((++ps->next_free % stride) == 1)
+		ps->next_free++;
+
+	atomic_inc(&ps->pending_count);
+	return 0;
+}
+
+static void persistent_commit(struct exception_store *store,
+			      struct exception *e,
+			      void (*callback) (void *, int success),
+			      void *callback_context)
+{
+	int r;
+	unsigned int i;
+	struct pstore *ps = get_info(store);
+	struct disk_exception de;
+	struct commit_callback *cb;
+
+	de.old_chunk = e->old_chunk;
+	de.new_chunk = e->new_chunk;
+	write_exception(ps, ps->current_committed++, &de);
+
+	/*
+	 * Add the callback to the back of the array.  This code
+	 * is the only place where the callback array is
+	 * manipulated, and we know that it will never be called
+	 * multiple times concurrently.
+	 */
+	cb = ps->callbacks + ps->callback_count++;
+	cb->callback = callback;
+	cb->context = callback_context;
+
+	/*
+	 * If there are no more exceptions in flight, or we have
+	 * filled this metadata area we commit the exceptions to
+	 * disk.
+	 */
+	if (atomic_dec_and_test(&ps->pending_count) ||
+	    (ps->current_committed == ps->exceptions_per_area)) {
+		r = area_io(ps, ps->current_area, WRITE);
+		if (r)
+			ps->valid = 0;
+
+		for (i = 0; i < ps->callback_count; i++) {
+			cb = ps->callbacks + i;
+			cb->callback(cb->context, r == 0 ? 1 : 0);
+		}
+
+		ps->callback_count = 0;
+	}
+
+	/*
+	 * Have we completely filled the current area ?
+	 */
+	if (ps->current_committed == ps->exceptions_per_area) {
+		ps->current_committed = 0;
+		r = zero_area(ps, ps->current_area + 1);
+		if (r)
+			ps->valid = 0;
+	}
+}
+
+static void persistent_drop(struct exception_store *store)
+{
+	struct pstore *ps = get_info(store);
+
+	ps->valid = 0;
+	if (write_header(ps))
+		DMWARN("write header failed");
+}
+
+int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
+{
+	int r;
+	struct pstore *ps;
+
+	r = dm_io_get(sectors_to_pages(chunk_size));
+	if (r)
+		return r;
+
+	/* allocate the pstore */
+	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
+	if (!ps) {
+		r = -ENOMEM;
+		goto bad;
+	}
+
+	ps->snap = store->snap;
+	ps->valid = 1;
+	ps->version = SNAPSHOT_DISK_VERSION;
+	ps->chunk_size = chunk_size;
+	ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) /
+	    sizeof(struct disk_exception);
+	ps->next_free = 2;	/* skipping the header and first area */
+	ps->current_committed = 0;
+	ps->callbacks = NULL;	/* so the error path below can test it safely */
+
+	r = alloc_area(ps);
+	if (r)
+		goto bad;
+
+	/*
+	 * Allocate space for all the callbacks.
+	 */
+	ps->callback_count = 0;
+	atomic_set(&ps->pending_count, 0);
+	ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
+				   sizeof(*ps->callbacks));
+
+	if (!ps->callbacks) {
+		r = -ENOMEM;
+		goto bad;
+	}
+
+	store->destroy = persistent_destroy;
+	store->read_metadata = persistent_read_metadata;
+	store->prepare_exception = persistent_prepare;
+	store->commit_exception = persistent_commit;
+	store->drop_snapshot = persistent_drop;
+	store->fraction_full = persistent_fraction_full;
+	store->context = ps;
+
+	return 0;
+
+ bad:
+	dm_io_put(sectors_to_pages(chunk_size));
+	if (ps) {
+		if (ps->callbacks)
+			vfree(ps->callbacks);
+
+		kfree(ps);
+	}
+	return r;
+}
+
+/*-----------------------------------------------------------------
+ * Implementation of the store for non-persistent snapshots.
+ *---------------------------------------------------------------*/
+struct transient_c {
+	sector_t next_free;
+};
+
+void transient_destroy(struct exception_store *store)
+{
+	kfree(store->context);
+}
+
+int transient_read_metadata(struct exception_store *store)
+{
+	return 0;
+}
+
+int transient_prepare(struct exception_store *store, struct exception *e)
+{
+	struct transient_c *tc = (struct transient_c *) store->context;
+	sector_t size = get_dev_size(store->snap->cow->bdev);
+
+	if (size < (tc->next_free + store->snap->chunk_size))
+		return -1;
+
+	e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
+	tc->next_free += store->snap->chunk_size;
+
+	return 0;
+}
+
+void transient_commit(struct exception_store *store,
+		      struct exception *e,
+		      void (*callback) (void *, int success),
+		      void *callback_context)
+{
+	/* Just succeed */
+	callback(callback_context, 1);
+}
+
+static void transient_fraction_full(struct exception_store *store,
+				    sector_t *numerator, sector_t *denominator)
+{
+	*numerator = ((struct transient_c *) store->context)->next_free;
+	*denominator = get_dev_size(store->snap->cow->bdev);
+}
+
+int dm_create_transient(struct exception_store *store,
+			struct dm_snapshot *s, int blocksize)
+{
+	struct transient_c *tc;
+
+	memset(store, 0, sizeof(*store));
+	store->destroy = transient_destroy;
+	store->read_metadata = transient_read_metadata;
+	store->prepare_exception = transient_prepare;
+	store->commit_exception = transient_commit;
+	store->fraction_full = transient_fraction_full;
+	store->snap = s;
+
+	tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
+	if (!tc)
+		return -ENOMEM;
+
+	tc->next_free = 0;
+	store->context = tc;
+
+	return 0;
+}
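
A worked example of the COW layout arithmetic above (not in the patch):
with a 16-sector (8K) chunk and the 16-byte struct disk_exception,
exceptions_per_area = 8192 / 16 = 512, so the stride between metadata
chunks is 513 and the store lays out as:

	chunk 0			header
	chunk 1			metadata for area 0
	chunks 2..513		exception data for area 0
	chunk 514		metadata for area 1 (= 1 + (512 + 1) * 1)

which is why ps->next_free starts at 2 and persistent_prepare() bumps
next_free a second time whenever (next_free % 513) == 1.
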
--- diff/drivers/md/dm-io.c	1970-01-01 01:00:00.000000000 +0100
+++ source/drivers/md/dm-io.c	2003-11-26 10:18:32.000000000 +0000
@@ -0,0 +1,582 @@
+/*
+ * Copyright (C) 2003 Sistina Software
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-io.h"
+
+#include <linux/bio.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#define BIO_POOL_SIZE 256
+
+
+/*-----------------------------------------------------------------
+ * Bio set, move this to bio.c
+ *---------------------------------------------------------------*/
+#define BV_NAME_SIZE 16
+struct biovec_pool {
+	int nr_vecs;
+	char name[BV_NAME_SIZE];
+	kmem_cache_t *slab;
+	mempool_t *pool;
+	atomic_t allocated;	/* FIXME: debug */
+};
+
+#define BIOVEC_NR_POOLS 6
+struct bio_set {
+	char name[BV_NAME_SIZE];
+	kmem_cache_t *bio_slab;
+	mempool_t *bio_pool;
+	struct biovec_pool pools[BIOVEC_NR_POOLS];
+};
+
+/*----------------*/
+
+static void bio_set_exit(struct bio_set *bs)
+{
+	unsigned i;
+	struct biovec_pool *bp;
+
+	if (bs->bio_pool)
+		mempool_destroy(bs->bio_pool);
+
+	if (bs->bio_slab)
+		kmem_cache_destroy(bs->bio_slab);
+
+	for (i = 0; i < BIOVEC_NR_POOLS; i++) {
+		bp = bs->pools + i;
+		if (bp->pool)
+			mempool_destroy(bp->pool);
+
+		if (bp->slab)
+			kmem_cache_destroy(bp->slab);
+	}
+}
+
+/*----------------*/
+
+static void mk_name(char *str, size_t len, const char *prefix, unsigned count)
+{
+	int r;
+
+	r = snprintf(str, len, "%s-%u", prefix, count);
+	if (r < 0)
+		str[len - 1] = '\0';
+}
+
+static int bio_set_init(struct bio_set *bs, const char *slab_prefix,
+			 unsigned pool_entries, unsigned scale)
+{
+	/* FIXME: this must match bvec_index(), why not go the
+	 * whole hog and have a pool per power of 2 ? */
+	static unsigned _vec_lengths[BIOVEC_NR_POOLS] = {
+		1, 4, 16, 64, 128, BIO_MAX_PAGES
+	};
+
+
+	int r;
+	unsigned i, size;
+	struct biovec_pool *bp;
+
+	/* zero the bs so we can tear down properly on error */
+	memset(bs, 0, sizeof(*bs));
+
+	/*
+	 * Set up the bio pool.
+	 */
+	r = snprintf(bs->name, sizeof(bs->name), "%s-bio", slab_prefix);
+	if (r < 0)
+		bs->name[sizeof(bs->name) - 1] = '\0';
+
+	bs->bio_slab = kmem_cache_create(bs->name, sizeof(struct bio), 0,
+					 SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!bs->bio_slab) {
+		DMWARN("can't init bio slab");
+		goto bad;
+	}
+
+	bs->bio_pool = mempool_create(pool_entries, mempool_alloc_slab,
+				      mempool_free_slab, bs->bio_slab);
+	if (!bs->bio_pool) {
+		DMWARN("can't init bio pool");
+		goto bad;
+	}
+
+	/*
+	 * Set up the biovec pools.
+	 */
+	for (i = 0; i < BIOVEC_NR_POOLS; i++) {
+		bp = bs->pools + i;
+		bp->nr_vecs = _vec_lengths[i];
+		atomic_set(&bp->allocated, 1); /* FIXME: debug */
+
+
+		size = bp->nr_vecs * sizeof(struct bio_vec);
+
+		mk_name(bp->name, sizeof(bp->name), slab_prefix, i);
+		bp->slab = kmem_cache_create(bp->name, size, 0,
+					     SLAB_HWCACHE_ALIGN, NULL, NULL);
+		if (!bp->slab) {
+			DMWARN("can't init biovec slab cache");
+			goto bad;
+		}
+
+		if (i >= scale)
+			pool_entries >>= 1;
+
+		bp->pool = mempool_create(pool_entries, mempool_alloc_slab,
+					  mempool_free_slab, bp->slab);
+		if (!bp->pool) {
+			DMWARN("can't init biovec mempool");
+			goto bad;
+		}
+	}
+
+	return 0;
+
+ bad:
+	bio_set_exit(bs);
+	return -ENOMEM;
+}
+
+/*----------------*/
+
+/* FIXME: blech */
+static inline unsigned bvec_index(unsigned nr)
+{
+	switch (nr) {
+	case 1:		return 0;
+	case 2 ... 4: 	return 1;
+	case 5 ... 16:	return 2;
+	case 17 ... 64:	return 3;
+	case 65 ... 128:return 4;
+	case 129 ... BIO_MAX_PAGES: return 5;
+	}
+
+	BUG();
+	return 0;
+}
+
+static inline void bs_bio_init(struct bio *bio)
+{
+	bio->bi_next = NULL;
+	bio->bi_flags = 1 << BIO_UPTODATE;
+	bio->bi_rw = 0;
+	bio->bi_vcnt = 0;
+	bio->bi_idx = 0;
+	bio->bi_phys_segments = 0;
+	bio->bi_hw_segments = 0;
+	bio->bi_size = 0;
+	bio->bi_max_vecs = 0;
+	bio->bi_end_io = NULL;
+	atomic_set(&bio->bi_cnt, 1);
+	bio->bi_private = NULL;
+}
+
+static unsigned _bio_count = 0;
+struct bio *bio_set_alloc(struct bio_set *bs, int gfp_mask, int nr_iovecs)
+{
+	struct biovec_pool *bp;
+	struct bio_vec *bv = NULL;
+	unsigned long idx;
+	struct bio *bio;
+
+	bio = mempool_alloc(bs->bio_pool, gfp_mask);
+	if (unlikely(!bio))
+		return NULL;
+
+	bio_init(bio);
+
+	if (likely(nr_iovecs)) {
+		idx = bvec_index(nr_iovecs);
+		bp = bs->pools + idx;
+		bv = mempool_alloc(bp->pool, gfp_mask);
+		if (!bv) {
+			mempool_free(bio, bs->bio_pool);
+			return NULL;
+		}
+
+		memset(bv, 0, bp->nr_vecs * sizeof(*bv));
+		bio->bi_flags |= idx << BIO_POOL_OFFSET;
+		bio->bi_max_vecs = bp->nr_vecs;
+		atomic_inc(&bp->allocated);
+	}
+
+	bio->bi_io_vec = bv;
+	return bio;
+}
+
+static void bio_set_free(struct bio_set *bs, struct bio *bio)
+{
+	struct biovec_pool *bp = bs->pools + BIO_POOL_IDX(bio);
+
+	if (atomic_dec_and_test(&bp->allocated))
+		BUG();
+
+	mempool_free(bio->bi_io_vec, bp->pool);
+	mempool_free(bio, bs->bio_pool);
+}
+
+/*-----------------------------------------------------------------
+ * dm-io proper
+ *---------------------------------------------------------------*/
+static struct bio_set _bios;
+
+/* FIXME: can we shrink this ? */
+struct io {
+	unsigned long error;
+	atomic_t count;
+	struct task_struct *sleeper;
+	io_notify_fn callback;
+	void *context;
+};
+
+/*
+ * io contexts are only dynamically allocated for asynchronous
+ * io.  Since async io is likely to be the majority of io we'll
+ * have the same number of io contexts as buffer heads!  (FIXME:
+ * must reduce this).
+ */
+static unsigned _num_ios;
+static mempool_t *_io_pool;
+
+static void *alloc_io(int gfp_mask, void *pool_data)
+{
+	return kmalloc(sizeof(struct io), gfp_mask);
+}
+
+static void free_io(void *element, void *pool_data)
+{
+	kfree(element);
+}
+
+static unsigned int pages_to_ios(unsigned int pages)
+{
+	return 4 * pages;	/* too many ? */
+}
+
+static int resize_pool(unsigned int new_ios)
+{
+	int r = 0;
+
+	if (_io_pool) {
+		if (new_ios == 0) {
+			/* free off the pool */
+			mempool_destroy(_io_pool);
+			_io_pool = NULL;
+			bio_set_exit(&_bios);
+
+		} else {
+			/* resize the pool */
+			r = mempool_resize(_io_pool, new_ios, GFP_KERNEL);
+		}
+
+	} else {
+		/* create new pool */
+		_io_pool = mempool_create(new_ios, alloc_io, free_io, NULL);
+		if (!_io_pool)
+			return -ENOMEM;
+
+		r = bio_set_init(&_bios, "dm-io", 512, 1);
+		if (r) {
+			mempool_destroy(_io_pool);
+			_io_pool = NULL;
+		}
+	}
+
+	if (!r)
+		_num_ios = new_ios;
+
+	return r;
+}
+
+int dm_io_get(unsigned int num_pages)
+{
+	return resize_pool(_num_ios + pages_to_ios(num_pages));
+}
+
+void dm_io_put(unsigned int num_pages)
+{
+	resize_pool(_num_ios - pages_to_ios(num_pages));
+}
+
+/*-----------------------------------------------------------------
+ * We need to keep track of which region a bio is doing io for.
+ * In order to save a memory allocation we store this in the last
+ * bvec which we know is unused (blech).
+ *---------------------------------------------------------------*/
+static inline void bio_set_region(struct bio *bio, unsigned region)
+{
+//	bio->bi_io_vec[bio->bi_max_vecs - 1].bv_len = region;
+}
+
+static inline unsigned bio_get_region(struct bio *bio)
+{
+//	return bio->bi_io_vec[bio->bi_max_vecs - 1].bv_len;
+	return 0;
+}
+
+/*-----------------------------------------------------------------
+ * We need an io object to keep track of the number of bios that
+ * have been dispatched for a particular io.
+ *---------------------------------------------------------------*/
+static void dec_count(struct io *io, unsigned int region, int error)
+{
+	if (error)
+		set_bit(region, &io->error);
+
+	if (atomic_dec_and_test(&io->count)) {
+		if (io->sleeper)
+			wake_up_process(io->sleeper);
+
+		else {
+			unsigned long r = io->error;
+			io_notify_fn fn = io->callback;
+			void *context = io->context;
+
+			mempool_free(io, _io_pool);
+			fn(r, context);
+		}
+	}
+}
+
+static int endio(struct bio *bio, unsigned int done, int error)
+{
+	struct io *io = (struct io *) bio->bi_private;
+
+	/* keep going until we've finished */
+	if (bio->bi_size)
+		return 1;
+
+	/* FIXME: kcopyd needs pages zeroing on read failure ???
+	 * sounds like kcopyd is broken */
+	dec_count(io, bio_get_region(bio), error);
+	bio_put(bio);
+
+	return 0;
+}
+
+static void bio_dtr(struct bio *bio)
+{
+	_bio_count--;
+	bio_set_free(&_bios, bio);
+}
+
+/*-----------------------------------------------------------------
+ * These little objects provide an abstraction for getting a new
+ * destination page for io.
+ *---------------------------------------------------------------*/
+struct dpages {
+	void (*get_page)(struct dpages *dp,
+			 struct page **p, unsigned long *len, unsigned *offset);
+	void (*next_page)(struct dpages *dp);
+
+	unsigned context_u;
+	void *context_ptr;
+};
+
+/*
+ * Functions for getting the pages from a list.
+ */
+static void list_get_page(struct dpages *dp,
+			  struct page **p, unsigned long *len, unsigned *offset)
+{
+	unsigned o = dp->context_u;
+
+	*p = (struct page *) dp->context_ptr;
+	*len = PAGE_SIZE - o;
+	*offset = o;
+}
+
+static void list_next_page(struct dpages *dp)
+{
+	struct page *page = (struct page *) dp->context_ptr;
+	dp->context_ptr = list_entry(page->list.next, struct page, list);
+	dp->context_u = 0;
+}
+
+static void list_dp_init(struct dpages *dp, struct page *page, unsigned offset)
+{
+	dp->get_page = list_get_page;
+	dp->next_page = list_next_page;
+	dp->context_u = offset;
+	dp->context_ptr = page;
+}
+
+/*
+ * Functions for getting the pages from a bvec.
+ */
+static void bvec_get_page(struct dpages *dp,
+			  struct page **p, unsigned long *len, unsigned *offset)
+{
+	struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr;
+	*p = bvec->bv_page;
+	*len = bvec->bv_len;
+	*offset = bvec->bv_offset;
+}
+
+static void bvec_next_page(struct dpages *dp)
+{
+	struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr;
+	dp->context_ptr = bvec + 1;
+}
+
+static void bvec_dp_init(struct dpages *dp, struct bio_vec *bvec)
+{
+	dp->get_page = bvec_get_page;
+	dp->next_page = bvec_next_page;
+	dp->context_ptr = bvec;
+}
+
+/*-----------------------------------------------------------------
+ * IO routines that accept a list of pages.
+ *---------------------------------------------------------------*/
+static void do_region(int rw, unsigned int region, struct io_region *where,
+		      struct dpages *dp, struct io *io)
+{
+	struct bio *bio;
+	struct page *page;
+	unsigned long len;
+	unsigned offset;
+	unsigned num_bvecs;
+	sector_t remaining = where->count;
+
+	while (remaining) {
+		/*
+		 * Allocate a suitably sized bio, we add an extra
+		 * bvec for bio_get/set_region().
+		 */
+		num_bvecs = (remaining / (PAGE_SIZE >> 9)) + 2;
+		_bio_count++;
+		bio = bio_set_alloc(&_bios, GFP_NOIO, num_bvecs);
+		bio->bi_sector = where->sector + (where->count - remaining);
+		bio->bi_bdev = where->bdev;
+		bio->bi_end_io = endio;
+		bio->bi_private = io;
+		bio->bi_destructor = bio_dtr;
+		bio_set_region(bio, region);
+
+		/*
+		 * Try and add as many pages as possible.
+		 */
+		while (remaining) {
+			dp->get_page(dp, &page, &len, &offset);
+			len = min(len, to_bytes(remaining));
+			if (!bio_add_page(bio, page, len, offset))
+				break;
+
+			offset = 0;
+			remaining -= to_sector(len);
+			dp->next_page(dp);
+		}
+
+		atomic_inc(&io->count);
+		submit_bio(rw, bio);
+	}
+}
+
+static void dispatch_io(int rw, unsigned int num_regions,
+			struct io_region *where, struct dpages *dp,
+			struct io *io)
+{
+	int i;
+
+	for (i = 0; i < num_regions; i++)
+		if (where[i].count)
+			do_region(rw, i, where + i, dp, io);
+
+	/*
+	 * Drop the extra reference that we were holding to avoid
+	 * the io being completed too early.
+	 */
+	dec_count(io, 0, 0);
+}
+
+static int sync_io(unsigned int num_regions, struct io_region *where,
+		   int rw, struct dpages *dp, unsigned long *error_bits)
+{
+	struct io io;
+
+	BUG_ON(num_regions > 1 && rw != WRITE);
+
+	io.error = 0;
+	atomic_set(&io.count, 1); /* see dispatch_io() */
+	io.sleeper = current;
+
+	dispatch_io(rw, num_regions, where, dp, &io);
+	blk_run_queues();
+
+	while (1) {
+		/* FIXME: handle signals */
+		set_current_state(TASK_UNINTERRUPTIBLE);
+
+		if (!atomic_read(&io.count))
+			break;
+
+		schedule();
+	}
+	set_current_state(TASK_RUNNING);
+
+	*error_bits = io.error;
+	return io.error ? -EIO : 0;
+}
+
+static int async_io(unsigned int num_regions, struct io_region *where, int rw,
+		    struct dpages *dp, io_notify_fn fn, void *context)
+{
+	struct io *io = mempool_alloc(_io_pool, GFP_NOIO);
+
+	io->error = 0;
+	atomic_set(&io->count, 1); /* see dispatch_io() */
+	io->sleeper = NULL;
+	io->callback = fn;
+	io->context = context;
+
+	dispatch_io(rw, num_regions, where, dp, io);
+	return 0;
+}
+
+int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw,
+	       struct page *pages, unsigned int offset,
+	       unsigned long *error_bits)
+{
+	struct dpages dp;
+	list_dp_init(&dp, pages, offset);
+	return sync_io(num_regions, where, rw, &dp, error_bits);
+}
+
+int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, int rw,
+		    struct bio_vec *bvec, unsigned long *error_bits)
+{
+	struct dpages dp;
+	bvec_dp_init(&dp, bvec);
+	return sync_io(num_regions, where, rw, &dp, error_bits);
+}
+
+int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
+		struct page *pages, unsigned int offset,
+		io_notify_fn fn, void *context)
+{
+	struct dpages dp;
+	list_dp_init(&dp, pages, offset);
+	return async_io(num_regions, where, rw, &dp, fn, context);
+}
+
+int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, int rw,
+		     struct bio_vec *bvec, io_notify_fn fn, void *context)
+{
+	struct dpages dp;
+	bvec_dp_init(&dp, bvec);
+	return async_io(num_regions, where, rw, &dp, fn, context);
+}
+
+
+EXPORT_SYMBOL(dm_io_get);
+EXPORT_SYMBOL(dm_io_put);
+EXPORT_SYMBOL(dm_io_sync);
+EXPORT_SYMBOL(dm_io_async);
--- diff/drivers/md/dm-io.h	1970-01-01 01:00:00.000000000 +0100
+++ source/drivers/md/dm-io.h	2003-11-26 10:18:32.000000000 +0000
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2003 Sistina Software
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef _DM_IO_H
+#define _DM_IO_H
+
+#include "dm.h"
+
+/* FIXME make this configurable */
+#define DM_MAX_IO_REGIONS 8
+
+struct io_region {
+	struct block_device *bdev;
+	sector_t sector;
+	sector_t count;
+};
+
+
+/*
+ * 'error' is a bitset, with each bit indicating whether an error
+ * occurred doing io to the corresponding region.
+ */
+typedef void (*io_notify_fn)(unsigned long error, void *context);
+
+
+/*
+ * Before anyone uses the IO interface they should call
+ * dm_io_get(), specifying roughly how many pages they are
+ * expecting to perform io on concurrently.
+ *
+ * This function may block.
+ */
+int dm_io_get(unsigned int num_pages);
+void dm_io_put(unsigned int num_pages);
+
+
+/*
+ * Synchronous IO.
+ *
+ * Please ensure that the rw flag in the next two functions is
+ * either READ or WRITE, i.e. we don't take READA.  Any
+ * regions with a zero count field will be ignored.
+ */
+int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw,
+	       struct page *pages, unsigned int offset,
+	       unsigned long *error_bits);
+
+int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, int rw,
+		    struct bio_vec *bvec, unsigned long *error_bits);
+
+
+/*
+ * Asynchronous IO.
+ *
+ * The 'where' array may be safely allocated on the stack since
+ * the function takes a copy.
+ */
+int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
+		struct page *pages, unsigned int offset,
+		io_notify_fn fn, void *context);
+
+int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, int rw,
+		     struct bio_vec *bvec, io_notify_fn fn, void *context);
+
+#endif
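
A sketch of the synchronous interface (not in the patch), shaped after
the chunk_io() caller in dm-exception-store.c above; the pages are
chained through page->list as alloc_area() does there:

	static int read_sectors(struct block_device *bdev, sector_t start,
				sector_t nr_sectors, struct page *pages)
	{
		struct io_region where;
		unsigned long error_bits;

		where.bdev = bdev;
		where.sector = start;
		where.count = nr_sectors;

		/* dm_io_get() must have reserved enough pages first */
		return dm_io_sync(1, &where, READ, pages, 0, &error_bits);
	}
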
--- diff/drivers/md/dm-log.c	1970-01-01 01:00:00.000000000 +0100
+++ source/drivers/md/dm-log.c	2003-11-26 10:18:32.000000000 +0000
@@ -0,0 +1,307 @@
+/*
+ * Copyright (C) 2003 Sistina Software
+ *
+ * This file is released under the LGPL.
+ */
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+
+#include "dm-log.h"
+#include "dm-io.h"
+
+static LIST_HEAD(_log_types);
+static spinlock_t _lock = SPIN_LOCK_UNLOCKED;
+
+int dm_register_dirty_log_type(struct dirty_log_type *type)
+{
+	spin_lock(&_lock);
+	type->use_count = 0;
+	try_module_get(type->module);
+
+	list_add(&type->list, &_log_types);
+	spin_unlock(&_lock);
+
+	return 0;
+}
+
+int dm_unregister_dirty_log_type(struct dirty_log_type *type)
+{
+	spin_lock(&_lock);
+
+	if (type->use_count)
+		DMWARN("Attempt to unregister a log type that is still in use");
+	else {
+		list_del(&type->list);
+		module_put(type->module);
+	}
+
+	spin_unlock(&_lock);
+
+	return 0;
+}
+
+static struct dirty_log_type *get_type(const char *type_name)
+{
+	struct dirty_log_type *type;
+	struct list_head *tmp;
+
+	spin_lock(&_lock);
+	list_for_each (tmp, &_log_types) {
+		type = list_entry(tmp, struct dirty_log_type, list);
+		if (!strcmp(type_name, type->name)) {
+			type->use_count++;
+			spin_unlock(&_lock);
+			return type;
+		}
+	}
+
+	spin_unlock(&_lock);
+	return NULL;
+}
+
+static void put_type(struct dirty_log_type *type)
+{
+	spin_lock(&_lock);
+	type->use_count--;
+	spin_unlock(&_lock);
+}
+
+struct dirty_log *dm_create_dirty_log(const char *type_name, sector_t dev_size,
+				      unsigned int argc, char **argv)
+{
+	struct dirty_log_type *type;
+	struct dirty_log *log;
+
+	log = kmalloc(sizeof(*log), GFP_KERNEL);
+	if (!log)
+		return NULL;
+
+	type = get_type(type_name);
+	if (!type) {
+		kfree(log);
+		return NULL;
+	}
+
+	log->type = type;
+	if (type->ctr(log, dev_size, argc, argv)) {
+		kfree(log);
+		put_type(type);
+		return NULL;
+	}
+
+	return log;
+}
+
+void dm_destroy_dirty_log(struct dirty_log *log)
+{
+	log->type->dtr(log);
+	put_type(log->type);
+	kfree(log);
+}
+
+
+/*-----------------------------------------------------------------
+ * In-core log, i.e. trivial, non-persistent
+ *
+ * For now we'll keep this simple and just have 2 bitsets, one
+ * for clean/dirty, the other for sync/nosync.  The sync bitset
+ * will be freed when everything is in sync.
+ *
+ * FIXME: problems with a 64bit sector_t
+ *---------------------------------------------------------------*/
+struct core_log {
+	sector_t region_size;
+	unsigned int region_count;
+	unsigned long *clean_bits;
+	unsigned long *sync_bits;
+	unsigned long *recovering_bits;	/* FIXME: this seems excessive */
+
+	int sync_search;
+};
+
+#define BYTE_SHIFT 3
+static int core_ctr(struct dirty_log *log, sector_t dev_size,
+		    unsigned int argc, char **argv)
+{
+	struct core_log *clog;
+	sector_t region_size;
+	unsigned int region_count;
+	size_t bitset_size;
+
+	if (argc != 1) {
+		DMWARN("wrong number of arguments to core_log");
+		return -EINVAL;
+	}
+
+	if (sscanf(argv[0], SECTOR_FORMAT, &region_size) != 1) {
+		DMWARN("invalid region size string");
+		return -EINVAL;
+	}
+
+	region_count = dm_div_up(dev_size, region_size);
+
+	clog = kmalloc(sizeof(*clog), GFP_KERNEL);
+	if (!clog) {
+		DMWARN("couldn't allocate core log");
+		return -ENOMEM;
+	}
+
+	clog->region_size = region_size;
+	clog->region_count = region_count;
+
+	/*
+	 * Work out how many bytes we need to hold the bitset.
+	 */
+	bitset_size = dm_round_up(region_count,
+				  sizeof(*clog->clean_bits) << BYTE_SHIFT);
+	bitset_size >>= BYTE_SHIFT;
+
+	clog->clean_bits = vmalloc(bitset_size);
+	if (!clog->clean_bits) {
+		DMWARN("couldn't allocate clean bitset");
+		kfree(clog);
+		return -ENOMEM;
+	}
+	memset(clog->clean_bits, -1, bitset_size);
+
+	clog->sync_bits = vmalloc(bitset_size);
+	if (!clog->sync_bits) {
+		DMWARN("couldn't allocate sync bitset");
+		vfree(clog->clean_bits);
+		kfree(clog);
+		return -ENOMEM;
+	}
+	memset(clog->sync_bits, 0, bitset_size);
+
+	clog->recovering_bits = vmalloc(bitset_size);
+	if (!clog->recovering_bits) {
+		DMWARN("couldn't allocate sync bitset");
+		vfree(clog->sync_bits);
+		vfree(clog->clean_bits);
+		kfree(clog);
+		return -ENOMEM;
+	}
+	memset(clog->recovering_bits, 0, bitset_size);
+	clog->sync_search = 0;
+	log->context = clog;
+	return 0;
+}
+
+static void core_dtr(struct dirty_log *log)
+{
+	struct core_log *clog = (struct core_log *) log->context;
+	vfree(clog->clean_bits);
+	vfree(clog->sync_bits);
+	vfree(clog->recovering_bits);
+	kfree(clog);
+}
+
+static sector_t core_get_region_size(struct dirty_log *log)
+{
+	struct core_log *clog = (struct core_log *) log->context;
+	return clog->region_size;
+}
+
+static int core_is_clean(struct dirty_log *log, region_t region)
+{
+	struct core_log *clog = (struct core_log *) log->context;
+	return test_bit(region, clog->clean_bits);
+}
+
+static int core_in_sync(struct dirty_log *log, region_t region, int block)
+{
+	struct core_log *clog = (struct core_log *) log->context;
+
+	return test_bit(region, clog->sync_bits) ? 1 : 0;
+}
+
+static int core_flush(struct dirty_log *log)
+{
+	/* no op */
+	return 0;
+}
+
+static void core_mark_region(struct dirty_log *log, region_t region)
+{
+	struct core_log *clog = (struct core_log *) log->context;
+	clear_bit(region, clog->clean_bits);
+}
+
+static void core_clear_region(struct dirty_log *log, region_t region)
+{
+	struct core_log *clog = (struct core_log *) log->context;
+	set_bit(region, clog->clean_bits);
+}
+
+static int core_get_resync_work(struct dirty_log *log, region_t *region)
+{
+	struct core_log *clog = (struct core_log *) log->context;
+
+	if (clog->sync_search >= clog->region_count)
+		return 0;
+
+	do {
+		*region = find_next_zero_bit(clog->sync_bits,
+					     clog->region_count,
+					     clog->sync_search);
+		clog->sync_search = *region + 1;
+
+		if (*region == clog->region_count)
+			return 0;
+
+	} while (test_bit(*region, clog->recovering_bits));
+
+	set_bit(*region, clog->recovering_bits);
+	return 1;
+}
+
+static void core_complete_resync_work(struct dirty_log *log, region_t region,
+				      int success)
+{
+	struct core_log *clog = (struct core_log *) log->context;
+
+	clear_bit(region, clog->recovering_bits);
+	if (success)
+		set_bit(region, clog->sync_bits);
+}
+
+static struct dirty_log_type _core_type = {
+	.name = "core",
+
+	.ctr = core_ctr,
+	.dtr = core_dtr,
+	.get_region_size = core_get_region_size,
+	.is_clean = core_is_clean,
+	.in_sync = core_in_sync,
+	.flush = core_flush,
+	.mark_region = core_mark_region,
+	.clear_region = core_clear_region,
+	.get_resync_work = core_get_resync_work,
+	.complete_resync_work = core_complete_resync_work
+};
+
+__init int dm_dirty_log_init(void)
+{
+	int r;
+
+	r = dm_register_dirty_log_type(&_core_type);
+	if (r)
+		DMWARN("couldn't register core log");
+
+	return r;
+}
+
+void dm_dirty_log_exit(void)
+{
+	dm_unregister_dirty_log_type(&_core_type);
+}
+
+EXPORT_SYMBOL(dm_register_dirty_log_type);
+EXPORT_SYMBOL(dm_unregister_dirty_log_type);
+EXPORT_SYMBOL(dm_dirty_log_init);
+EXPORT_SYMBOL(dm_dirty_log_exit);
+EXPORT_SYMBOL(dm_create_dirty_log);
+EXPORT_SYMBOL(dm_destroy_dirty_log);
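
A sketch of driving the "core" log (not in the patch; dm-raid1.c below
is the real consumer).  Constructor arguments arrive as strings, so the
region size is passed that way:

	static int log_example(sector_t dev_size)
	{
		char *argv[] = { "1024" };	/* region size in sectors */
		struct dirty_log *log;
		region_t region;

		log = dm_create_dirty_log("core", dev_size, 1, argv);
		if (!log)
			return -ENOMEM;

		/* mark before writing; clear once the region quiesces */
		log->type->mark_region(log, 0);
		log->type->clear_region(log, 0);

		/* hand out resync work until none is left */
		while (log->type->get_resync_work(log, &region) == 1)
			log->type->complete_resync_work(log, region, 1);

		dm_destroy_dirty_log(log);
		return 0;
	}
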
--- diff/drivers/md/dm-log.h	1970-01-01 01:00:00.000000000 +0100
+++ source/drivers/md/dm-log.h	2003-11-26 10:18:32.000000000 +0000
@@ -0,0 +1,112 @@
+/*
+ * Copyright (C) 2003 Sistina Software
+ *
+ * This file is released under the LGPL.
+ */
+
+#ifndef DM_DIRTY_LOG
+#define DM_DIRTY_LOG
+
+#include "dm.h"
+
+typedef sector_t region_t;
+
+struct dirty_log_type;
+
+struct dirty_log {
+	struct dirty_log_type *type;
+	void *context;
+};
+
+struct dirty_log_type {
+	struct list_head list;
+	const char *name;
+	struct module *module;
+	unsigned int use_count;
+
+	int (*ctr)(struct dirty_log *log, sector_t dev_size,
+		   unsigned int argc, char **argv);
+	void (*dtr)(struct dirty_log *log);
+
+	/*
+	 * Retrieves the smallest size of region that the log can
+	 * deal with.
+	 */
+	sector_t (*get_region_size)(struct dirty_log *log);
+
+	/*
+	 * A predicate to say whether a region is clean or not.
+	 * May block.
+	 */
+	int (*is_clean)(struct dirty_log *log, region_t region);
+
+	/*
+	 * Returns: 0, 1, -EWOULDBLOCK, < 0
+	 *
+	 * A predicate function to check whether the given
+	 * region is in sync.
+	 *
+	 * If -EWOULDBLOCK is returned the state of the region is
+	 * unknown, typically this will result in a read being
+	 * passed to a daemon to deal with, since a daemon is
+	 * allowed to block.
+	 */
+	int (*in_sync)(struct dirty_log *log, region_t region, int can_block);
+
+	/*
+	 * Flush the current log state (eg, to disk).  This
+	 * function may block.
+	 */
+	int (*flush)(struct dirty_log *log);
+
+	/*
+	 * Mark an area as clean or dirty.  These functions may
+	 * block, though for performance reasons blocking should
+	 * be extremely rare (eg, allocating another chunk of
+	 * memory for some reason).
+	 */
+	void (*mark_region)(struct dirty_log *log, region_t region);
+	void (*clear_region)(struct dirty_log *log, region_t region);
+
+	/*
+	 * Returns: <0 (error), 0 (no region), 1 (region)
+	 *
+	 * The mirrord will need to perform recovery on regions of
+	 * the mirror that are in the NOSYNC state.  This
+	 * function asks the log to tell the caller about the
+	 * next region that this machine should recover.
+	 *
+	 * Do not confuse this function with 'in_sync()'; one
+	 * tells you if an area is synchronised, the other
+	 * assigns recovery work.
+	 */
+	int (*get_resync_work)(struct dirty_log *log, region_t *region);
+
+	/*
+	 * This notifies the log that the resync of an area has
+	 * been completed.  The log should then mark this region
+	 * as CLEAN.
+	 */
+	void (*complete_resync_work)(struct dirty_log *log,
+				     region_t region, int success);
+};
+
+int dm_register_dirty_log_type(struct dirty_log_type *type);
+int dm_unregister_dirty_log_type(struct dirty_log_type *type);
+
+/*
+ * Make sure you use these two functions, rather than calling
+ * type->ctr/dtr() directly.
+ */
+struct dirty_log *dm_create_dirty_log(const char *type_name, sector_t dev_size,
+				      unsigned int argc, char **argv);
+void dm_destroy_dirty_log(struct dirty_log *log);
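+
+/*
+ * A minimal usage sketch (hypothetical caller; the core log's
+ * single argument is assumed to be the region size in sectors):
+ *
+ *	char *params[] = { "1024" };
+ *	struct dirty_log *log;
+ *
+ *	log = dm_create_dirty_log("core", dev_size, 1, params);
+ *	if (!log)
+ *		return -EINVAL;
+ *	...
+ *	dm_destroy_dirty_log(log);
+ */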
+
+/*
+ * init/exit functions.
+ */
+int dm_dirty_log_init(void);
+void dm_dirty_log_exit(void);
+
+#endif
--- diff/drivers/md/dm-raid1.c	1970-01-01 01:00:00.000000000 +0100
+++ source/drivers/md/dm-raid1.c	2003-11-26 10:18:32.000000000 +0000
@@ -0,0 +1,1300 @@
+/*
+ * Copyright (C) 2003 Sistina Software Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+#include "dm-daemon.h"
+#include "dm-io.h"
+#include "dm-log.h"
+#include "kcopyd.h"
+
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/vmalloc.h>
+
+static struct dm_daemon _kmirrord;
+
+/*-----------------------------------------------------------------
+ * buffer lists:
+ *
+ * We play with singly linked lists of bios, but we want to be
+ * careful to add new bios to the back of the list, to avoid
+ * buffers being starved of attention.
+ *---------------------------------------------------------------*/
+struct bio_list {
+	struct bio *head;
+	struct bio *tail;
+};
+
+static inline void bio_list_init(struct bio_list *bl)
+{
+	bl->head = bl->tail = NULL;
+}
+
+static inline void bio_list_add(struct bio_list *bl, struct bio *bio)
+{
+	bio->bi_next = NULL;
+
+	if (bl->tail) {
+		bl->tail->bi_next = bio;
+		bl->tail = bio;
+	} else
+		bl->head = bl->tail = bio;
+}
+
+static struct bio *bio_list_pop(struct bio_list *bl)
+{
+	struct bio *bio = bl->head;
+
+	if (bio) {
+		bl->head = bl->head->bi_next;
+		if (!bl->head)
+			bl->tail = NULL;
+
+		bio->bi_next = NULL;
+	}
+
+	return bio;
+}
+
+/*-----------------------------------------------------------------
+ * Region hash
+ *
+ * The mirror splits itself up into discrete regions.  Each
+ * region can be in one of three states: clean, dirty,
+ * nosync.  There is no need to put clean regions in the hash.
+ *
+ * In addition to being present in the hash table a region _may_
+ * be present on one of three lists.
+ *
+ *   clean_regions: Regions on this list have no io pending to
+ *   them, they are in sync, we are no longer interested in them,
+ *   they are dull.  rh_update_states() will remove them from the
+ *   hash table.
+ *
+ *   quiesced_regions: These regions have been spun down, ready
+ *   for recovery.  rh_recovery_start() will remove regions from
+ *   this list and hand them to kmirrord, which will schedule the
+ *   recovery io with kcopyd.
+ *
+ *   recovered_regions: Regions that kcopyd has successfully
+ *   recovered.  rh_update_states() will now schedule any delayed
+ *   io, up the recovery_count, and remove the region from the
+ *   hash.
+ *
+ * There are 2 locks:
+ *   A rw spin lock 'hash_lock' protects just the hash table,
+ *   this is never held in write mode from interrupt context,
+ *   which I believe means that we only have to disable irqs when
+ *   doing a write lock.
+ *
+ *   An ordinary spin lock 'region_lock' that protects the three
+ *   lists in the region_hash, along with the 'state', 'list' and
+ *   'delayed_bios' fields of the regions.  This is used from irq
+ *   context, so all other uses will have to disable local irqs.
+ *---------------------------------------------------------------*/
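+/*
+ * State transitions, as implemented below: a write to a CLEAN
+ * region marks it DIRTY (rh_inc); when the last pending io
+ * completes, rh_dec() moves it back to CLEAN.  Out-of-sync
+ * regions start as NOSYNC, are switched to RH_RECOVERING by
+ * rh_recovery_prepare(), quiesced, copied by kcopyd and
+ * finally returned to a clean state by rh_update_states().
+ */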
+struct mirror_set;
+struct region_hash {
+	struct mirror_set *ms;
+	sector_t region_size;
+	unsigned region_shift;
+
+	/* holds persistent region state */
+	struct dirty_log *log;
+
+	/* hash table */
+	rwlock_t hash_lock;
+	mempool_t *region_pool;
+	unsigned int mask;
+	unsigned int nr_buckets;
+	struct list_head *buckets;
+
+	spinlock_t region_lock;
+	struct semaphore recovery_count;
+	struct list_head clean_regions;
+	struct list_head quiesced_regions;
+	struct list_head recovered_regions;
+};
+
+enum {
+	RH_CLEAN,
+	RH_DIRTY,
+	RH_NOSYNC,
+	RH_RECOVERING
+};
+
+struct region {
+	struct region_hash *rh;	/* FIXME: can we get rid of this ? */
+	region_t key;
+	int state;
+
+	struct list_head hash_list;
+	struct list_head list;
+
+	atomic_t pending;
+	struct bio *delayed_bios;
+};
+
+/*
+ * Conversion fns
+ */
+static inline region_t bio_to_region(struct region_hash *rh, struct bio *bio)
+{
+	return bio->bi_sector >> rh->region_shift;
+}
+
+static inline sector_t region_to_sector(struct region_hash *rh, region_t region)
+{
+	return region << rh->region_shift;
+}
+
+/* FIXME move this */
+static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw);
+
+static void *region_alloc(int gfp_mask, void *pool_data)
+{
+	return kmalloc(sizeof(struct region), gfp_mask);
+}
+
+static void region_free(void *element, void *pool_data)
+{
+	kfree(element);
+}
+
+#define MIN_REGIONS 64
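+/*
+ * Maximum number of in-flight region recoveries;
+ * rh_start_recovery() primes the 'recovery_count' semaphore
+ * with this many tokens.
+ */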
+#define MAX_RECOVERY 1
+static int rh_init(struct region_hash *rh, struct mirror_set *ms,
+		   struct dirty_log *log, sector_t region_size,
+		   region_t nr_regions)
+{
+	unsigned int nr_buckets, max_buckets;
+	size_t i;
+
+	/*
+	 * Calculate a suitable number of buckets for our hash
+	 * table.
+	 */
+	max_buckets = nr_regions >> 6;
+	for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
+		;
+	nr_buckets >>= 1;
+
+	rh->ms = ms;
+	rh->log = log;
+	rh->region_size = region_size;
+	rh->region_shift = ffs(region_size) - 1;
+	rwlock_init(&rh->hash_lock);
+	rh->mask = nr_buckets - 1;
+	rh->nr_buckets = nr_buckets;
+
+	rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
+	if (!rh->buckets) {
+		DMERR("unable to allocate region hash memory");
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < nr_buckets; i++)
+		INIT_LIST_HEAD(rh->buckets + i);
+
+	spin_lock_init(&rh->region_lock);
+	sema_init(&rh->recovery_count, 0);
+	INIT_LIST_HEAD(&rh->clean_regions);
+	INIT_LIST_HEAD(&rh->quiesced_regions);
+	INIT_LIST_HEAD(&rh->recovered_regions);
+
+	rh->region_pool = mempool_create(MIN_REGIONS, region_alloc,
+					 region_free, NULL);
+	if (!rh->region_pool) {
+		vfree(rh->buckets);
+		rh->buckets = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void rh_exit(struct region_hash *rh)
+{
+	unsigned int h;
+	struct region *reg;
+	struct list_head *tmp, *tmp2;
+
+	BUG_ON(!list_empty(&rh->quiesced_regions));
+	for (h = 0; h < rh->nr_buckets; h++) {
+		list_for_each_safe (tmp, tmp2, rh->buckets + h) {
+			reg = list_entry(tmp, struct region, hash_list);
+			BUG_ON(atomic_read(&reg->pending));
+			mempool_free(reg, rh->region_pool);
+		}
+	}
+
+	if (rh->log)
+		dm_destroy_dirty_log(rh->log);
+	if (rh->region_pool)
+		mempool_destroy(rh->region_pool);
+	vfree(rh->buckets);
+}
+
+#define RH_HASH_MULT 2654435387U
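+/*
+ * Simple multiplicative hash: the constant is close to
+ * 2^32 divided by the golden ratio, the classic choice for
+ * spreading sequential keys across the buckets.
+ */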
+
+static inline unsigned int rh_hash(struct region_hash *rh, region_t region)
+{
+	return (unsigned int) ((region * RH_HASH_MULT) >> 12) & rh->mask;
+}
+
+static struct region *__rh_lookup(struct region_hash *rh, region_t region)
+{
+	struct region *reg;
+
+	list_for_each_entry (reg, rh->buckets + rh_hash(rh, region), hash_list)
+		if (reg->key == region)
+			return reg;
+
+	return NULL;
+}
+
+static void __rh_insert(struct region_hash *rh, struct region *reg)
+{
+	unsigned int h = rh_hash(rh, reg->key);
+	list_add(&reg->hash_list, rh->buckets + h);
+}
+
+static struct region *__rh_alloc(struct region_hash *rh, region_t region)
+{
+	struct region *reg, *nreg;
+
+	read_unlock(&rh->hash_lock);
+	nreg = mempool_alloc(rh->region_pool, GFP_NOIO);
+	nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
+		RH_CLEAN : RH_NOSYNC;
+	nreg->rh = rh;
+	nreg->key = region;
+
+	INIT_LIST_HEAD(&nreg->list);
+
+	atomic_set(&nreg->pending, 0);
+	nreg->delayed_bios = NULL;
+	write_lock_irq(&rh->hash_lock);
+
+	reg = __rh_lookup(rh, region);
+	if (reg)
+		/* we lost the race */
+		mempool_free(nreg, rh->region_pool);
+
+	else {
+		__rh_insert(rh, nreg);
+		if (nreg->state == RH_CLEAN) {
+			spin_lock_irq(&rh->region_lock);
+			list_add(&nreg->list, &rh->clean_regions);
+			spin_unlock_irq(&rh->region_lock);
+		}
+		reg = nreg;
+	}
+	write_unlock_irq(&rh->hash_lock);
+	read_lock(&rh->hash_lock);
+
+	return reg;
+}
+
+static inline struct region *__rh_find(struct region_hash *rh, region_t region)
+{
+	struct region *reg;
+
+	reg = __rh_lookup(rh, region);
+	if (!reg)
+		reg = __rh_alloc(rh, region);
+
+	return reg;
+}
+
+static int rh_state(struct region_hash *rh, region_t region, int may_block)
+{
+	int r;
+	struct region *reg;
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_lookup(rh, region);
+	read_unlock(&rh->hash_lock);
+
+	if (reg)
+		return reg->state;
+
+	/*
+	 * The region wasn't in the hash, so we fall back to the
+	 * dirty log.
+	 */
+	r = rh->log->type->in_sync(rh->log, region, may_block);
+
+	/*
+	 * Any error from the dirty log (e.g. -EWOULDBLOCK) is
+	 * treated as RH_NOSYNC.
+	 */
+	return r == 1 ? RH_CLEAN : RH_NOSYNC;
+}
+
+static inline int rh_in_sync(struct region_hash *rh,
+			     region_t region, int may_block)
+{
+	int state = rh_state(rh, region, may_block);
+	return state == RH_CLEAN || state == RH_DIRTY;
+}
+
+static void dispatch_bios(struct mirror_set *ms, struct bio *bio)
+{
+	struct bio *nbio;
+
+	while (bio) {
+		nbio = bio->bi_next;
+		queue_bio(ms, bio, WRITE);
+		bio = nbio;
+	}
+}
+
+static void rh_update_states(struct region_hash *rh)
+{
+	struct list_head *tmp, *tmp2;
+	struct region *reg;
+
+	LIST_HEAD(clean);
+	LIST_HEAD(recovered);
+
+	/*
+	 * Quickly grab the lists.
+	 */
+	write_lock_irq(&rh->hash_lock);
+	spin_lock(&rh->region_lock);
+	if (!list_empty(&rh->clean_regions)) {
+		list_splice(&rh->clean_regions, &clean);
+		INIT_LIST_HEAD(&rh->clean_regions);
+
+		list_for_each_entry (reg, &clean, list) {
+			rh->log->type->clear_region(rh->log, reg->key);
+			list_del(&reg->hash_list);
+		}
+	}
+
+	if (!list_empty(&rh->recovered_regions)) {
+		list_splice(&rh->recovered_regions, &recovered);
+		INIT_LIST_HEAD(&rh->recovered_regions);
+
+		list_for_each_entry (reg, &recovered, list)
+			list_del(&reg->hash_list);
+	}
+	spin_unlock(&rh->region_lock);
+	write_unlock_irq(&rh->hash_lock);
+
+	/*
+	 * All the regions on the recovered and clean lists have
+	 * now been pulled out of the system, so no need to do
+	 * any more locking.
+	 */
+	list_for_each_safe (tmp, tmp2, &recovered) {
+		reg = list_entry(tmp, struct region, list);
+
+		rh->log->type->complete_resync_work(rh->log, reg->key, 1);
+		dispatch_bios(rh->ms, reg->delayed_bios);
+		up(&rh->recovery_count);
+		mempool_free(reg, rh->region_pool);
+	}
+
+	list_for_each_safe (tmp, tmp2, &clean) {
+		reg = list_entry(tmp, struct region, list);
+		mempool_free(reg, rh->region_pool);
+	}
+}
+
+static void rh_inc(struct region_hash *rh, region_t region)
+{
+	struct region *reg;
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_find(rh, region);
+	if (reg->state == RH_CLEAN) {
+		rh->log->type->mark_region(rh->log, reg->key);
+
+		spin_lock_irq(&rh->region_lock);
+		reg->state = RH_DIRTY;
+		list_del_init(&reg->list);	/* take off the clean list */
+		spin_unlock_irq(&rh->region_lock);
+	}
+
+	atomic_inc(&reg->pending);
+	read_unlock(&rh->hash_lock);
+}
+
+static void rh_inc_pending(struct region_hash *rh, struct bio_list *bios)
+{
+	struct bio *bio;
+
+	for (bio = bios->head; bio; bio = bio->bi_next)
+		rh_inc(rh, bio_to_region(rh, bio));
+}
+
+static void rh_dec(struct region_hash *rh, region_t region)
+{
+	unsigned long flags;
+	struct region *reg;
+	int wake = 0;
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_lookup(rh, region);
+	read_unlock(&rh->hash_lock);
+
+	if (atomic_dec_and_test(&reg->pending)) {
+		spin_lock_irqsave(&rh->region_lock, flags);
+		if (reg->state == RH_RECOVERING) {
+			list_add_tail(&reg->list, &rh->quiesced_regions);
+		} else {
+			reg->state = RH_CLEAN;
+			list_add(&reg->list, &rh->clean_regions);
+		}
+		spin_unlock_irqrestore(&rh->region_lock, flags);
+		wake = 1;
+	}
+
+	if (wake)
+		dm_daemon_wake(&_kmirrord);
+}
+
+/*
+ * Starts quiescing a region in preparation for recovery.
+ */
+static int __rh_recovery_prepare(struct region_hash *rh)
+{
+	int r;
+	struct region *reg;
+	region_t region;
+
+	/*
+	 * Ask the dirty log what's next.
+	 */
+	r = rh->log->type->get_resync_work(rh->log, &region);
+	if (r <= 0)
+		return r;
+
+	/*
+	 * Get this region, and start it quiescing by setting the
+	 * recovering flag.
+	 */
+	read_lock(&rh->hash_lock);
+	reg = __rh_find(rh, region);
+	read_unlock(&rh->hash_lock);
+
+	spin_lock_irq(&rh->region_lock);
+	reg->state = RH_RECOVERING;
+
+	/*
+	 * Possibly already quiesced.  If io is still pending,
+	 * just take the region off the clean/dirty list;
+	 * rh_dec() will move it onto the quiesced list once the
+	 * last io completes.  Otherwise it can be queued for
+	 * recovery immediately.
+	 */
+	if (atomic_read(&reg->pending))
+		list_del_init(&reg->list);
+
+	else {
+		list_del_init(&reg->list);
+		list_add(&reg->list, &rh->quiesced_regions);
+	}
+	spin_unlock_irq(&rh->region_lock);
+
+	return 1;
+}
+
+static void rh_recovery_prepare(struct region_hash *rh)
+{
+	while (!down_trylock(&rh->recovery_count))
+		if (__rh_recovery_prepare(rh) <= 0) {
+			up(&rh->recovery_count);
+			break;
+		}
+}
+
+/*
+ * Returns any quiesced regions.
+ */
+static struct region *rh_recovery_start(struct region_hash *rh)
+{
+	struct region *reg = NULL;
+
+	spin_lock_irq(&rh->region_lock);
+	if (!list_empty(&rh->quiesced_regions)) {
+		reg = list_entry(rh->quiesced_regions.next,
+				 struct region, list);
+		list_del_init(&reg->list);	/* remove from the quiesced list */
+	}
+	spin_unlock_irq(&rh->region_lock);
+
+	return reg;
+}
+
+/* FIXME: success ignored for now */
+static void rh_recovery_end(struct region *reg, int success)
+{
+	struct region_hash *rh = reg->rh;
+
+	spin_lock_irq(&rh->region_lock);
+	list_add(&reg->list, &reg->rh->recovered_regions);
+	spin_unlock_irq(&rh->region_lock);
+
+	dm_daemon_wake(&_kmirrord);
+}
+
+static void rh_flush(struct region_hash *rh)
+{
+	rh->log->type->flush(rh->log);
+}
+
+static void rh_delay(struct region_hash *rh, struct bio *bio)
+{
+	struct region *reg;
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_find(rh, bio_to_region(rh, bio));
+	bio->bi_next = reg->delayed_bios;
+	reg->delayed_bios = bio;
+	read_unlock(&rh->hash_lock);
+}
+
+static void rh_stop_recovery(struct region_hash *rh)
+{
+	int i;
+
+	/* wait for any recovering regions */
+	for (i = 0; i < MAX_RECOVERY; i++)
+		down(&rh->recovery_count);
+}
+
+static void rh_start_recovery(struct region_hash *rh)
+{
+	int i;
+
+	for (i = 0; i < MAX_RECOVERY; i++)
+		up(&rh->recovery_count);
+
+	dm_daemon_wake(&_kmirrord);
+}
+
+/*-----------------------------------------------------------------
+ * Mirror set structures.
+ *---------------------------------------------------------------*/
+struct mirror {
+	atomic_t error_count;
+	struct dm_dev *dev;
+	sector_t offset;
+};
+
+struct mirror_set {
+	struct dm_target *ti;
+	struct list_head list;
+	struct region_hash rh;
+	struct kcopyd_client *kcopyd_client;
+
+	spinlock_t lock;	/* protects the next two lists */
+	struct bio_list reads;
+	struct bio_list writes;
+
+	/* recovery */
+	region_t nr_regions;
+	region_t sync_count;
+
+	unsigned int nr_mirrors;
+	struct mirror mirror[0];
+};
+
+/*
+ * Every mirror should look like this one.
+ */
+#define DEFAULT_MIRROR 0
+
+/*
+ * This is yucky.  We squirrel the mirror_set struct away inside
+ * bi_next for write buffers.  This is safe since the bio
+ * doesn't get submitted to the lower levels of the block layer.
+ */
+static struct mirror_set *bio_get_ms(struct bio *bio)
+{
+	return (struct mirror_set *) bio->bi_next;
+}
+
+static void bio_set_ms(struct bio *bio, struct mirror_set *ms)
+{
+	bio->bi_next = (struct bio *) ms;
+}
+
+/*-----------------------------------------------------------------
+ * Recovery.
+ *
+ * When a mirror is first activated we may find that some regions
+ * are in the no-sync state.  We have to recover these by
+ * recopying from the default mirror to all the others.
+ *---------------------------------------------------------------*/
+static void recovery_complete(int read_err, unsigned int write_err,
+			      void *context)
+{
+	struct region *reg = (struct region *) context;
+	struct mirror_set *ms = reg->rh->ms;
+
+	/* FIXME: better error handling */
+	rh_recovery_end(reg, read_err || write_err);
+	if (++ms->sync_count == ms->nr_regions)
+		/* the sync is complete */
+		dm_table_event(ms->ti->table);
+}
+
+static int recover(struct mirror_set *ms, struct region *reg)
+{
+	int r;
+	unsigned int i;
+	struct io_region from, to[ms->nr_mirrors - 1], *dest;
+	struct mirror *m;
+	unsigned long flags = 0;
+
+	/* fill in the source */
+	m = ms->mirror + DEFAULT_MIRROR;
+	from.bdev = m->dev->bdev;
+	from.sector = m->offset + region_to_sector(reg->rh, reg->key);
+	if (reg->key == (ms->nr_regions - 1)) {
+		/*
+		 * The final region may be smaller than
+		 * region_size.
+		 */
+		from.count = ms->ti->len & (reg->rh->region_size - 1);
+		if (!from.count)
+			from.count = reg->rh->region_size;
+	} else
+		from.count = reg->rh->region_size;
+
+	/* fill in the destinations */
+	for (i = 1; i < ms->nr_mirrors; i++) {
+		m = ms->mirror + i;
+		dest = to + (i - 1);
+
+		dest->bdev = m->dev->bdev;
+		dest->sector = m->offset + region_to_sector(reg->rh, reg->key);
+		dest->count = from.count;
+	}
+
+	/* hand to kcopyd */
+	set_bit(KCOPYD_IGNORE_ERROR, &flags);
+	r = kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags,
+			recovery_complete, reg);
+
+	return r;
+}
+
+static void do_recovery(struct mirror_set *ms)
+{
+	int r;
+	struct region *reg;
+
+	/*
+	 * Start quiescing some regions.
+	 */
+	rh_recovery_prepare(&ms->rh);
+
+	/*
+	 * Copy any already quiesced regions.
+	 */
+	while ((reg = rh_recovery_start(&ms->rh))) {
+		r = recover(ms, reg);
+		if (r)
+			rh_recovery_end(reg, 0);
+	}
+}
+
+/*-----------------------------------------------------------------
+ * Reads
+ *---------------------------------------------------------------*/
+static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
+{
+	/* FIXME: add read balancing */
+	return ms->mirror + DEFAULT_MIRROR;
+}
+
+/*
+ * remap a buffer to a particular mirror.
+ */
+static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio)
+{
+	bio->bi_bdev = m->dev->bdev;
+	bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin);
+}
+
+static void do_reads(struct mirror_set *ms, struct bio_list *reads)
+{
+	region_t region;
+	struct bio *bio;
+	struct mirror *m;
+
+	while ((bio = bio_list_pop(reads))) {
+		region = bio_to_region(&ms->rh, bio);
+
+		/*
+		 * We can only read balance if the region is in sync.
+		 */
+		if (rh_in_sync(&ms->rh, region, 0))
+			m = choose_mirror(ms, bio->bi_sector);
+		else
+			m = ms->mirror + DEFAULT_MIRROR;
+
+		map_bio(ms, m, bio);
+		generic_make_request(bio);
+	}
+}
+
+/*-----------------------------------------------------------------
+ * Writes.
+ *
+ * We do different things with the write io depending on the
+ * state of the region that it's in:
+ *
+ * SYNC: 	increment pending, use dm-io to write to *all* mirrors
+ * RECOVERING:	delay the io until recovery completes
+ * NOSYNC:	increment pending, just write to the default mirror
+ *---------------------------------------------------------------*/
+static void write_callback(unsigned long error, void *context)
+{
+	unsigned int i;
+	int uptodate = 1;
+	struct bio *bio = (struct bio *) context;
+	struct mirror_set *ms;
+
+	ms = bio_get_ms(bio);
+	bio_set_ms(bio, NULL);
+
+	/*
+	 * NOTE: We don't decrement the pending count here,
+	 * instead it is done by the targets endio function.
+	 * This way we handle both writes to SYNC and NOSYNC
+	 * regions with the same code.
+	 */
+
+	if (error) {
+		/*
+		 * only error the io if all mirrors failed.
+		 * FIXME: bogus
+		 */
+		uptodate = 0;
+		for (i = 0; i < ms->nr_mirrors; i++)
+			if (!test_bit(i, &error)) {
+				uptodate = 1;
+				break;
+			}
+	}
+	bio_endio(bio, bio->bi_size, uptodate ? 0 : -EIO);
+}
+
+static void do_write(struct mirror_set *ms, struct bio *bio)
+{
+	unsigned int i;
+	struct io_region io[ms->nr_mirrors];
+	struct mirror *m;
+
+	for (i = 0; i < ms->nr_mirrors; i++) {
+		m = ms->mirror + i;
+
+		io[i].bdev = m->dev->bdev;
+		io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin);
+		io[i].count = bio->bi_size >> 9;
+	}
+
+	bio_set_ms(bio, ms);
+	dm_io_async_bvec(ms->nr_mirrors, io, WRITE,
+			 bio->bi_io_vec + bio->bi_idx,
+			 write_callback, bio);
+}
+
+static void do_writes(struct mirror_set *ms, struct bio_list *writes)
+{
+	int state;
+	struct bio *bio;
+	struct bio_list sync, nosync, recover, *this_list = NULL;
+
+	if (!writes->head)
+		return;
+
+	/*
+	 * Classify each write.
+	 */
+	bio_list_init(&sync);
+	bio_list_init(&nosync);
+	bio_list_init(&recover);
+
+	while ((bio = bio_list_pop(writes))) {
+		state = rh_state(&ms->rh, bio_to_region(&ms->rh, bio), 1);
+		switch (state) {
+		case RH_CLEAN:
+		case RH_DIRTY:
+			this_list = &sync;
+			break;
+
+		case RH_NOSYNC:
+			this_list = &nosync;
+			break;
+
+		case RH_RECOVERING:
+			this_list = &recover;
+			break;
+		}
+
+		bio_list_add(this_list, bio);
+	}
+
+	/*
+	 * Increment the pending counts for any regions that will
+	 * be written to (writes to recover regions are going to
+	 * be delayed).
+	 */
+	rh_inc_pending(&ms->rh, &sync);
+	rh_inc_pending(&ms->rh, &nosync);
+	rh_flush(&ms->rh);
+
+	/*
+	 * Dispatch io.
+	 */
+	while ((bio = bio_list_pop(&sync)))
+		do_write(ms, bio);
+
+	while ((bio = bio_list_pop(&recover)))
+		rh_delay(&ms->rh, bio);
+
+	while ((bio = bio_list_pop(&nosync))) {
+		map_bio(ms, ms->mirror + DEFAULT_MIRROR, bio);
+		generic_make_request(bio);
+	}
+}
+
+/*-----------------------------------------------------------------
+ * kmirrord
+ *---------------------------------------------------------------*/
+static LIST_HEAD(_mirror_sets);
+static DECLARE_RWSEM(_mirror_sets_lock);
+
+static void do_mirror(struct mirror_set *ms)
+{
+	struct bio_list reads, writes;
+
+	spin_lock(&ms->lock);
+	memcpy(&reads, &ms->reads, sizeof(reads));
+	bio_list_init(&ms->reads);
+	memcpy(&writes, &ms->writes, sizeof(writes));
+	bio_list_init(&ms->writes);
+	spin_unlock(&ms->lock);
+
+	rh_update_states(&ms->rh);
+	do_recovery(ms);
+	do_reads(ms, &reads);
+	do_writes(ms, &writes);
+	blk_run_queues();
+}
+
+static void do_work(void)
+{
+	struct mirror_set *ms;
+
+	down_read(&_mirror_sets_lock);
+	list_for_each_entry (ms, &_mirror_sets, list)
+		do_mirror(ms);
+	up_read(&_mirror_sets_lock);
+}
+
+/*-----------------------------------------------------------------
+ * Target functions
+ *---------------------------------------------------------------*/
+static struct mirror_set *alloc_context(unsigned int nr_mirrors,
+					sector_t region_size,
+					struct dm_target *ti,
+					struct dirty_log *dl)
+{
+	size_t len;
+	struct mirror_set *ms = NULL;
+
+	if (array_too_big(sizeof(*ms), sizeof(ms->mirror[0]), nr_mirrors))
+		return NULL;
+
+	len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors);
+
+	ms = kmalloc(len, GFP_KERNEL);
+	if (!ms) {
+		ti->error = "dm-mirror: Cannot allocate mirror context";
+		return NULL;
+	}
+
+	memset(ms, 0, len);
+	spin_lock_init(&ms->lock);
+
+	ms->ti = ti;
+	ms->nr_mirrors = nr_mirrors;
+	ms->nr_regions = dm_div_up(ti->len, region_size);
+
+	if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
+		ti->error = "dm-mirror: Error creating dirty region hash";
+		kfree(ms);
+		return NULL;
+	}
+
+	return ms;
+}
+
+static void free_context(struct mirror_set *ms, struct dm_target *ti,
+			 unsigned int m)
+{
+	while (m--)
+		dm_put_device(ti, ms->mirror[m].dev);
+
+	rh_exit(&ms->rh);
+	kfree(ms);
+}
+
+static inline int _check_region_size(struct dm_target *ti, sector_t size)
+{
+	return !(size % (PAGE_SIZE >> 9) || (size & (size - 1)) ||
+		 size > ti->len);
+}
+
+static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
+		      unsigned int mirror, char **argv)
+{
+	sector_t offset;
+
+	if (sscanf(argv[1], SECTOR_FORMAT, &offset) != 1) {
+		ti->error = "dm-mirror: Invalid offset";
+		return -EINVAL;
+	}
+
+	if (dm_get_device(ti, argv[0], offset, ti->len,
+			  dm_table_get_mode(ti->table),
+			  &ms->mirror[mirror].dev)) {
+		ti->error = "dm-mirror: Device lookup failure";
+		return -ENXIO;
+	}
+
+	ms->mirror[mirror].offset = offset;
+
+	return 0;
+}
+
+static int add_mirror_set(struct mirror_set *ms)
+{
+	down_write(&_mirror_sets_lock);
+	list_add_tail(&ms->list, &_mirror_sets);
+	up_write(&_mirror_sets_lock);
+	dm_daemon_wake(&_kmirrord);
+
+	return 0;
+}
+
+static void del_mirror_set(struct mirror_set *ms)
+{
+	down_write(&_mirror_sets_lock);
+	list_del(&ms->list);
+	up_write(&_mirror_sets_lock);
+}
+
+/*
+ * Create dirty log: log_type #log_params <log_params>
+ */
+static struct dirty_log *create_dirty_log(struct dm_target *ti,
+					  unsigned int argc, char **argv,
+					  unsigned int *args_used)
+{
+	unsigned int param_count;
+	struct dirty_log *dl;
+
+	if (argc < 2) {
+		ti->error = "dm-mirror: Insufficient mirror log arguments";
+		return NULL;
+	}
+
+	if (sscanf(argv[1], "%u", &param_count) != 1 || param_count != 1) {
+		ti->error = "dm-mirror: Invalid mirror log argument count";
+		return NULL;
+	}
+
+	*args_used = 2 + param_count;
+
+	if (argc < *args_used) {
+		ti->error = "dm-mirror: Insufficient mirror log arguments";
+		return NULL;
+	}
+
+	dl = dm_create_dirty_log(argv[0], ti->len, param_count, argv + 2);
+	if (!dl) {
+		ti->error = "dm-mirror: Error creating mirror dirty log";
+		return NULL;
+	}
+
+	if (!_check_region_size(ti, dl->type->get_region_size(dl))) {
+		ti->error = "dm-mirror: Invalid region size";
+		dm_destroy_dirty_log(dl);
+		return NULL;
+	}
+
+	return dl;
+}
+
+/*
+ * Construct a mirror mapping:
+ *
+ * log_type #log_params <log_params>
+ * #mirrors [mirror_path offset]{2,}
+ *
+ * For now, #log_params = 1, log_type = "core"
+ *
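+ * For example (device names and sizes are illustrative only;
+ * the core log's single parameter is assumed to be the region
+ * size in sectors):
+ *
+ *   0 2097152 mirror core 1 1024 2 /dev/sda 0 /dev/sdb 0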
+ */
+#define DM_IO_PAGES 64
+static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	int r;
+	unsigned int nr_mirrors, m, args_used;
+	struct mirror_set *ms;
+	struct dirty_log *dl;
+
+	dl = create_dirty_log(ti, argc, argv, &args_used);
+	if (!dl)
+		return -EINVAL;
+
+	argv += args_used;
+	argc -= args_used;
+
+	if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 ||
+	    nr_mirrors < 2) {
+		ti->error = "dm-mirror: Invalid number of mirrors";
+		dm_destroy_dirty_log(dl);
+		return -EINVAL;
+	}
+
+	argv++, argc--;
+
+	if (argc != nr_mirrors * 2) {
+		ti->error = "dm-mirror: Wrong number of mirror arguments";
+		dm_destroy_dirty_log(dl);
+		return -EINVAL;
+	}
+
+	ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl);
+	if (!ms) {
+		dm_destroy_dirty_log(dl);
+		return -ENOMEM;
+	}
+
+	/* Get the mirror parameter sets */
+	for (m = 0; m < nr_mirrors; m++) {
+		r = get_mirror(ms, ti, m, argv);
+		if (r) {
+			free_context(ms, ti, m);
+			return r;
+		}
+		argv += 2;
+		argc -= 2;
+	}
+
+	ti->private = ms;
+
+	r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client);
+	if (r) {
+		free_context(ms, ti, ms->nr_mirrors);
+		return r;
+	}
+
+	add_mirror_set(ms);
+	return 0;
+}
+
+static void mirror_dtr(struct dm_target *ti)
+{
+	struct mirror_set *ms = (struct mirror_set *) ti->private;
+
+	del_mirror_set(ms);
+	kcopyd_client_destroy(ms->kcopyd_client);
+	free_context(ms, ti, ms->nr_mirrors);
+}
+
+static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
+{
+	int wake = 0;
+	struct bio_list *bl;
+
+	bl = (rw == WRITE) ? &ms->writes : &ms->reads;
+	spin_lock(&ms->lock);
+	wake = !(bl->head);
+	bio_list_add(bl, bio);
+	spin_unlock(&ms->lock);
+
+	if (wake)
+		dm_daemon_wake(&_kmirrord);
+}
+
+/*
+ * Mirror mapping function
+ */
+static int mirror_map(struct dm_target *ti, struct bio *bio,
+		      union map_info *map_context)
+{
+	int r, rw = bio_rw(bio);
+	struct mirror *m;
+	struct mirror_set *ms = ti->private;
+
+	map_context->ll = bio->bi_sector >> ms->rh.region_shift;
+
+	if (rw == WRITE) {
+		queue_bio(ms, bio, rw);
+		return 0;
+	}
+
+	r = ms->rh.log->type->in_sync(ms->rh.log,
+				      bio_to_region(&ms->rh, bio), 0);
+	if (r < 0 && r != -EWOULDBLOCK)
+		return r;
+
+	if (r == -EWOULDBLOCK)	/* FIXME: ugly */
+		r = 0;
+
+	/*
+	 * We don't want to fast track a recovery just for a read
+	 * ahead.  So we just let it silently fail.
+	 * FIXME: get rid of this.
+	 */
+	if (!r && rw == READA)
+		return -EIO;
+
+	if (!r) {
+		/* Pass this io over to the daemon */
+		queue_bio(ms, bio, rw);
+		return 0;
+	}
+
+	m = choose_mirror(ms, bio->bi_sector);
+	if (!m)
+		return -EIO;
+
+	map_bio(ms, m, bio);
+	return 1;
+}
+
+static int mirror_end_io(struct dm_target *ti, struct bio *bio,
+			 int error, union map_info *map_context)
+{
+	int rw = bio_rw(bio);
+	struct mirror_set *ms = (struct mirror_set *) ti->private;
+	region_t region = map_context->ll;
+
+	/*
+	 * We need to dec pending if this was a write.
+	 */
+	if (rw == WRITE)
+		rh_dec(&ms->rh, region);
+
+	return 0;
+}
+
+static void mirror_suspend(struct dm_target *ti)
+{
+	struct mirror_set *ms = (struct mirror_set *) ti->private;
+	rh_stop_recovery(&ms->rh);
+}
+
+static void mirror_resume(struct dm_target *ti)
+{
+	struct mirror_set *ms = (struct mirror_set *) ti->private;
+	rh_start_recovery(&ms->rh);
+}
+
+static int mirror_status(struct dm_target *ti, status_type_t type,
+			 char *result, unsigned int maxlen)
+{
+	char buffer[32];
+	unsigned int m, sz = 0;
+	struct mirror_set *ms = (struct mirror_set *) ti->private;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		sz += snprintf(result + sz, maxlen - sz, "%d ", ms->nr_mirrors);
+
+		for (m = 0; m < ms->nr_mirrors; m++) {
+			format_dev_t(buffer, ms->mirror[m].dev->bdev->bd_dev);
+			sz += snprintf(result + sz, maxlen - sz, "%s ", buffer);
+		}
+
+		sz += snprintf(result + sz, maxlen - sz,
+			       SECTOR_FORMAT "/" SECTOR_FORMAT,
+			       ms->sync_count, ms->nr_regions);
+		break;
+
+	case STATUSTYPE_TABLE:
+		sz += snprintf(result + sz, maxlen - sz,
+			       "%s 1 " SECTOR_FORMAT " %d ",
+			       ms->rh.log->type->name, ms->rh.region_size,
+			       ms->nr_mirrors);
+
+		for (m = 0; m < ms->nr_mirrors; m++) {
+			format_dev_t(buffer, ms->mirror[m].dev->bdev->bd_dev);
+			sz += snprintf(result + sz, maxlen - sz,
+				       "%s " SECTOR_FORMAT " ",
+				       buffer, ms->mirror[m].offset);
+		}
+	}
+
+	return 0;
+}
+
+static struct target_type mirror_target = {
+	.name	 = "mirror",
+	.module	 = THIS_MODULE,
+	.ctr	 = mirror_ctr,
+	.dtr	 = mirror_dtr,
+	.map	 = mirror_map,
+	.end_io	 = mirror_end_io,
+	.suspend = mirror_suspend,
+	.resume	 = mirror_resume,
+	.status	 = mirror_status,
+};
+
+static int __init dm_mirror_init(void)
+{
+	int r;
+
+	r = dm_dirty_log_init();
+	if (r)
+		return r;
+
+	r = dm_daemon_start(&_kmirrord, "kmirrord", do_work);
+	if (r) {
+		DMERR("couldn't start kmirrord");
+		dm_dirty_log_exit();
+		return r;
+	}
+
+	r = dm_register_target(&mirror_target);
+	if (r < 0) {
+		DMERR("%s: Failed to register mirror target",
+		      mirror_target.name);
+		dm_dirty_log_exit();
+		dm_daemon_stop(&_kmirrord);
+	}
+
+	return r;
+}
+
+static void __exit dm_mirror_exit(void)
+{
+	int r;
+
+	r = dm_unregister_target(&mirror_target);
+	if (r < 0)
+		DMERR("%s: unregister failed %d", mirror_target.name, r);
+
+	dm_daemon_stop(&_kmirrord);
+	dm_dirty_log_exit();
+}
+
+/* Module hooks */
+module_init(dm_mirror_init);
+module_exit(dm_mirror_exit);
+
+MODULE_DESCRIPTION(DM_NAME " mirror target");
+MODULE_AUTHOR("Joe Thornber");
+MODULE_LICENSE("GPL");
--- diff/drivers/md/dm-snapshot.c	1970-01-01 01:00:00.000000000 +0100
+++ source/drivers/md/dm-snapshot.c	2003-11-26 10:18:32.000000000 +0000
@@ -0,0 +1,1269 @@
+/*
+ * dm-snapshot.c
+ *
+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/config.h>
+#include <linux/ctype.h>
+#include <linux/device-mapper.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kdev_t.h>
+#include <linux/list.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "dm-snapshot.h"
+#include "kcopyd.h"
+
+/*
+ * FIXME: Remove this before release.
+ */
+#if 0
+#define DMDEBUG DMWARN
+#else
+#define DMDEBUG(x...)
+#endif
+
+/*
+ * The percentage increment we will wake up users at
+ */
+#define WAKE_UP_PERCENT 5
+
+/*
+ * kcopyd priority of snapshot operations
+ */
+#define SNAPSHOT_COPY_PRIORITY 2
+
+/*
+ * Each snapshot reserves this many pages for io
+ * FIXME: calculate this
+ */
+#define SNAPSHOT_PAGES 256
+
+struct pending_exception {
+	struct exception e;
+
+	/*
+	 * Origin buffers waiting for this to complete are held
+	 * in a list (using bi_next).
+	 */
+	struct bio *origin_bios;
+	struct bio *snapshot_bios;
+
+	/*
+	 * Other pending_exceptions that are processing this
+	 * chunk.  When this list is empty, we know we can
+	 * complete the origins.
+	 */
+	struct list_head siblings;
+
+	/* Pointer back to snapshot context */
+	struct dm_snapshot *snap;
+
+	/*
+	 * 1 indicates the exception has already been sent to
+	 * kcopyd.
+	 */
+	int started;
+};
+
+/*
+ * Hash table mapping origin volumes to lists of snapshots and
+ * a lock to protect it
+ */
+static kmem_cache_t *exception_cache;
+static kmem_cache_t *pending_cache;
+static mempool_t *pending_pool;
+
+/*
+ * One of these per registered origin, held in the snapshot_origins hash
+ */
+struct origin {
+	/* The origin device */
+	struct block_device *bdev;
+
+	struct list_head hash_list;
+
+	/* List of snapshots for this origin */
+	struct list_head snapshots;
+};
+
+/*
+ * Size of the hash table for origin volumes. If we make this
+ * the size of the minors list then it should be nearly perfect
+ */
+#define ORIGIN_HASH_SIZE 256
+#define ORIGIN_MASK      0xFF
+static struct list_head *_origins;
+static struct rw_semaphore _origins_lock;
+
+static int init_origin_hash(void)
+{
+	int i;
+
+	_origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
+			   GFP_KERNEL);
+	if (!_origins) {
+		DMERR("Device mapper: Snapshot: unable to allocate memory");
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < ORIGIN_HASH_SIZE; i++)
+		INIT_LIST_HEAD(_origins + i);
+	init_rwsem(&_origins_lock);
+
+	return 0;
+}
+
+static void exit_origin_hash(void)
+{
+	kfree(_origins);
+}
+
+static inline unsigned int origin_hash(struct block_device *bdev)
+{
+	return bdev->bd_dev & ORIGIN_MASK;
+}
+
+static struct origin *__lookup_origin(struct block_device *origin)
+{
+	struct list_head *slist;
+	struct list_head *ol;
+	struct origin *o;
+
+	ol = &_origins[origin_hash(origin)];
+	list_for_each(slist, ol) {
+		o = list_entry(slist, struct origin, hash_list);
+
+		if (bdev_equal(o->bdev, origin))
+			return o;
+	}
+
+	return NULL;
+}
+
+static void __insert_origin(struct origin *o)
+{
+	struct list_head *sl = &_origins[origin_hash(o->bdev)];
+	list_add_tail(&o->hash_list, sl);
+}
+
+/*
+ * Make a note of the snapshot and its origin so we can look it
+ * up when the origin has a write on it.
+ */
+static int register_snapshot(struct dm_snapshot *snap)
+{
+	struct origin *o;
+	struct block_device *bdev = snap->origin->bdev;
+
+	down_write(&_origins_lock);
+	o = __lookup_origin(bdev);
+
+	if (!o) {
+		/* New origin */
+		o = kmalloc(sizeof(*o), GFP_KERNEL);
+		if (!o) {
+			up_write(&_origins_lock);
+			return -ENOMEM;
+		}
+
+		/* Initialise the struct */
+		INIT_LIST_HEAD(&o->snapshots);
+		o->bdev = bdev;
+
+		__insert_origin(o);
+	}
+
+	list_add_tail(&snap->list, &o->snapshots);
+
+	up_write(&_origins_lock);
+	return 0;
+}
+
+static void unregister_snapshot(struct dm_snapshot *s)
+{
+	struct origin *o;
+
+	down_write(&_origins_lock);
+	o = __lookup_origin(s->origin->bdev);
+
+	list_del(&s->list);
+	if (list_empty(&o->snapshots)) {
+		list_del(&o->hash_list);
+		kfree(o);
+	}
+
+	up_write(&_origins_lock);
+}
+
+/*
+ * Implementation of the exception hash tables.
+ */
+static int init_exception_table(struct exception_table *et, uint32_t size)
+{
+	unsigned int i;
+
+	et->hash_mask = size - 1;
+	et->table = dm_vcalloc(size, sizeof(struct list_head));
+	if (!et->table)
+		return -ENOMEM;
+
+	for (i = 0; i < size; i++)
+		INIT_LIST_HEAD(et->table + i);
+
+	return 0;
+}
+
+static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem)
+{
+	struct list_head *slot, *entry, *temp;
+	struct exception *ex;
+	int i, size;
+
+	size = et->hash_mask + 1;
+	for (i = 0; i < size; i++) {
+		slot = et->table + i;
+
+		list_for_each_safe(entry, temp, slot) {
+			ex = list_entry(entry, struct exception, hash_list);
+			kmem_cache_free(mem, ex);
+		}
+	}
+
+	vfree(et->table);
+}
+
+/*
+ * FIXME: check how this hash fn is performing.
+ */
+static inline uint32_t exception_hash(struct exception_table *et, chunk_t chunk)
+{
+	return chunk & et->hash_mask;
+}
+
+static void insert_exception(struct exception_table *eh, struct exception *e)
+{
+	struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)];
+	list_add(&e->hash_list, l);
+}
+
+static inline void remove_exception(struct exception *e)
+{
+	list_del(&e->hash_list);
+}
+
+/*
+ * Return the exception data for a sector, or NULL if not
+ * remapped.
+ */
+static struct exception *lookup_exception(struct exception_table *et,
+					  chunk_t chunk)
+{
+	struct list_head *slot, *el;
+	struct exception *e;
+
+	slot = &et->table[exception_hash(et, chunk)];
+	list_for_each(el, slot) {
+		e = list_entry(el, struct exception, hash_list);
+		if (e->old_chunk == chunk)
+			return e;
+	}
+
+	return NULL;
+}
+
+static inline struct exception *alloc_exception(void)
+{
+	struct exception *e;
+
+	e = kmem_cache_alloc(exception_cache, GFP_NOIO);
+	if (!e)
+		e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
+
+	return e;
+}
+
+static inline void free_exception(struct exception *e)
+{
+	kmem_cache_free(exception_cache, e);
+}
+
+static inline struct pending_exception *alloc_pending_exception(void)
+{
+	return mempool_alloc(pending_pool, GFP_NOIO);
+}
+
+static inline void free_pending_exception(struct pending_exception *pe)
+{
+	mempool_free(pe, pending_pool);
+}
+
+int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new)
+{
+	struct exception *e;
+
+	e = alloc_exception();
+	if (!e)
+		return -ENOMEM;
+
+	e->old_chunk = old;
+	e->new_chunk = new;
+	insert_exception(&s->complete, e);
+	return 0;
+}
+
+/*
+ * Hard coded magic: let the exception hash bucket array use
+ * up to 1/50th (2%) of memory.
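+ * For example (figures illustrative only): with 128MB of
+ * memory and 8-byte list heads this allows roughly 335,000
+ * buckets.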
+ */
+static int calc_max_buckets(void)
+{
+	unsigned long mem;
+
+	mem = num_physpages << PAGE_SHIFT;
+	mem /= 50;
+	mem /= sizeof(struct list_head);
+
+	return mem;
+}
+
+/*
+ * Rounds a number down to a power of 2.
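+ * e.g. round_down(100) == 64; powers of two pass through
+ * unchanged.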
+ */
+static inline uint32_t round_down(uint32_t n)
+{
+	while (n & (n - 1))
+		n &= (n - 1);
+	return n;
+}
+
+/*
+ * Allocate room for a suitable hash table.
+ */
+static int init_hash_tables(struct dm_snapshot *s)
+{
+	sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets;
+
+	/*
+	 * Calculate based on the size of the original volume or
+	 * the COW volume...
+	 */
+	cow_dev_size = get_dev_size(s->cow->bdev);
+	origin_dev_size = get_dev_size(s->origin->bdev);
+	max_buckets = calc_max_buckets();
+
+	hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift;
+	hash_size = min(hash_size, max_buckets);
+
+	/* Round it down to a power of 2 */
+	hash_size = round_down(hash_size);
+	if (init_exception_table(&s->complete, hash_size))
+		return -ENOMEM;
+
+	/*
+	 * Allocate hash table for in-flight exceptions
+	 * Make this smaller than the real hash table
+	 */
+	hash_size >>= 3;
+	if (!hash_size)
+		hash_size = 64;
+
+	if (init_exception_table(&s->pending, hash_size)) {
+		exit_exception_table(&s->complete, exception_cache);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Round a number up to the nearest 'size' boundary.  size must
+ * be a power of 2.
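+ * e.g. round_up(5, 8) == 8 and round_up(16, 8) == 16.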
+ */
+static inline ulong round_up(ulong n, ulong size)
+{
+	size--;
+	return (n + size) & ~size;
+}
+
+/*
+ * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
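+ *
+ * For example (devices illustrative only), a persistent
+ * snapshot with 16-sector chunks:
+ *
+ *   0 2097152 snapshot /dev/vg0/lvol0 /dev/vg0/cow P 16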
+ */
+static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct dm_snapshot *s;
+	unsigned long chunk_size;
+	int r = -EINVAL;
+	char persistent;
+	char *origin_path;
+	char *cow_path;
+	char *value;
+	int blocksize;
+
+	if (argc != 4) {
+		ti->error = "dm-snapshot: requires exactly 4 arguments";
+		r = -EINVAL;
+		goto bad1;
+	}
+
+	origin_path = argv[0];
+	cow_path = argv[1];
+	persistent = toupper(*argv[2]);
+
+	if (persistent != 'P' && persistent != 'N') {
+		ti->error = "Persistent flag is not P or N";
+		r = -EINVAL;
+		goto bad1;
+	}
+
+	chunk_size = simple_strtoul(argv[3], &value, 10);
+	if (chunk_size == 0 || *value != '\0') {
+		ti->error = "Invalid chunk size";
+		r = -EINVAL;
+		goto bad1;
+	}
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (s == NULL) {
+		ti->error = "Cannot allocate snapshot context private "
+		    "structure";
+		r = -ENOMEM;
+		goto bad1;
+	}
+
+	r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin);
+	if (r) {
+		ti->error = "Cannot get origin device";
+		goto bad2;
+	}
+
+	/* FIXME: get cow length */
+	r = dm_get_device(ti, cow_path, 0, 0,
+			  FMODE_READ | FMODE_WRITE, &s->cow);
+	if (r) {
+		dm_put_device(ti, s->origin);
+		ti->error = "Cannot get COW device";
+		goto bad2;
+	}
+
+	/*
+	 * Chunk size must be multiple of page size.  Silently
+	 * round up if it's not.
+	 */
+	chunk_size = round_up(chunk_size, PAGE_SIZE >> 9);
+
+	/* Validate the chunk size against the device block size */
+	/* FIXME: check this, also ugly */
+	blocksize = s->cow->bdev->bd_disk->queue->hardsect_size;
+	if (chunk_size % (blocksize >> 9)) {
+		ti->error = "Chunk size is not a multiple of device blocksize";
+		r = -EINVAL;
+		goto bad3;
+	}
+
+	/* Check chunk_size is a power of 2 */
+	if (chunk_size & (chunk_size - 1)) {
+		ti->error = "Chunk size is not a power of 2";
+		r = -EINVAL;
+		goto bad3;
+	}
+
+	s->chunk_size = chunk_size;
+	s->chunk_mask = chunk_size - 1;
+	s->type = persistent;
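+	/*
+	 * Work out chunk_shift = log2(chunk_size); e.g. a
+	 * 16-sector chunk gives a shift of 4.
+	 */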
+	for (s->chunk_shift = 0; chunk_size;
+	     s->chunk_shift++, chunk_size >>= 1)
+		;
+	s->chunk_shift--;
+
+	s->valid = 1;
+	s->have_metadata = 0;
+	s->last_percent = 0;
+	init_rwsem(&s->lock);
+	s->table = ti->table;
+
+	/* Allocate hash table for COW data */
+	if (init_hash_tables(s)) {
+		ti->error = "Unable to allocate hash table space";
+		r = -ENOMEM;
+		goto bad3;
+	}
+
+	/*
+	 * Check the persistent flag - done here because we need
+	 * the exception store set up to check the on-disk header.
+	 */
+	s->store.snap = s;
+
+	if (persistent == 'P')
+		r = dm_create_persistent(&s->store, s->chunk_size);
+	else
+		r = dm_create_transient(&s->store, s, blocksize);
+
+	if (r) {
+		ti->error = "Couldn't create exception store";
+		r = -EINVAL;
+		goto bad4;
+	}
+
+	r = kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client);
+	if (r) {
+		ti->error = "Could not create kcopyd client";
+		goto bad5;
+	}
+
+	/* Add snapshot to the list of snapshots for this origin */
+	if (register_snapshot(s)) {
+		r = -EINVAL;
+		ti->error = "Cannot register snapshot origin";
+		goto bad6;
+	}
+
+	ti->private = s;
+	return 0;
+
+ bad6:
+	kcopyd_client_destroy(s->kcopyd_client);
+
+ bad5:
+	s->store.destroy(&s->store);
+
+ bad4:
+	exit_exception_table(&s->pending, pending_cache);
+	exit_exception_table(&s->complete, exception_cache);
+
+ bad3:
+	dm_put_device(ti, s->cow);
+	dm_put_device(ti, s->origin);
+
+ bad2:
+	kfree(s);
+
+ bad1:
+	return r;
+}
+
+static void snapshot_dtr(struct dm_target *ti)
+{
+	struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
+
+	dm_table_event(ti->table);
+
+	unregister_snapshot(s);
+
+	exit_exception_table(&s->pending, pending_cache);
+	exit_exception_table(&s->complete, exception_cache);
+
+	/* Deallocate memory used */
+	s->store.destroy(&s->store);
+
+	dm_put_device(ti, s->origin);
+	dm_put_device(ti, s->cow);
+	kcopyd_client_destroy(s->kcopyd_client);
+	kfree(s);
+}
+
+/*
+ * We hold lists of bios, using the bi_next field.
+ */
+static void queue_bio(struct bio **queue, struct bio *bio)
+{
+	bio->bi_next = *queue;
+	*queue = bio;
+}
+
+/*
+ * FIXME: inefficient.
+ */
+static void queue_bios(struct bio **queue, struct bio *bios)
+{
+	while (*queue)
+		queue = &((*queue)->bi_next);
+
+	*queue = bios;
+}
+
+/*
+ * Flush a list of buffers.
+ */
+static void flush_bios(struct bio *bio)
+{
+	struct bio *n;
+
+	DMDEBUG("begin flush");
+	while (bio) {
+		n = bio->bi_next;
+		bio->bi_next = NULL;
+		DMDEBUG("flushing %p", bio);
+		generic_make_request(bio);
+		bio = n;
+	}
+
+	blk_run_queues();
+}
+
+/*
+ * Error a list of buffers.
+ */
+static void error_bios(struct bio *bio)
+{
+	struct bio *n;
+
+	while (bio) {
+		n = bio->bi_next;
+		bio->bi_next = NULL;
+		bio_io_error(bio, bio->bi_size);
+		bio = n;
+	}
+}
+
+static struct bio *__flush_bios(struct pending_exception *pe)
+{
+	struct pending_exception *sibling;
+
+	if (list_empty(&pe->siblings))
+		return pe->origin_bios;
+
+	sibling = list_entry(pe->siblings.next,
+			     struct pending_exception, siblings);
+
+	list_del(&pe->siblings);
+
+	/* FIXME: I think there's a race on SMP machines here, add spin lock */
+	queue_bios(&sibling->origin_bios, pe->origin_bios);
+
+	return NULL;
+}
+
+static void check_free_space(struct dm_snapshot *s)
+{
+#if 0
+	sector_t numerator, denominator;
+	unsigned pc;
+
+	if (!s->store.fraction_full)
+		return;
+
+	/*
+	 * Integer arithmetic only; floating point is not
+	 * available in the kernel.
+	 */
+	s->store.fraction_full(&s->store, &numerator, &denominator);
+	pc = numerator * 100 / denominator;
+
+	if (pc >= s->last_percent + WAKE_UP_PERCENT) {
+		dm_table_event(s->table);
+		s->last_percent = pc - pc % WAKE_UP_PERCENT;
+	}
+#endif
+}
+
+static void pending_complete(struct pending_exception *pe, int success)
+{
+	struct exception *e;
+	struct dm_snapshot *s = pe->snap;
+	struct bio *flush = NULL;
+
+	if (success) {
+		e = alloc_exception();
+		if (!e) {
+			DMWARN("Unable to allocate exception.");
+			down_write(&s->lock);
+			s->store.drop_snapshot(&s->store);
+			s->valid = 0;
+			flush = __flush_bios(pe);
+			up_write(&s->lock);
+
+			error_bios(pe->snapshot_bios);
+			goto out;
+		}
+		memcpy(e, &pe->e, sizeof(*e));
+
+		/*
+		 * Add a proper exception, and remove the
+		 * in-flight exception from the list.
+		 */
+		down_write(&s->lock);
+		insert_exception(&s->complete, e);
+		remove_exception(&pe->e);
+		flush = __flush_bios(pe);
+
+		/* Submit any pending write bios */
+		up_write(&s->lock);
+
+		flush_bios(pe->snapshot_bios);
+		DMDEBUG("Exception completed successfully.");
+
+		/* Notify any interested parties */
+		//check_free_space(s);
+
+	} else {
+		/* Read/write error - snapshot is unusable */
+		down_write(&s->lock);
+		if (s->valid)
+			DMERR("Error reading/writing snapshot");
+		s->store.drop_snapshot(&s->store);
+		s->valid = 0;
+		remove_exception(&pe->e);
+		flush = __flush_bios(pe);
+		up_write(&s->lock);
+
+		error_bios(pe->snapshot_bios);
+
+		dm_table_event(s->table);
+		DMDEBUG("Exception failed.");
+	}
+
+ out:
+	free_pending_exception(pe);
+
+	if (flush)
+		flush_bios(flush);
+}
+
+static void commit_callback(void *context, int success)
+{
+	struct pending_exception *pe = (struct pending_exception *) context;
+	pending_complete(pe, success);
+}
+
+/*
+ * Called when the copy I/O has finished.  kcopyd actually runs
+ * this code so don't block.
+ */
+static void copy_callback(int read_err, unsigned int write_err, void *context)
+{
+	struct pending_exception *pe = (struct pending_exception *) context;
+	struct dm_snapshot *s = pe->snap;
+
+	if (read_err || write_err)
+		pending_complete(pe, 0);
+
+	else
+		/* Update the metadata if we are persistent */
+		s->store.commit_exception(&s->store, &pe->e, commit_callback,
+					  pe);
+}
+
+/*
+ * Dispatches the copy operation to kcopyd.
+ */
+static inline void start_copy(struct pending_exception *pe)
+{
+	struct dm_snapshot *s = pe->snap;
+	struct io_region src, dest;
+	struct block_device *bdev = s->origin->bdev;
+	sector_t dev_size;
+
+	dev_size = get_dev_size(bdev);
+
+	src.bdev = bdev;
+	src.sector = chunk_to_sector(s, pe->e.old_chunk);
+	src.count = min(s->chunk_size, dev_size - src.sector);
+
+	dest.bdev = s->cow->bdev;
+	dest.sector = chunk_to_sector(s, pe->e.new_chunk);
+	dest.count = src.count;
+
+	/* Hand over to kcopyd */
+	DMDEBUG("starting exception copy");
+	kcopyd_copy(s->kcopyd_client,
+		    &src, 1, &dest, 0, copy_callback, pe);
+}
+
+/*
+ * Looks to see if this snapshot already has a pending exception
+ * for this chunk, otherwise it allocates a new one and inserts
+ * it into the pending table.
+ *
+ * NOTE: a write lock must be held on snap->lock before calling
+ * this.
+ */
+static struct pending_exception *
+__find_pending_exception(struct dm_snapshot *s, struct bio *bio)
+{
+	struct exception *e;
+	struct pending_exception *pe;
+	chunk_t chunk = sector_to_chunk(s, bio->bi_sector);
+
+	/*
+	 * Is there a pending exception for this already ?
+	 */
+	e = lookup_exception(&s->pending, chunk);
+	if (e) {
+		/* cast the exception to a pending exception */
+		pe = list_entry(e, struct pending_exception, e);
+
+	} else {
+		/*
+		 * Create a new pending exception; we don't want
+		 * to hold the lock while we do this.
+		 */
+		up_write(&s->lock);
+
+		pe = alloc_pending_exception();
+		pe->e.old_chunk = chunk;
+		pe->origin_bios = pe->snapshot_bios = NULL;
+		INIT_LIST_HEAD(&pe->siblings);
+		pe->snap = s;
+		pe->started = 0;
+
+		down_write(&s->lock);
+
+		/*
+		 * Another writer may have inserted a pending
+		 * exception for this chunk while the lock was
+		 * dropped; if so, use theirs and discard ours.
+		 */
+		e = lookup_exception(&s->pending, chunk);
+		if (e) {
+			free_pending_exception(pe);
+			pe = list_entry(e, struct pending_exception, e);
+		} else if (s->store.prepare_exception(&s->store, &pe->e)) {
+			free_pending_exception(pe);
+			s->valid = 0;
+			return NULL;
+		} else
+			insert_exception(&s->pending, &pe->e);
+	}
+
+	return pe;
+}
+
+static inline void remap_exception(struct dm_snapshot *s, struct exception *e,
+				   struct bio *bio)
+{
+	bio->bi_bdev = s->cow->bdev;
+	bio->bi_sector = chunk_to_sector(s, e->new_chunk) +
+		(bio->bi_sector & s->chunk_mask);
+}
+
+static int snapshot_map(struct dm_target *ti, struct bio *bio,
+			union map_info *map_context)
+{
+	struct exception *e;
+	struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
+	int r = 1;
+	chunk_t chunk;
+	struct pending_exception *pe;
+
+	chunk = sector_to_chunk(s, bio->bi_sector);
+
+	/* Full snapshots are not usable */
+	if (!s->valid)
+		return -EIO;
+
+	/*
+	 * Write to snapshot - higher level takes care of RW/RO
+	 * flags so we should only get this if we are
+	 * writeable.
+	 */
+	if (bio_rw(bio) == WRITE) {
+
+		/* FIXME: should only take write lock if we need
+		 * to copy an exception */
+		down_write(&s->lock);
+
+		/* If the block is already remapped - use that, else remap it */
+		e = lookup_exception(&s->complete, chunk);
+		if (e) {
+			remap_exception(s, e, bio);
+			up_write(&s->lock);
+
+		} else {
+			pe = __find_pending_exception(s, bio);
+
+			if (!pe) {
+				s->store.drop_snapshot(&s->store);
+				s->valid = 0;
+				r = -EIO;
+				up_write(&s->lock);
+			} else {
+				remap_exception(s, &pe->e, bio);
+				queue_bio(&pe->snapshot_bios, bio);
+
+				if (!pe->started) {
+					/* this is protected by snap->lock */
+					pe->started = 1;
+					up_write(&s->lock);
+					start_copy(pe);
+				} else
+					up_write(&s->lock);
+				r = 0;
+			}
+		}
+
+	} else {
+		/*
+		 * FIXME: this read path scares me because we
+		 * always use the origin when we have a pending
+		 * exception.  However I can't think of a
+		 * situation where this is wrong - ejt.
+		 */
+
+		/* Do reads */
+		down_read(&s->lock);
+
+		/* See if it has been remapped */
+		e = lookup_exception(&s->complete, chunk);
+		if (e)
+			remap_exception(s, e, bio);
+		else
+			bio->bi_bdev = s->origin->bdev;
+
+		up_read(&s->lock);
+	}
+
+	return r;
+}
+
+static void snapshot_resume(struct dm_target *ti)
+{
+	struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
+
+	if (s->have_metadata)
+		return;
+
+	if (s->store.read_metadata(&s->store)) {
+		down_write(&s->lock);
+		s->valid = 0;
+		up_write(&s->lock);
+	}
+
+	s->have_metadata = 1;
+}
+
+static int snapshot_status(struct dm_target *ti, status_type_t type,
+			   char *result, unsigned int maxlen)
+{
+	struct dm_snapshot *snap = (struct dm_snapshot *) ti->private;
+	char cow[32];
+	char org[32];
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		if (!snap->valid)
+			snprintf(result, maxlen, "Invalid");
+		else {
+			if (snap->store.fraction_full) {
+				sector_t numerator, denominator;
+				snap->store.fraction_full(&snap->store,
+							  &numerator,
+							  &denominator);
+				snprintf(result, maxlen,
+					 SECTOR_FORMAT "/" SECTOR_FORMAT,
+					 numerator, denominator);
+			}
+			else
+				snprintf(result, maxlen, "Unknown");
+		}
+		break;
+
+	case STATUSTYPE_TABLE:
+		/*
+		 * format_dev_t() needs a buffer per device, so
+		 * make private copies of both names.
+		 */
+		format_dev_t(cow, snap->cow->bdev->bd_dev);
+		format_dev_t(org, snap->origin->bdev->bd_dev);
+		snprintf(result, maxlen, "%s %s %c %lld", org, cow,
+			 snap->type, snap->chunk_size);
+		break;
+	}
+
+	return 0;
+}
+
+/*-----------------------------------------------------------------
+ * Origin methods
+ *---------------------------------------------------------------*/
+static void list_merge(struct list_head *l1, struct list_head *l2)
+{
+	struct list_head *l1_n, *l2_p;
+
+	l1_n = l1->next;
+	l2_p = l2->prev;
+
+	l1->next = l2;
+	l2->prev = l1;
+
+	l2_p->next = l1_n;
+	l1_n->prev = l2_p;
+}
+
+static int __origin_write(struct list_head *snapshots, struct bio *bio)
+{
+	int r = 1, first = 1;
+	struct list_head *sl;
+	struct dm_snapshot *snap;
+	struct exception *e;
+	struct pending_exception *pe, *last = NULL;
+	chunk_t chunk;
+
+	/* Do all the snapshots on this origin */
+	list_for_each(sl, snapshots) {
+		snap = list_entry(sl, struct dm_snapshot, list);
+
+		/* Only deal with valid snapshots */
+		if (!snap->valid)
+			continue;
+
+		down_write(&snap->lock);
+
+		/*
+		 * Remember, different snapshots can have
+		 * different chunk sizes.
+		 */
+		chunk = sector_to_chunk(snap, bio->bi_sector);
+
+		/*
+		 * Check exception table to see if block
+		 * is already remapped in this snapshot
+		 * and trigger an exception if not.
+		 */
+		e = lookup_exception(&snap->complete, chunk);
+		if (!e) {
+			pe = __find_pending_exception(snap, bio);
+			if (!pe) {
+				snap->store.drop_snapshot(&snap->store);
+				snap->valid = 0;
+
+			} else {
+				if (last)
+					list_merge(&pe->siblings,
+						   &last->siblings);
+
+				last = pe;
+				r = 0;
+			}
+		}
+
+		up_write(&snap->lock);
+	}
+
+	/*
+	 * Now that we have a complete pe list we can start the copying.
+	 */
+	if (last) {
+		pe = last;
+		do {
+			down_write(&pe->snap->lock);
+			if (first)
+				queue_bio(&pe->origin_bios, bio);
+
+#if 0
+			if (!pe->started) {
+				pe->started = 1;
+				up_write(&pe->snap->lock);
+				start_copy(pe);
+			} else
+				up_write(&pe->snap->lock);
+#else
+			pe->started = 1;
+			up_write(&pe->snap->lock);
+			start_copy(pe);
+#endif
+			first = 0;
+			pe = list_entry(pe->siblings.next,
+					struct pending_exception, siblings);
+
+		} while (pe != last);
+	}
+
+	return r;
+}
+
+/*
+ * Called on a write from the origin driver.
+ */
+int do_origin(struct dm_dev *origin, struct bio *bio)
+{
+	struct origin *o;
+	int r;
+
+	down_read(&_origins_lock);
+	o = __lookup_origin(origin->bdev);
+	if (!o)
+		BUG();
+
+	r = __origin_write(&o->snapshots, bio);
+	up_read(&_origins_lock);
+
+	return r;
+}
+
+/*
+ * Origin: maps a linear range of a device, with hooks for snapshotting.
+ */
+
+/*
+ * Construct an origin mapping: <dev_path>
+ * The context for an origin is merely a 'struct dm_dev *'
+ * pointing to the real device.
+ */
+static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	int r;
+	struct dm_dev *dev;
+
+	if (argc != 1) {
+		ti->error = "dm-origin: incorrect number of arguments";
+		return -EINVAL;
+	}
+
+	r = dm_get_device(ti, argv[0], 0, ti->len,
+			  dm_table_get_mode(ti->table), &dev);
+	if (r) {
+		ti->error = "Cannot get target device";
+		return r;
+	}
+
+	ti->private = dev;
+	return 0;
+}
+
+static void origin_dtr(struct dm_target *ti)
+{
+	struct dm_dev *dev = (struct dm_dev *) ti->private;
+	dm_put_device(ti, dev);
+}
+
+static int origin_map(struct dm_target *ti, struct bio *bio,
+		      union map_info *map_context)
+{
+	struct dm_dev *dev = (struct dm_dev *) ti->private;
+	bio->bi_bdev = dev->bdev;
+
+	/* Only tell snapshots if this is a write */
+	return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : 1;
+}
+
+static int origin_status(struct dm_target *ti, status_type_t type,
+			 char *result, unsigned int maxlen)
+{
+	struct dm_dev *dev = (struct dm_dev *) ti->private;
+	char buffer[32];
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		result[0] = '\0';
+		break;
+
+	case STATUSTYPE_TABLE:
+		format_dev_t(buffer, dev->bdev->bd_dev);
+		snprintf(result, maxlen, "%s", buffer);
+		break;
+	}
+
+	return 0;
+}
+
+static struct target_type origin_target = {
+	name:	"snapshot-origin",
+	module:	THIS_MODULE,
+	ctr:	origin_ctr,
+	dtr:	origin_dtr,
+	map:	origin_map,
+	status:	origin_status,
+};
+
+static struct target_type snapshot_target = {
+	name:	"snapshot",
+	module:	THIS_MODULE,
+	ctr:	snapshot_ctr,
+	dtr:	snapshot_dtr,
+	map:	snapshot_map,
+	resume: snapshot_resume,
+	status:	snapshot_status,
+};
+
+static int __init dm_snapshot_init(void)
+{
+	int r;
+
+	r = dm_register_target(&snapshot_target);
+	if (r) {
+		DMERR("snapshot target register failed %d", r);
+		return r;
+	}
+
+	r = dm_register_target(&origin_target);
+	if (r) {
+		DMERR("origin target register failed %d", r);
+		goto bad1;
+	}
+
+	r = init_origin_hash();
+	if (r) {
+		DMERR("init_origin_hash failed.");
+		goto bad2;
+	}
+
+	exception_cache = kmem_cache_create("dm-snapshot-ex",
+					    sizeof(struct exception),
+					    __alignof__(struct exception),
+					    0, NULL, NULL);
+	if (!exception_cache) {
+		DMERR("Couldn't create exception cache.");
+		r = -ENOMEM;
+		goto bad3;
+	}
+
+	pending_cache =
+	    kmem_cache_create("dm-snapshot-in",
+			      sizeof(struct pending_exception),
+			      __alignof__(struct pending_exception),
+			      0, NULL, NULL);
+	if (!pending_cache) {
+		DMERR("Couldn't create pending cache.");
+		r = -ENOMEM;
+		goto bad4;
+	}
+
+	pending_pool = mempool_create(128, mempool_alloc_slab,
+				      mempool_free_slab, pending_cache);
+	if (!pending_pool) {
+		DMERR("Couldn't create pending pool.");
+		r = -ENOMEM;
+		goto bad5;
+	}
+
+	return 0;
+
+      bad5:
+	kmem_cache_destroy(pending_cache);
+      bad4:
+	kmem_cache_destroy(exception_cache);
+      bad3:
+	exit_origin_hash();
+      bad2:
+	dm_unregister_target(&origin_target);
+      bad1:
+	dm_unregister_target(&snapshot_target);
+	return r;
+}
+
+static void __exit dm_snapshot_exit(void)
+{
+	int r;
+
+	r = dm_unregister_target(&snapshot_target);
+	if (r)
+		DMERR("snapshot unregister failed %d", r);
+
+	r = dm_unregister_target(&origin_target);
+	if (r)
+		DMERR("origin unregister failed %d", r);
+
+	exit_origin_hash();
+	mempool_destroy(pending_pool);
+	kmem_cache_destroy(pending_cache);
+	kmem_cache_destroy(exception_cache);
+}
+
+/* Module hooks */
+module_init(dm_snapshot_init);
+module_exit(dm_snapshot_exit);
+
+MODULE_DESCRIPTION(DM_NAME " snapshot target");
+MODULE_AUTHOR("Joe Thornber");
+MODULE_LICENSE("GPL");
--- diff/drivers/md/dm-snapshot.h	1970-01-01 01:00:00.000000000 +0100
+++ source/drivers/md/dm-snapshot.h	2003-11-26 10:18:32.000000000 +0000
@@ -0,0 +1,161 @@
+/*
+ * dm-snapshot.h
+ *
+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_SNAPSHOT_H
+#define DM_SNAPSHOT_H
+
+#include "dm.h"
+#include <linux/blkdev.h>
+
+struct exception_table {
+	uint32_t hash_mask;
+	struct list_head *table;
+};
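+/*
+ * (The table is an array of list_heads sized to a power of two;
+ * hash_mask is size - 1, so a chunk hashes to a bucket with a
+ * simple 'chunk & hash_mask'.)
+ */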
+
+/*
+ * The snapshot code deals with largish chunks of the disk at a
+ * time. Typically 64k - 256k.
+ */
+/* FIXME: can we get away with limiting these to a uint32_t ? */
+typedef sector_t chunk_t;
+
+/*
+ * An exception is used where an old chunk of data has been
+ * replaced by a new one.
+ */
+struct exception {
+	struct list_head hash_list;
+
+	chunk_t old_chunk;
+	chunk_t new_chunk;
+};
+
+/*
+ * Abstraction to handle the metadata layout of exception stores (the
+ * COW device).
+ */
+struct exception_store {
+
+	/*
+	 * Destroys this object when you've finished with it.
+	 */
+	void (*destroy) (struct exception_store *store);
+
+	/*
+	 * The target shouldn't read the COW device until this is
+	 * called.
+	 */
+	int (*read_metadata) (struct exception_store *store);
+
+	/*
+	 * Find somewhere to store the next exception.
+	 */
+	int (*prepare_exception) (struct exception_store *store,
+				  struct exception *e);
+
+	/*
+	 * Update the metadata with this exception.
+	 */
+	void (*commit_exception) (struct exception_store *store,
+				  struct exception *e,
+				  void (*callback) (void *, int success),
+				  void *callback_context);
+
+	/*
+	 * The snapshot is invalid, note this in the metadata.
+	 */
+	void (*drop_snapshot) (struct exception_store *store);
+
+	/*
+	 * Return how full the snapshot is.
+	 */
+	void (*fraction_full) (struct exception_store *store,
+			       sector_t *numerator,
+			       sector_t *denominator);
+
+	struct dm_snapshot *snap;
+	void *context;
+};
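+/*
+ * Rough lifecycle of a store, for orientation: read_metadata() runs
+ * once before the snapshot is used; for each write to an unremapped
+ * chunk, prepare_exception() reserves the next free chunk on the COW
+ * device and, once the data has been copied across, commit_exception()
+ * makes the remapping persistent.  drop_snapshot() or destroy() end
+ * the store's life.
+ */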
+
+struct dm_snapshot {
+	struct rw_semaphore lock;
+	struct dm_table *table;
+
+	struct dm_dev *origin;
+	struct dm_dev *cow;
+
+	/* List of snapshots per Origin */
+	struct list_head list;
+
+	/* Size of data blocks saved - must be a power of 2 */
+	chunk_t chunk_size;
+	chunk_t chunk_mask;
+	chunk_t chunk_shift;
+
+	/* You can't use a snapshot if this is 0 (e.g. if full) */
+	int valid;
+	int have_metadata;
+
+	/* Used for display of table */
+	char type;
+
+	/* The last percentage we notified */
+	int last_percent;
+
+	struct exception_table pending;
+	struct exception_table complete;
+
+	/* The on disk metadata handler */
+	struct exception_store store;
+
+	struct kcopyd_client *kcopyd_client;
+};
+
+/*
+ * Used by the exception stores to load exceptions when
+ * initialising.
+ */
+int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new);
+
+/*
+ * Constructor and destructor for the default persistent
+ * store.
+ */
+int dm_create_persistent(struct exception_store *store, uint32_t chunk_size);
+
+int dm_create_transient(struct exception_store *store,
+			struct dm_snapshot *s, int blocksize);
+
+/*
+ * Return the number of sectors in the device.
+ */
+static inline sector_t get_dev_size(struct block_device *bdev)
+{
+	return bdev->bd_inode->i_size >> SECTOR_SHIFT;
+}
+
+static inline chunk_t sector_to_chunk(struct dm_snapshot *s, sector_t sector)
+{
+	return (sector & ~s->chunk_mask) >> s->chunk_shift;
+}
+
+static inline sector_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk)
+{
+	return chunk << s->chunk_shift;
+}
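+/*
+ * Worked example: with a 64k chunk size there are 128 sectors per
+ * chunk, so chunk_shift = 7 and chunk_mask = 127.  Then
+ * sector_to_chunk(s, 300) = (300 & ~127) >> 7 = 256 >> 7 = 2, and
+ * chunk_to_sector(s, 2) = 2 << 7 = 256, the first sector of that
+ * chunk.
+ */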
+
+static inline int bdev_equal(struct block_device *lhs, struct block_device *rhs)
+{
+	/*
+	 * There is only ever one instance of a particular block
+	 * device so we can compare pointers safely.
+	 */
+	return lhs == rhs;
+}
+
+#endif
--- diff/drivers/md/kcopyd.c	1970-01-01 01:00:00.000000000 +0100
+++ source/drivers/md/kcopyd.c	2003-11-26 10:18:32.000000000 +0000
@@ -0,0 +1,652 @@
+/*
+ * Copyright (C) 2002 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <asm/atomic.h>
+
+#include <linux/blkdev.h>
+#include <linux/config.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "kcopyd.h"
+#include "dm-daemon.h"
+
+/* FIXME: this is only needed for the DMERR macros */
+#include "dm.h"
+
+static struct dm_daemon _kcopyd;
+
+/*-----------------------------------------------------------------
+ * Each kcopyd client has its own little pool of preallocated
+ * pages for kcopyd io.
+ *---------------------------------------------------------------*/
+struct kcopyd_client {
+	struct list_head list;
+
+	spinlock_t lock;
+	struct list_head pages;
+	unsigned int nr_pages;
+	unsigned int nr_free_pages;
+};
+
+static inline void __push_page(struct kcopyd_client *kc, struct page *p)
+{
+	list_add(&p->list, &kc->pages);
+	kc->nr_free_pages++;
+}
+
+static inline struct page *__pop_page(struct kcopyd_client *kc)
+{
+	struct page *p;
+
+	p = list_entry(kc->pages.next, struct page, list);
+	list_del(&p->list);
+	kc->nr_free_pages--;
+
+	return p;
+}
+
+static int kcopyd_get_pages(struct kcopyd_client *kc,
+			    unsigned int nr, struct list_head *pages)
+{
+	struct page *p;
+	INIT_LIST_HEAD(pages);
+
+	spin_lock(&kc->lock);
+	if (kc->nr_free_pages < nr) {
+		spin_unlock(&kc->lock);
+		return -ENOMEM;
+	}
+
+	while (nr--) {
+		p = __pop_page(kc);
+		list_add(&p->list, pages);
+	}
+	spin_unlock(&kc->lock);
+
+	return 0;
+}
+
+static void kcopyd_put_pages(struct kcopyd_client *kc, struct list_head *pages)
+{
+	struct list_head *tmp, *tmp2;
+
+	spin_lock(&kc->lock);
+	list_for_each_safe (tmp, tmp2, pages)
+		__push_page(kc, list_entry(tmp, struct page, list));
+	spin_unlock(&kc->lock);
+}
+
+/*
+ * These three functions resize the page pool.
+ */
+static void drop_pages(struct list_head *pages)
+{
+	struct page *p;
+	struct list_head *tmp, *tmp2;
+
+	list_for_each_safe (tmp, tmp2, pages) {
+		p = list_entry(tmp, struct page, list);
+		ClearPageLocked(p);
+		__free_page(p);
+	}
+}
+
+static int client_alloc_pages(struct kcopyd_client *kc, unsigned int nr)
+{
+	unsigned int i;
+	struct page *p;
+	LIST_HEAD(new);
+
+	for (i = 0; i < nr; i++) {
+		p = alloc_page(GFP_KERNEL);
+		if (!p) {
+			drop_pages(&new);
+			return -ENOMEM;
+		}
+
+		SetPageLocked(p);
+		list_add(&p->list, &new);
+	}
+
+	kcopyd_put_pages(kc, &new);
+	kc->nr_pages += nr;
+	return 0;
+}
+
+static void client_free_pages(struct kcopyd_client *kc)
+{
+	BUG_ON(kc->nr_free_pages != kc->nr_pages);
+	drop_pages(&kc->pages);
+	kc->nr_free_pages = kc->nr_pages = 0;
+}
+
+/*-----------------------------------------------------------------
+ * kcopyd_jobs need to be allocated by the *clients* of kcopyd,
+ * for this reason we use a mempool to prevent the client from
+ * ever having to do io (which could cause a deadlock).
+ *---------------------------------------------------------------*/
+struct kcopyd_job {
+	struct kcopyd_client *kc;
+	struct list_head list;
+	unsigned long flags;
+
+	/*
+	 * Error state of the job.
+	 */
+	int read_err;
+	unsigned int write_err;
+
+	/*
+	 * Either READ or WRITE
+	 */
+	int rw;
+	struct io_region source;
+
+	/*
+	 * The destinations for the transfer.
+	 */
+	unsigned int num_dests;
+	struct io_region dests[KCOPYD_MAX_REGIONS];
+
+	sector_t offset;
+	unsigned int nr_pages;
+	struct list_head pages;
+
+	/*
+	 * Set this to ensure you are notified when the job has
+	 * completed.  'context' is for callback to use.
+	 */
+	kcopyd_notify_fn fn;
+	void *context;
+
+	/*
+	 * These fields are only used if the job has been split
+	 * into more manageable parts.
+	 */
+	struct semaphore lock;
+	atomic_t sub_jobs;
+	sector_t progress;
+};
+
+/* FIXME: this should scale with the number of pages */
+#define MIN_JOBS 512
+
+static kmem_cache_t *_job_cache;
+static mempool_t *_job_pool;
+
+/*
+ * We maintain three lists of jobs:
+ *
+ * i)   jobs waiting for pages
+ * ii)  jobs that have pages, and are waiting for the io to be issued.
+ * iii) jobs that have completed.
+ *
+ * All three of these are protected by job_lock.
+ */
+static spinlock_t _job_lock = SPIN_LOCK_UNLOCKED;
+
+static LIST_HEAD(_complete_jobs);
+static LIST_HEAD(_io_jobs);
+static LIST_HEAD(_pages_jobs);
+
+static int jobs_init(void)
+{
+	INIT_LIST_HEAD(&_complete_jobs);
+	INIT_LIST_HEAD(&_io_jobs);
+	INIT_LIST_HEAD(&_pages_jobs);
+
+	_job_cache = kmem_cache_create("kcopyd-jobs",
+				       sizeof(struct kcopyd_job),
+				       __alignof__(struct kcopyd_job),
+				       0, NULL, NULL);
+	if (!_job_cache)
+		return -ENOMEM;
+
+	_job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
+				   mempool_free_slab, _job_cache);
+	if (!_job_pool) {
+		kmem_cache_destroy(_job_cache);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void jobs_exit(void)
+{
+	BUG_ON(!list_empty(&_complete_jobs));
+	BUG_ON(!list_empty(&_io_jobs));
+	BUG_ON(!list_empty(&_pages_jobs));
+
+	mempool_destroy(_job_pool);
+	kmem_cache_destroy(_job_cache);
+}
+
+/*
+ * Functions to push and pop a job onto the head of a given job
+ * list.
+ */
+static inline struct kcopyd_job *pop(struct list_head *jobs)
+{
+	struct kcopyd_job *job = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&_job_lock, flags);
+
+	if (!list_empty(jobs)) {
+		job = list_entry(jobs->next, struct kcopyd_job, list);
+		list_del(&job->list);
+	}
+	spin_unlock_irqrestore(&_job_lock, flags);
+
+	return job;
+}
+
+static inline void push(struct list_head *jobs, struct kcopyd_job *job)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&_job_lock, flags);
+	list_add_tail(&job->list, jobs);
+	spin_unlock_irqrestore(&_job_lock, flags);
+}
+
+/*
+ * These three functions process 1 item from the corresponding
+ * job list.
+ *
+ * They return:
+ * < 0: error
+ *   0: success
+ * > 0: can't process yet.
+ */
+static int run_complete_job(struct kcopyd_job *job)
+{
+	void *context = job->context;
+	int read_err = job->read_err;
+	unsigned int write_err = job->write_err;
+	kcopyd_notify_fn fn = job->fn;
+
+	kcopyd_put_pages(job->kc, &job->pages);
+	mempool_free(job, _job_pool);
+	fn(read_err, write_err, context);
+	return 0;
+}
+
+/* Debug count of in-flight io jobs; updated without locking. */
+static unsigned _pending;
+static void complete_io(unsigned long error, void *context)
+{
+	struct kcopyd_job *job = (struct kcopyd_job *) context;
+
+	_pending--;
+	if (error) {
+		if (job->rw == WRITE)
+			job->write_err |= error;
+		else
+			job->read_err = 1;
+
+		if (!test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) {
+			push(&_complete_jobs, job);
+			dm_daemon_wake(&_kcopyd);
+			return;
+		}
+	}
+
+	if (job->rw == WRITE)
+		push(&_complete_jobs, job);
+
+	else {
+		job->rw = WRITE;
+		push(&_io_jobs, job);
+	}
+
+	dm_daemon_wake(&_kcopyd);
+}
+
+/*
+ * Request io on as many buffer heads as we can currently get for
+ * a particular job.
+ */
+static int run_io_job(struct kcopyd_job *job)
+{
+	int r;
+
+	if (job->rw == READ)
+		r = dm_io_async(1, &job->source, job->rw,
+				list_entry(job->pages.next, struct page, list),
+				job->offset, complete_io, job);
+
+	else
+		r = dm_io_async(job->num_dests, job->dests, job->rw,
+				list_entry(job->pages.next, struct page, list),
+				job->offset, complete_io, job);
+
+	if (!r)
+		_pending++;
+
+	return r;
+}
+
+static int run_pages_job(struct kcopyd_job *job)
+{
+	int r;
+
+	job->nr_pages = dm_div_up(job->dests[0].count + job->offset,
+				  PAGE_SIZE >> 9);
+	r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages);
+	if (!r) {
+		/* this job is ready for io */
+		push(&_io_jobs, job);
+		return 0;
+	}
+
+	if (r == -ENOMEM)
+		/* can't complete now */
+		return 1;
+
+	return r;
+}
+
+/*
+ * Run through a list for as long as possible.  Returns the count
+ * of successful jobs.
+ */
+static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *))
+{
+	struct kcopyd_job *job;
+	int r, count = 0;
+
+	while ((job = pop(jobs))) {
+
+		r = fn(job);
+
+		if (r < 0) {
+			/* error this rogue job */
+			if (job->rw == WRITE)
+				job->write_err = (unsigned int) -1;
+			else
+				job->read_err = 1;
+			push(&_complete_jobs, job);
+			break;
+		}
+
+		if (r > 0) {
+			/*
+			 * We couldn't service this job ATM, so
+			 * push this job back onto the list.
+			 */
+			push(jobs, job);
+			break;
+		}
+
+		count++;
+	}
+
+	return count;
+}
+
+/*
+ * kcopyd does this every time it's woken up.
+ */
+static void do_work(void)
+{
+	/*
+	 * The order that these are called is *very* important.
+	 * complete jobs can free some pages for pages jobs.
+	 * Pages jobs when successful will jump onto the io jobs
+	 * list.  io jobs call wake when they complete and it all
+	 * starts again.
+	 */
+	process_jobs(&_complete_jobs, run_complete_job);
+	process_jobs(&_pages_jobs, run_pages_job);
+	process_jobs(&_io_jobs, run_io_job);
+
+	blk_run_queues();
+}
+
+/*
+ * If we are copying a small region we just dispatch a single job
+ * to do the copy, otherwise the io has to be split up into many
+ * jobs.
+ */
+static void dispatch_job(struct kcopyd_job *job)
+{
+	push(&_pages_jobs, job);
+	dm_daemon_wake(&_kcopyd);
+}
+
+#define SUB_JOB_SIZE 128
+static void segment_complete(int read_err,
+			     unsigned int write_err, void *context)
+{
+	/* FIXME: tidy this function */
+	sector_t progress = 0;
+	sector_t count = 0;
+	struct kcopyd_job *job = (struct kcopyd_job *) context;
+
+	down(&job->lock);
+
+	/* update the error */
+	if (read_err)
+		job->read_err = 1;
+
+	if (write_err)
+		job->write_err |= write_err;
+
+	/*
+	 * Only dispatch more work if there hasn't been an error.
+	 */
+	if ((!job->read_err && !job->write_err) ||
+	    test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) {
+		/* get the next chunk of work */
+		progress = job->progress;
+		count = job->source.count - progress;
+		if (count) {
+			if (count > SUB_JOB_SIZE)
+				count = SUB_JOB_SIZE;
+
+			job->progress += count;
+		}
+	}
+	up(&job->lock);
+
+	if (count) {
+		int i;
+		struct kcopyd_job *sub_job = mempool_alloc(_job_pool, GFP_NOIO);
+
+		memcpy(sub_job, job, sizeof(*job));
+		sub_job->source.sector += progress;
+		sub_job->source.count = count;
+
+		for (i = 0; i < job->num_dests; i++) {
+			sub_job->dests[i].sector += progress;
+			sub_job->dests[i].count = count;
+		}
+
+		sub_job->fn = segment_complete;
+		sub_job->context = job;
+		dispatch_job(sub_job);
+
+	} else if (atomic_dec_and_test(&job->sub_jobs)) {
+
+		/*
+		 * To avoid a race we must keep the job around
+		 * until after the notify function has completed.
+		 * Otherwise the client may try and stop the job
+		 * after we've completed.
+		 */
+		job->fn(read_err, write_err, job->context);
+		mempool_free(job, _job_pool);
+	}
+}
+
+/*
+ * Create some little jobs that will do the move between
+ * them.
+ */
+#define SPLIT_COUNT 8
+static void split_job(struct kcopyd_job *job)
+{
+	int i;
+
+	atomic_set(&job->sub_jobs, SPLIT_COUNT);
+	for (i = 0; i < SPLIT_COUNT; i++)
+		segment_complete(0, 0u, job);
+}
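+/*
+ * By way of illustration: SUB_JOB_SIZE is 128 sectors (64k), so a
+ * 2048 sector (1MB) copy becomes 16 sub jobs, with SPLIT_COUNT (8)
+ * of them in flight at once; each sub job that completes claims the
+ * next 128 sector chunk via segment_complete() until 'progress'
+ * reaches the end of the source.
+ */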
+
+int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
+		unsigned int num_dests, struct io_region *dests,
+		unsigned int flags, kcopyd_notify_fn fn, void *context)
+{
+	struct kcopyd_job *job;
+
+	/*
+	 * Allocate a new job.
+	 */
+	job = mempool_alloc(_job_pool, GFP_NOIO);
+
+	/*
+	 * set up for the read.
+	 */
+	job->kc = kc;
+	job->flags = flags;
+	job->read_err = 0;
+	job->write_err = 0;
+	job->rw = READ;
+
+	memcpy(&job->source, from, sizeof(*from));
+
+	job->num_dests = num_dests;
+	memcpy(&job->dests, dests, sizeof(*dests) * num_dests);
+
+	job->offset = 0;
+	job->nr_pages = 0;
+	INIT_LIST_HEAD(&job->pages);
+
+	job->fn = fn;
+	job->context = context;
+
+	if (job->source.count < SUB_JOB_SIZE)
+		dispatch_job(job);
+
+	else {
+		init_MUTEX(&job->lock);
+		job->progress = 0;
+		split_job(job);
+	}
+
+	return 0;
+}
+
+/*
+ * Cancels a kcopyd job, eg. someone might be deactivating a
+ * mirror.
+ */
+int kcopyd_cancel(struct kcopyd_job *job, int block)
+{
+	/* FIXME: not yet implemented */
+	return -ENOSYS;
+}
+
+/*-----------------------------------------------------------------
+ * Unit setup
+ *---------------------------------------------------------------*/
+static DECLARE_MUTEX(_client_lock);
+static LIST_HEAD(_clients);
+
+static int client_add(struct kcopyd_client *kc)
+{
+	down(&_client_lock);
+	list_add(&kc->list, &_clients);
+	up(&_client_lock);
+	return 0;
+}
+
+static void client_del(struct kcopyd_client *kc)
+{
+	down(&_client_lock);
+	list_del(&kc->list);
+	up(&_client_lock);
+}
+
+int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result)
+{
+	int r = 0;
+	struct kcopyd_client *kc;
+
+	kc = kmalloc(sizeof(*kc), GFP_KERNEL);
+	if (!kc)
+		return -ENOMEM;
+
+	spin_lock_init(&kc->lock);
+	INIT_LIST_HEAD(&kc->pages);
+	kc->nr_pages = kc->nr_free_pages = 0;
+	r = client_alloc_pages(kc, nr_pages);
+	if (r) {
+		kfree(kc);
+		return r;
+	}
+
+	r = dm_io_get(nr_pages);
+	if (r) {
+		client_free_pages(kc);
+		kfree(kc);
+		return r;
+	}
+
+	r = client_add(kc);
+	if (r) {
+		dm_io_put(nr_pages);
+		client_free_pages(kc);
+		kfree(kc);
+		return r;
+	}
+
+	*result = kc;
+	return 0;
+}
+
+void kcopyd_client_destroy(struct kcopyd_client *kc)
+{
+	dm_io_put(kc->nr_pages);
+	client_free_pages(kc);
+	client_del(kc);
+	kfree(kc);
+}
+
+int __init kcopyd_init(void)
+{
+	int r;
+
+	r = jobs_init();
+	if (r)
+		return r;
+
+	r = dm_daemon_start(&_kcopyd, "kcopyd", do_work);
+	if (r)
+		jobs_exit();
+
+	return r;
+}
+
+void kcopyd_exit(void)
+{
+	/* Stop the daemon before tearing down the job pools. */
+	dm_daemon_stop(&_kcopyd);
+	jobs_exit();
+}
+
+EXPORT_SYMBOL(kcopyd_client_create);
+EXPORT_SYMBOL(kcopyd_client_destroy);
+EXPORT_SYMBOL(kcopyd_copy);
+EXPORT_SYMBOL(kcopyd_cancel);
--- diff/drivers/md/kcopyd.h	1970-01-01 01:00:00.000000000 +0100
+++ source/drivers/md/kcopyd.h	2003-11-26 10:18:32.000000000 +0000
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2001 Sistina Software
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_KCOPYD_H
+#define DM_KCOPYD_H
+
+#include "dm-io.h"
+
+int kcopyd_init(void);
+void kcopyd_exit(void);
+
+/* FIXME: make this configurable */
+#define KCOPYD_MAX_REGIONS 8
+
+#define KCOPYD_IGNORE_ERROR 1
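+/*
+ * NB. KCOPYD_IGNORE_ERROR is a bit *number* (kcopyd.c tests it with
+ * test_bit()), so set it in 'flags' with set_bit() or by passing
+ * 1 << KCOPYD_IGNORE_ERROR.
+ */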
+
+/*
+ * To use kcopyd you must first create a kcopyd client object.
+ */
+struct kcopyd_client;
+int kcopyd_client_create(unsigned int num_pages, struct kcopyd_client **result);
+void kcopyd_client_destroy(struct kcopyd_client *kc);
+
+/*
+ * Submit a copy job to kcopyd.  This is built on top of the
+ * client interface above.
+ *
+ * read_err is a boolean,
+ * write_err is a bitset, with 1 bit for each destination region.
+ */
+typedef void (*kcopyd_notify_fn)(int read_err,
+				 unsigned int write_err, void *context);
+
+int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
+		unsigned int num_dests, struct io_region *dests,
+		unsigned int flags, kcopyd_notify_fn fn, void *context);
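+
+/*
+ * A minimal usage sketch, purely illustrative ('src' and 'dst' are
+ * assumed to be valid struct block_device pointers, and io_region
+ * to be { bdev, sector, count } as in dm-io.h):
+ *
+ *	static void copy_done(int read_err, unsigned int write_err,
+ *			      void *context)
+ *	{
+ *		if (read_err || write_err)
+ *			DMERR("copy failed");
+ *	}
+ *
+ *	struct kcopyd_client *kc;
+ *	struct io_region from = { src, 0, 1024 };
+ *	struct io_region to   = { dst, 0, 1024 };
+ *
+ *	if (!kcopyd_client_create(256, &kc)) {
+ *		kcopyd_copy(kc, &from, 1, &to, 0, copy_done, NULL);
+ *		... wait for copy_done ...
+ *		kcopyd_client_destroy(kc);
+ *	}
+ */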
+
+#endif