From: Mike Snitzer <snitzer@redhat.com>

[AGK - untested alternative version avoiding storing handover flag]

Permit in-use snapshot exception data to be 'handed over' from one
snapshot instance to another.  This is a pre-requisite for patches
that allow the changes made in a snapshot device to be merged back into
its origin device and also allows device resizing.

The basic call sequence is:

  dmsetup load new_snapshot (referencing the existing in-use cow device)
     - the ctr code detects that the cow is already in use and links the
       two snapshot target instances together
  dmsetup suspend original_snapshot
  dmsetup resume new_snapshot
     - the new_snapshot becomes live, and if anything now tries to access
       the original one it will receive EIO
  dmsetup remove original_snapshot

(At most two snapshot targets may reference the same cow device
simultaneously.)

Snapshot locking is such that:
0) snapshot that is passed to find_snapshots_sharing_cow() is not locked
1) only need handover-source lock to determine if handover is needed
   - handover-source lock is primary lock used in handover code paths
   - only need handover-destination lock before handover_exceptions()
2) handover-source lock is taken before handover-destination lock
   - but this is only ever needed before calling handover_exceptions()

Signed-off-by: Mike Snitzer <snitzer@redhat.com>

---
 drivers/md/dm-snap.c |  242 ++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 204 insertions(+), 38 deletions(-)

Index: linux-2.6.32-rc6/drivers/md/dm-snap.c
===================================================================
--- linux-2.6.32-rc6.orig/drivers/md/dm-snap.c
+++ linux-2.6.32-rc6/drivers/md/dm-snap.c
@@ -303,35 +303,92 @@ static void __insert_origin(struct origi
 }
 
 /*
+ * Returns number of registered snapshots with same cow device.
+ * Returns 1: snap_src, NULL - normal snapshot
+ * Returns 2: snap_src, NULL - handed over, waiting for old to be deleted
+ * Returns 2: snap_src, snap_dest  - waiting for handover
+ * Returns 1: NULL, snap_dest - source got destroyed before handover
+ * Returns 0: NULL, NULL  - first new snapshot
+ */
+static int __find_snapshots_sharing_cow(struct dm_snapshot *snap,
+					struct dm_snapshot **snap_src,
+					struct dm_snapshot **snap_dest)
+{
+	struct origin *org = __lookup_origin(snap->origin->bdev);
+	struct dm_snapshot *cur;
+	int nr_sharing = 0;
+	int is_active;
+
+	/* No origin registered for this device: nothing can share the cow. */
+	if (!org)
+		return 0;
+
+	list_for_each_entry(cur, &org->snapshots, list) {
+		if (!bdev_equal(cur->cow->bdev, snap->cow->bdev))
+			continue;
+
+		/* Sample ->active under the snapshot's own lock. */
+		down_read(&cur->lock);
+		is_active = cur->active;
+		up_read(&cur->lock);
+
+		/* Active match: handover source; inactive: destination. */
+		if (is_active && snap_src)
+			*snap_src = cur;
+		else if (!is_active && snap_dest)
+			*snap_dest = cur;
+
+		nr_sharing++;
+	}
+
+	return nr_sharing;
+}
+
+static int find_snapshots_sharing_cow(struct dm_snapshot *snap,
+				      struct dm_snapshot **snap_src,
+				      struct dm_snapshot **snap_dest)
+{
+	int nr_sharing;
+
+	/* Locked wrapper: hold _origins_lock for reading across the scan. */
+	down_read(&_origins_lock);
+	nr_sharing = __find_snapshots_sharing_cow(snap, snap_src, snap_dest);
+	up_read(&_origins_lock);
+	return nr_sharing;
+}
+
+/*
  * Make a note of the snapshot and its origin so we can look it
  * up when the origin has a write on it.
+ *
+ * Also validate snapshot exception store handovers.
+ * On success, returns 1 if this registration is a handover destination,
+ * otherwise returns 0.
  */
-static int register_snapshot(struct dm_snapshot *snap,
-			     int origin_exists)
+static int register_snapshot(struct dm_snapshot *snap)
 {
-	struct dm_snapshot *l;
+	struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
 	struct origin *o, *new_o = NULL;
 	struct block_device *bdev = snap->origin->bdev;
 	int r = 0;
 
-	if (!origin_exists) {
-		new_o = kmalloc(sizeof(*new_o), GFP_KERNEL);
-		if (!new_o)
-			return -ENOMEM;
-	}
+	new_o = kmalloc(sizeof(*new_o), GFP_KERNEL);
+	if (!new_o)
+		return -ENOMEM;
 
 	down_write(&_origins_lock);
-	o = __lookup_origin(bdev);
 
+	/* Reject if cow already has two users or an inactive (dest) user */
+	if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest) == 2) ||
+	    snap_dest) {
+		kfree(new_o);
+		r = -EINVAL;
+		goto out;
+	}
+	o = __lookup_origin(bdev);
 	if (o)
 		kfree(new_o);
 	else {
-		if (origin_exists) {
-			DMERR("register_snapshot failed to find origin.");
-			r = -EINVAL;
-			goto out;
-		}
-
 		/* New origin */
 		o = new_o;
 
@@ -342,18 +399,40 @@ static int register_snapshot(struct dm_s
 		__insert_origin(o);
 	}
 
-	/* Sort the list according to chunk size, largest-first smallest-last */
-	list_for_each_entry(l, &o->snapshots, list)
-		if (l->store->chunk_size < snap->store->chunk_size)
-			break;
-	list_add_tail(&snap->list, &l->list);
+	/* Position in list is irrelevant as there's no I/O yet. */
+	list_add_tail(&snap->list, &o->snapshots);
 
+	if (snap_src)
+		r = 1;
 out:
 	up_write(&_origins_lock);
 
 	return r;
 }
 
+/*
+ * Move snapshot to correct place in list according to chunk size.
+ */
+static void reregister_snapshot(struct dm_snapshot *s)
+{
+	struct dm_snapshot *pos;
+	struct origin *org;
+
+	down_write(&_origins_lock);
+	org = __lookup_origin(s->origin->bdev);
+
+	/* Unlink, then re-insert so the list stays largest-chunk-first. */
+	list_del(&s->list);
+	list_for_each_entry(pos, &org->snapshots, list) {
+		if (pos->store->chunk_size < s->store->chunk_size)
+			break;
+	}
+	/* With no smaller entry, pos ends at the list head: s goes last. */
+	list_add_tail(&s->list, &pos->list);
+
+	up_write(&_origins_lock);
+}
+
 static void unregister_snapshot(struct dm_snapshot *s)
 {
 	struct origin *o;
@@ -362,7 +441,7 @@ static void unregister_snapshot(struct d
 	o = __lookup_origin(s->origin->bdev);
 
 	list_del(&s->list);
-	if (list_empty(&o->snapshots)) {
+	if (o && list_empty(&o->snapshots)) {
 		list_del(&o->hash_list);
 		kfree(o);
 	}
@@ -672,6 +751,7 @@ static int snapshot_ctr(struct dm_target
 	s->suspended = 0;
 	atomic_set(&s->pending_exceptions_count, 0);
 	init_rwsem(&s->lock);
+	INIT_LIST_HEAD(&s->list);
 	spin_lock_init(&s->pe_lock);
 
 	/* Allocate hash table for COW data */
@@ -706,7 +786,31 @@ static int snapshot_ctr(struct dm_target
 
 	spin_lock_init(&s->tracked_chunk_lock);
 
-	/* Metadata must only be loaded into one table at once */
+	bio_list_init(&s->queued_bios);
+	INIT_WORK(&s->queued_bios_work, flush_queued_bios);
+
+	ti->private = s;
+	ti->num_flush_requests = 1;
+
+	/* Add snapshot to the list of snapshots for this origin */
+	/* Exceptions aren't triggered till snapshot_resume() is called */
+	r = register_snapshot(s);
+	if (r == -ENOMEM) {
+		ti->error = "Snapshot origin struct allocation failed";
+		goto bad_load_and_register;
+	} else if (r < 0) {
+		ti->error = "Snapshot cow pairing for exception table handover "
+			    "failed";
+		goto bad_load_and_register;
+	}
+
+	/*
+	 * Metadata must only be loaded into one table at once, so skip this
+	 * if metadata will be handed over during resume.
+	 */
+	if (r > 0)
+		return 0;
+
 	r = s->store->type->read_metadata(s->store, dm_add_exception,
 					  (void *)s);
 	if (r < 0) {
@@ -717,25 +821,11 @@ static int snapshot_ctr(struct dm_target
 		DMWARN("Snapshot is marked invalid.");
 	}
 
-	bio_list_init(&s->queued_bios);
-	INIT_WORK(&s->queued_bios_work, flush_queued_bios);
-
 	if (!s->store->chunk_size) {
 		ti->error = "Chunk size not set";
 		goto bad_load_and_register;
 	}
-
-	/* Add snapshot to the list of snapshots for this origin */
-	/* Exceptions aren't triggered till snapshot_resume() is called */
-	if (register_snapshot(s, 0)) {
-		r = -EINVAL;
-		ti->error = "Cannot register snapshot origin";
-		goto bad_load_and_register;
-	}
-
-	ti->private = s;
 	ti->split_io = s->store->chunk_size;
-	ti->num_flush_requests = 1;
 
 	return 0;
 
@@ -777,15 +867,53 @@ static void __free_exceptions(struct dm_
 	dm_exception_table_exit(&s->complete, exception_cache);
 }
 
+/*
+ * Hand snap_src's exceptions to snap_dest by swapping tables and stores.
+ * snapshot_resume() calls this with both snapshots' locks held for writing.
+ */
+static void handover_exceptions(struct dm_snapshot *snap_src,
+				struct dm_snapshot *snap_dest)
+{
+	struct dm_exception_table tmp_table = snap_dest->complete;
+	struct dm_exception_store *tmp_store = snap_dest->store;
+
+	/* Exchange the completed-exception tables... */
+	snap_dest->complete = snap_src->complete;
+	snap_src->complete = tmp_table;
+
+	/* ...and the exception stores, then repoint each store's ->snap. */
+	snap_dest->store = snap_src->store;
+	snap_src->store = tmp_store;
+	snap_dest->store->snap = snap_dest;
+	snap_src->store->snap = snap_src;
+
+	/* split_io must follow the chunk size of the inherited store. */
+	snap_dest->ti->split_io = snap_dest->store->chunk_size;
+
+	/* transfer 'valid' state, mark snap_src snapshot invalid */
+	snap_dest->valid = snap_src->valid;
+	snap_src->valid = 0;
+}
+
 static void snapshot_dtr(struct dm_target *ti)
 {
 #ifdef CONFIG_DM_DEBUG
 	int i;
 #endif
 	struct dm_snapshot *s = ti->private;
+	struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
 
 	flush_workqueue(ksnapd);
 
+	/* Check whether exception handover must be cancelled */
+	find_snapshots_sharing_cow(s, &snap_src, &snap_dest);
+	if (snap_src && snap_dest && (s == snap_src)) {
+		down_write(&snap_dest->lock);
+		snap_dest->valid = 0;
+		up_write(&snap_dest->lock);
+		DMERR("Cancelling snapshot handover.");
+	}
+
 	/* Prevent further origin writes from using this snapshot. */
 	/* After this returns there can be no new kcopyd jobs. */
 	unregister_snapshot(s);
@@ -1198,14 +1326,51 @@ static void snapshot_postsuspend(struct 
 	up_write(&s->lock);
 }
 
+static int snapshot_preresume(struct dm_target *ti)
+{
+	struct dm_snapshot *s = ti->private;
+	struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
+	int r = 0;
+
+	(void) find_snapshots_sharing_cow(s, &snap_src, &snap_dest);
+	if (!snap_src || !snap_dest)
+		return 0;
+	/* Handover pending: s must be the dest and the source suspended */
+	down_write(&snap_src->lock);
+	if (s == snap_src) {
+		DMERR("Unable to resume snapshot source until "
+		      "handover complete.");
+		r = -EINVAL;
+	} else if (!snap_src->suspended) {
+		DMERR("Unable to perform snapshot handover until "
+		      "source is suspended.");
+		r = -EINVAL;
+	}
+	up_write(&snap_src->lock);
+	return r;
+}
+
 static void snapshot_resume(struct dm_target *ti)
 {
 	struct dm_snapshot *s = ti->private;
+	struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
 
+	(void) find_snapshots_sharing_cow(s, &snap_src, &snap_dest);
+	if (snap_src && snap_dest) {
+		down_write_nested(&snap_src->lock, SINGLE_DEPTH_NESTING);
+		down_write(&snap_dest->lock);
+		handover_exceptions(snap_src, snap_dest);
+		up_write(&snap_dest->lock);
+		up_write(&snap_src->lock);
+	}
+
+	/* Chunk size now final; snap_dest may be NULL here, so always use s */
+	reregister_snapshot(s);
+
 	down_write(&s->lock);
 	s->active = 1;
 	s->suspended = 0;
 	up_write(&s->lock);
 }
 
 static int snapshot_status(struct dm_target *ti, status_type_t type,
@@ -1518,6 +1683,7 @@ static struct target_type snapshot_targe
 	.map     = snapshot_map,
 	.end_io  = snapshot_end_io,
 	.postsuspend = snapshot_postsuspend,
+	.preresume  = snapshot_preresume,
 	.resume  = snapshot_resume,
 	.status  = snapshot_status,
 	.iterate_devices = snapshot_iterate_devices,