From: Benjamin Marzinski <bmarzins@redhat.com>

When the last path in a priority group fails, instead of switching
to the next PG immediately, wait for a configurable amount of time
in case any paths recover.

In some configurations switching PG is an expensive operation that
is better avoided when a failure is only transient.

The timeout is given in seconds via the new "set_pg_timeout"
multipath message; a value of 0 disables the delay.
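
For example, the following would make the device wait 60 seconds for
a path in the current PG to recover before switching, and then turn
the behaviour off again (the map name "mpath0" is illustrative):

  dmsetup message mpath0 0 set_pg_timeout 60
  dmsetup message mpath0 0 set_pg_timeout 0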

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
---

 drivers/md/dm-mpath.c |  114 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 107 insertions(+), 7 deletions(-)

Index: linux-2.6.19/drivers/md/dm-mpath.c
===================================================================
--- linux-2.6.19.orig/drivers/md/dm-mpath.c	2006-12-06 20:49:32.000000000 +0000
+++ linux-2.6.19/drivers/md/dm-mpath.c	2006-12-06 20:49:42.000000000 +0000
@@ -53,6 +53,12 @@ struct priority_group {
 	struct list_head pgpaths;
 };
 
+enum pg_timeout_state {
+	PG_TIMEOUT_NONE = 0,
+	PG_TIMEOUT_IN_PROGRESS,
+	PG_TIMEOUT_FAILED
+};
+
 /* Multipath context */
 struct multipath {
 	struct list_head list;
@@ -76,6 +82,10 @@ struct multipath {
 	unsigned queue_if_no_path;	/* Queue I/O if last path fails? */
 	unsigned saved_queue_if_no_path;/* Saved state during suspension */
 
+	unsigned pg_timeout;		/* seconds to wait before switching PGs */
+	enum pg_timeout_state pg_timeout_status;
+	struct timer_list pg_timer;
+
 	struct work_struct process_queued_ios;
 	struct bio_list queued_ios;
 	unsigned queue_size;
@@ -106,6 +116,7 @@ static kmem_cache_t *_mpio_cache;
 struct workqueue_struct *kmultipathd;
 static void process_queued_ios(void *data);
 static void trigger_event(void *data);
+static void pg_timeout_event(unsigned long data);
 
 
 /*-----------------------------------------------
@@ -173,6 +184,9 @@ static struct multipath *alloc_multipath
 		INIT_LIST_HEAD(&m->priority_groups);
 		spin_lock_init(&m->lock);
 		m->queue_io = 1;
+		init_timer(&m->pg_timer);
+		m->pg_timer.function = pg_timeout_event;
+		m->pg_timer.data = (unsigned long)m;
 		INIT_WORK(&m->process_queued_ios, process_queued_ios, m);
 		INIT_WORK(&m->trigger_event, trigger_event, m);
 		m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
@@ -211,6 +225,28 @@ static void free_multipath(struct multip
  * Path selection
  *-----------------------------------------------*/
 
+static void pg_timeout_event(unsigned long data)
+{
+	unsigned long flags;
+	struct multipath *m = (struct multipath *)data;
+
+	spin_lock_irqsave(&m->lock, flags);
+
+	if (m->pg_timeout_status == PG_TIMEOUT_NONE)
+		goto out;
+
+	m->pg_timeout_status = PG_TIMEOUT_FAILED;
+
+	if (!m->pg_init_required && !m->pg_init_in_progress) {
+		m->queue_io = 0;
+		if (m->queue_size)
+			queue_work(kmultipathd, &m->process_queued_ios);
+	}
+
+out:
+	spin_unlock_irqrestore(&m->lock, flags);
+}
+
 static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
 {
 	struct hw_handler *hwh = &m->hw_handler;
@@ -256,12 +292,26 @@ static void __choose_pgpath(struct multi
 		pg = m->next_pg;
 		m->next_pg = NULL;
 		if (!__choose_path_in_pg(m, pg))
-			return;
+			goto out;
 	}
 
 	/* Don't change PG until it has no remaining paths */
-	if (m->current_pg && !__choose_path_in_pg(m, m->current_pg))
-		return;
+	if (m->current_pg) {
+		if (!__choose_path_in_pg(m, m->current_pg))
+			goto out;
+		if (m->pg_timeout) {
+			if (m->pg_timeout_status == PG_TIMEOUT_IN_PROGRESS)
+				return;
+			if (m->pg_timeout_status == PG_TIMEOUT_NONE) {
+				m->queue_io = 1;
+				m->current_pgpath = NULL;
+				m->pg_timeout_status = PG_TIMEOUT_IN_PROGRESS;
+				mod_timer(&m->pg_timer,
+					  jiffies + m->pg_timeout * HZ);
+				return;
+			}
+		}
+	}
 
 	/*
 	 * Loop through priority groups until we find a valid path.
@@ -273,13 +323,20 @@ static void __choose_pgpath(struct multi
 			if (pg->bypassed == bypassed)
 				continue;
 			if (!__choose_path_in_pg(m, pg))
-				return;
+				goto out;
 		}
 	} while (bypassed--);
 
 failed:
 	m->current_pgpath = NULL;
 	m->current_pg = NULL;
+	return;
+
+out:
+	if (m->pg_timeout_status != PG_TIMEOUT_NONE) {
+		del_timer(&m->pg_timer);
+		m->pg_timeout_status = PG_TIMEOUT_NONE;
+	}
 }
 
 /*
@@ -319,6 +376,7 @@ static int map_io(struct multipath *m, s
 		m->queue_size--;
 
 	if ((pgpath && m->queue_io) ||
+	    (m->pg_timeout_status == PG_TIMEOUT_IN_PROGRESS) ||
 	    (!pgpath && m->queue_if_no_path)) {
 		/* Queue for the daemon to resubmit */
 		bio_list_add(&m->queued_ios, bio);
@@ -419,7 +477,8 @@ static void process_queued_ios(void *dat
 	pgpath = m->current_pgpath;
 
 	if ((pgpath && !m->queue_io) ||
-	    (!pgpath && !m->queue_if_no_path))
+	    (!pgpath && !m->queue_if_no_path &&
+	     (m->pg_timeout_status != PG_TIMEOUT_IN_PROGRESS)))
 		must_queue = 0;
 
 	if (m->pg_init_required && !m->pg_init_in_progress) {
@@ -868,7 +927,9 @@ static int reinstate_path(struct pgpath 
 	pgpath->path.is_active = 1;
 
 	m->current_pgpath = NULL;
-	if (!m->nr_valid_paths++ && m->queue_size)
+	if ((!m->nr_valid_paths++ ||
+	     (m->pg_timeout_status == PG_TIMEOUT_IN_PROGRESS)) &&
+	    m->queue_size)
 		queue_work(kmultipathd, &m->process_queued_ios);
 
 	queue_work(kmultipathd, &m->trigger_event);
@@ -943,6 +1004,8 @@ static int switch_pg_num(struct multipat
 		m->current_pg = NULL;
 		m->next_pg = pg;
 	}
+	if ((m->pg_timeout_status == PG_TIMEOUT_IN_PROGRESS) && m->queue_size)
+		queue_work(kmultipathd, &m->process_queued_ios);
 	spin_unlock_irqrestore(&m->lock, flags);
 
 	queue_work(kmultipathd, &m->trigger_event);
@@ -973,6 +1036,40 @@ static int bypass_pg_num(struct multipat
 	return 0;
 }
 
+static int set_pg_timeout(struct multipath *m, const char *timeoutstr)
+{
+	unsigned timeout;
+	unsigned long flags;
+	enum pg_timeout_state status;
+
+	if (!timeoutstr || (sscanf(timeoutstr, "%u", &timeout) != 1)) {
+		DMWARN("invalid timeout number supplied to set_pg_timeout");
+		return -EINVAL;
+	}
+	spin_lock_irqsave(&m->lock, flags);
+	m->pg_timeout = timeout;
+
+	if (timeout)
+		goto out;
+
+	status = m->pg_timeout_status;
+	m->pg_timeout_status = PG_TIMEOUT_NONE;
+	if (status != PG_TIMEOUT_IN_PROGRESS)
+		goto out;
+
+	del_timer(&m->pg_timer);
+	if (!m->pg_init_required && !m->pg_init_in_progress) {
+		m->queue_io = 0;
+		if (m->queue_size)
+			queue_work(kmultipathd,
+				   &m->process_queued_ios);
+	}
+
+out:
+	spin_unlock_irqrestore(&m->lock, flags);
+	return 0;
+}
+
 /*
  * pg_init must call this when it has completed its initialisation
  */
@@ -997,7 +1094,8 @@ void dm_pg_init_complete(struct dm_path 
 	if (err_flags) {
 		m->current_pgpath = NULL;
 		m->current_pg = NULL;
-	} else if (!m->pg_init_required)
+	} else if (!m->pg_init_required &&
+		   (m->pg_timeout_status != PG_TIMEOUT_IN_PROGRESS))
 		m->queue_io = 0;
 
 	m->pg_init_in_progress = 0;
@@ -1257,6 +1355,8 @@ static int multipath_message(struct dm_t
 		return bypass_pg_num(m, argv[1], 0);
 	else if (!strnicmp(argv[0], MESG_STR("switch_group")))
 		return switch_pg_num(m, argv[1]);
+	else if (!strnicmp(argv[0], MESG_STR("set_pg_timeout")))
+		return set_pg_timeout(m, argv[1]);
 	else if (!strnicmp(argv[0], MESG_STR("reinstate_path")))
 		action = reinstate_path;
 	else if (!strnicmp(argv[0], MESG_STR("fail_path")))