From: William Lee Irwin III <wli@holomorphy.com>

Move the slow paths of wait_on_bit() and wait_on_bit_lock() out of line.
Also uninline wake_up_bit() to cut down the code generated at its call
sites, and adjust the loop startup in __wait_on_bit_lock() to reflect that
it is now only entered in the contention case, i.e. after the caller's
inline test_and_set_bit() has already found the bit set.
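
For illustration only (not part of this patch): a minimal sketch of a
hypothetical caller of the bit-wait API, where my_flags, MY_BIT, my_lock,
my_unlock and my_wait_action are made-up names.  The inline wrappers keep
the uncontended test_bit()/test_and_set_bit() in the caller and only call
into kernel/wait.c when the bit is actually contended; the wake side pairs
clear_bit() with the now out-of-line wake_up_bit().

	#include <linux/wait.h>
	#include <linux/sched.h>
	#include <linux/bitops.h>

	#define MY_BIT	0	/* made-up bit number for this sketch */

	/* Hypothetical action callback: sleep until woken via wake_up_bit(). */
	static int my_wait_action(void *word)
	{
		schedule();
		return 0;
	}

	static void my_lock(unsigned long *my_flags)
	{
		/*
		 * Fast path: inline test_and_set_bit().  Only if the bit is
		 * already set does this call out_of_line_wait_on_bit_lock()
		 * and take the hashed-waitqueue slow path.
		 */
		wait_on_bit_lock(my_flags, MY_BIT, my_wait_action,
					TASK_UNINTERRUPTIBLE);
	}

	static void my_unlock(unsigned long *my_flags)
	{
		clear_bit(MY_BIT, my_flags);
		smp_mb__after_clear_bit();	/* order the clear before the wakeup */
		wake_up_bit(my_flags, MY_BIT);	/* now a real function in kernel/wait.c */
	}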

This is incremental atop the fastcall and wait_on_bit_lock()/test_and_set_bit()
fixes, and has been tested successfully on x86-64.

Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 25-akpm/include/linux/wait.h |   32 +++++---------------------------
 25-akpm/kernel/wait.c        |   40 ++++++++++++++++++++++++++++++++++++++--
 2 files changed, 43 insertions(+), 29 deletions(-)

diff -puN include/linux/wait.h~move-wait-ops-contention-case-completely-out-of-line include/linux/wait.h
--- 25/include/linux/wait.h~move-wait-ops-contention-case-completely-out-of-line	Tue Aug 31 14:45:08 2004
+++ 25-akpm/include/linux/wait.h	Tue Aug 31 14:45:08 2004
@@ -142,23 +142,11 @@ extern void FASTCALL(__wake_up_sync(wait
 void FASTCALL(__wake_up_bit(wait_queue_head_t *, void *, int));
 int FASTCALL(__wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, void *, int, int (*)(void *), unsigned));
 int FASTCALL(__wait_on_bit_lock(wait_queue_head_t *, struct wait_bit_queue *, void *, int, int (*)(void *), unsigned));
+void FASTCALL(wake_up_bit(void *, int));
+int FASTCALL(out_of_line_wait_on_bit(void *, int, int (*)(void *), unsigned));
+int FASTCALL(out_of_line_wait_on_bit_lock(void *, int, int (*)(void *), unsigned));
 wait_queue_head_t *FASTCALL(bit_waitqueue(void *, int));
 
-/**
- * wake_up_bit - wake up a waiter on a bit
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
- *
- * There is a standard hashed waitqueue table for generic use. This
- * is the part of the hashtable's accessor API that wakes up waiters
- * on a bit. For instance, if one were to have waiters on a bitflag,
- * one would call wake_up_bit() after clearing the bit.
- */
-static inline void wake_up_bit(void *word, int bit)
-{
-	__wake_up_bit(bit_waitqueue(word, bit), word, bit);
-}
-
 #define wake_up(x)			__wake_up(x, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1, NULL)
 #define wake_up_nr(x, nr)		__wake_up(x, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr, NULL)
 #define wake_up_all(x)			__wake_up(x, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0, NULL)
@@ -356,14 +344,9 @@ int wake_bit_function(wait_queue_t *wait
 static inline int wait_on_bit(void *word, int bit,
 				int (*action)(void *), unsigned mode)
 {
-	DEFINE_WAIT_BIT(q, word, bit);
-	wait_queue_head_t *wqh;
-
 	if (!test_bit(bit, word))
 		return 0;
-
-	wqh = bit_waitqueue(word, bit);
-	return __wait_on_bit(wqh, &q, word, bit, action, mode);
+	return out_of_line_wait_on_bit(word, bit, action, mode);
 }
 
 /**
@@ -385,14 +368,9 @@ static inline int wait_on_bit(void *word
 static inline int wait_on_bit_lock(void *word, int bit,
 				int (*action)(void *), unsigned mode)
 {
-	DEFINE_WAIT_BIT(q, word, bit);
-	wait_queue_head_t *wqh;
-
 	if (!test_and_set_bit(bit, word))
 		return 0;
-
-	wqh = bit_waitqueue(word, bit);
-	return __wait_on_bit_lock(wqh, &q, word, bit, action, mode);
+	return out_of_line_wait_on_bit_lock(word, bit, action, mode);
 }
 	
 #endif /* __KERNEL__ */
diff -puN kernel/wait.c~move-wait-ops-contention-case-completely-out-of-line kernel/wait.c
--- 25/kernel/wait.c~move-wait-ops-contention-case-completely-out-of-line	Tue Aug 31 14:45:08 2004
+++ 25-akpm/kernel/wait.c	Tue Aug 31 14:47:04 2004
@@ -164,24 +164,44 @@ __wait_on_bit(wait_queue_head_t *wq, str
 }
 EXPORT_SYMBOL(__wait_on_bit);
 
+int __sched fastcall out_of_line_wait_on_bit(void *word, int bit,
+					int (*action)(void *), unsigned mode)
+{
+	wait_queue_head_t *wq = bit_waitqueue(word, bit);
+	DEFINE_WAIT_BIT(wait, word, bit);
+
+	return __wait_on_bit(wq, &wait, word, bit, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_bit);
+
 int __sched fastcall
 __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
 		void *word, int bit, int (*action)(void *), unsigned mode)
 {
 	int ret = 0;
 
-	while (test_and_set_bit(bit, word)) {
+	do {
 		prepare_to_wait_exclusive(wq, &q->wait, mode);
 		if (test_bit(bit, word)) {
 			if ((ret = (*action)(word)))
 				break;
 		}
-	}
+	} while (test_and_set_bit(bit, word));
 	finish_wait(wq, &q->wait);
 	return ret;
 }
 EXPORT_SYMBOL(__wait_on_bit_lock);
 
+int __sched fastcall out_of_line_wait_on_bit_lock(void *word, int bit,
+					int (*action)(void *), unsigned mode)
+{
+	wait_queue_head_t *wq = bit_waitqueue(word, bit);
+	DEFINE_WAIT_BIT(wait, word, bit);
+
+	return __wait_on_bit_lock(wq, &wait, word, bit, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
+
 void fastcall __wake_up_bit(wait_queue_head_t *wq, void *word, int bit)
 {
 	struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
@@ -190,6 +210,22 @@ void fastcall __wake_up_bit(wait_queue_h
 }
 EXPORT_SYMBOL(__wake_up_bit);
 
+/**
+ * wake_up_bit - wake up a waiter on a bit
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ *
+ * There is a standard hashed waitqueue table for generic use. This
+ * is the part of the hashtable's accessor API that wakes up waiters
+ * on a bit. For instance, if one were to have waiters on a bitflag,
+ * one would call wake_up_bit() after clearing the bit.
+ */
+void fastcall wake_up_bit(void *word, int bit)
+{
+	__wake_up_bit(bit_waitqueue(word, bit), word, bit);
+}
+EXPORT_SYMBOL(wake_up_bit);
+
 fastcall wait_queue_head_t *bit_waitqueue(void *word, int bit)
 {
 	const int shift = BITS_PER_LONG == 32 ? 5 : 6;
_