From: David Mosberger <davidm@napali.hpl.hp.com>

Somebody recently pointed out a performance anomaly to me where an unusual
amount of time was being spent reading from /dev/urandom.  The problem
isn't really surprising, as it happened only on >= 4-way machines and the
random driver isn't terribly scalable the way it is written today.  If
scalability _really_ mattered, I suppose per-CPU data structures would be
the way to go.  However, I found that at least for 4-way machines,
performance can be improved considerably with the attached patch.  In
particular, I saw the following performance on a 4-way ia64 machine:

Test: 3 tasks running "dd if=/dev/urandom of=/dev/null bs=1024":

			throughput:
			-----------
 original driver:	2.2 GB/sec
 patched driver:	2.3 GB/sec

Test: 4 tasks running "dd if=/dev/urandom of=/dev/null bs=1024":

			throughput:
			-----------
 original driver:	0.4 GB/sec
 patched driver:	1.9 GB/sec

In words: a slight improvement when there is little lock contention and a
huge improvement when there is significant lock contention.

One reason for the scalability improvement comes from the reorganization of
"struct entropy_store".  Basically, the patch separates read-only data from
read-write data.  I also tried putting the spinlock in its own cacheline,
but that reduced performance slightly.  My theory is that co-locating the
other read-write data with the lock improves overall throughput at the cost
of some extra bus traffic (every time any read-write data is updated, the
other CPUs spinning on the lock will re-fetch the data, even though the
lock will remain taken).
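
The layout idea looks roughly like this (a sketch with illustrative field
names, not the driver's actual struct):

	struct example_state {
		/* mostly-read: written at init time, only read afterwards */
		const __u32	*taps;
		__u32		*buf;

		/*
		 * read-write: the lock starts a fresh cacheline and shares
		 * it with the fields it protects, so the CPU that wins the
		 * lock pulls the hot data over in the same cacheline
		 * transfer.  (Putting the lock alone in its own cacheline
		 * was the variant that measured slightly slower.)
		 */
		spinlock_t	lock ____cacheline_aligned_in_smp;
		unsigned int	head;
		int		count;
	};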

The other reason for the scalability improvement is the prefetching of the
pool[] data: the underlying cache-lines almost certainly will have been
dirtied by the other CPUs, so accesses to them would otherwise miss in the
cache.
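
The usage pattern is simply to issue the prefetches right after taking the
lock, so the cacheline fills from the other CPUs' caches overlap with the
first loop iterations instead of stalling them.  In sketch form (made-up
names; the real call site is in the hunk below):

	spin_lock_irqsave(&s->lock, flags);
	prefetch_range(s->buf, nbytes);	/* start pulling remote-dirty lines in */
	for (i = 0; i < nwords; i++)
		mix_word(s, in[i]);	/* hypothetical per-word mixing step */
	spin_unlock_irqrestore(&s->lock, flags);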

The rest of the patch is fairly obvious streamlining of the code (such as
read-ahead of the input data, etc.).  I didn't measure the impact of these
changes separately, but since they make life easier for the compiler, they
should help most, if not all, platforms.
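
The read-ahead in particular is the usual software-pipelining trick: load
the next input word while the current one is still being mixed, so the load
latency hides behind the rotate/XOR work.  Roughly (generic names, not the
exact driver code; like the patch, it assumes nwords >= 1):

	next = *in++;			/* first load happens before the loop */
	while (nwords--) {
		cur = next;
		if (nwords > 0)
			next = *in++;	/* next load overlaps the work below */
		mix(pool, cur);		/* hypothetical mixing step */
	}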


---

 25-akpm/drivers/char/random.c    |   51 ++++++++++++++++++++++++++-------------
 25-akpm/include/linux/prefetch.h |   12 +++++++++
 2 files changed, 47 insertions(+), 16 deletions(-)

diff -puN drivers/char/random.c~urandom-scalability-fix drivers/char/random.c
--- 25/drivers/char/random.c~urandom-scalability-fix	2004-03-28 13:43:40.745764704 -0800
+++ 25-akpm/drivers/char/random.c	2004-03-28 13:43:40.750763944 -0800
@@ -490,12 +490,15 @@ static inline __u32 int_ln_12bits(__u32 
  **********************************************************************/
 
 struct entropy_store {
+	/* mostly-read data: */
+	struct poolinfo poolinfo;
+	__u32		*pool;
+
+	/* read-write data: */
+	spinlock_t lock ____cacheline_aligned_in_smp;
 	unsigned	add_ptr;
 	int		entropy_count;
 	int		input_rotate;
-	struct poolinfo poolinfo;
-	__u32		*pool;
-	spinlock_t lock;
 };
 
 /*
@@ -571,38 +574,54 @@ static void add_entropy_words(struct ent
 	static __u32 const twist_table[8] = {
 		         0, 0x3b6e20c8, 0x76dc4190, 0x4db26158,
 		0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278 };
-	unsigned i;
-	int new_rotate;
+	unsigned long i, add_ptr, tap1, tap2, tap3, tap4, tap5;
+	int new_rotate, input_rotate;
 	int wordmask = r->poolinfo.poolwords - 1;
-	__u32 w;
+	__u32 w, next_w;
 	unsigned long flags;
 
+	/* Taps are constant, so we can load them without holding r->lock.  */
+	tap1 = r->poolinfo.tap1;
+	tap2 = r->poolinfo.tap2;
+	tap3 = r->poolinfo.tap3;
+	tap4 = r->poolinfo.tap4;
+	tap5 = r->poolinfo.tap5;
+	next_w = *in++;
+
 	spin_lock_irqsave(&r->lock, flags);
+	prefetch_range(r->pool, wordmask);
+	input_rotate = r->input_rotate;
+	add_ptr = r->add_ptr;
 
 	while (nwords--) {
-		w = rotate_left(r->input_rotate, *in++);
-		i = r->add_ptr = (r->add_ptr - 1) & wordmask;
+		w = rotate_left(input_rotate, next_w);
+		if (nwords > 0)
+			next_w = *in++;
+		i = add_ptr = (add_ptr - 1) & wordmask;
 		/*
 		 * Normally, we add 7 bits of rotation to the pool.
 		 * At the beginning of the pool, add an extra 7 bits
 		 * rotation, so that successive passes spread the
 		 * input bits across the pool evenly.
 		 */
-		new_rotate = r->input_rotate + 14;
+		new_rotate = input_rotate + 14;
 		if (i)
-			new_rotate = r->input_rotate + 7;
-		r->input_rotate = new_rotate & 31;
+			new_rotate = input_rotate + 7;
+		input_rotate = new_rotate & 31;
 
 		/* XOR in the various taps */
-		w ^= r->pool[(i + r->poolinfo.tap1) & wordmask];
-		w ^= r->pool[(i + r->poolinfo.tap2) & wordmask];
-		w ^= r->pool[(i + r->poolinfo.tap3) & wordmask];
-		w ^= r->pool[(i + r->poolinfo.tap4) & wordmask];
-		w ^= r->pool[(i + r->poolinfo.tap5) & wordmask];
+		w ^= r->pool[(i + tap1) & wordmask];
+		w ^= r->pool[(i + tap2) & wordmask];
+		w ^= r->pool[(i + tap3) & wordmask];
+		w ^= r->pool[(i + tap4) & wordmask];
+		w ^= r->pool[(i + tap5) & wordmask];
 		w ^= r->pool[i];
 		r->pool[i] = (w >> 3) ^ twist_table[w & 7];
 	}
 
+	r->input_rotate = input_rotate;
+	r->add_ptr = add_ptr;
+
 	spin_unlock_irqrestore(&r->lock, flags);
 }
 
diff -puN include/linux/prefetch.h~urandom-scalability-fix include/linux/prefetch.h
--- 25/include/linux/prefetch.h~urandom-scalability-fix	2004-03-28 13:43:40.746764552 -0800
+++ 25-akpm/include/linux/prefetch.h	2004-03-28 15:12:25.559270008 -0800
@@ -10,6 +10,7 @@
 #ifndef _LINUX_PREFETCH_H
 #define _LINUX_PREFETCH_H
 
+#include <linux/types.h>
 #include <asm/processor.h>
 #include <asm/cache.h>
 
@@ -54,4 +55,15 @@ static inline void prefetchw(const void 
 #define PREFETCH_STRIDE (4*L1_CACHE_BYTES)
 #endif
 
+static inline void prefetch_range(void *addr, size_t len)
+{
+#ifdef ARCH_HAS_PREFETCH
+	char *cp;
+	char *end = addr + len;
+
+	for (cp = addr; cp < end; cp += PREFETCH_STRIDE)
+		prefetch(cp);
+#endif
+}
+
 #endif

_