diff -urN 2.4.0-test1-ac7/arch/i386/mm/init.c 2.4.0-test1-ac7-VM-31/arch/i386/mm/init.c
--- 2.4.0-test1-ac7/arch/i386/mm/init.c	Fri May 26 22:46:46 2000
+++ 2.4.0-test1-ac7-VM-31/arch/i386/mm/init.c	Sat Jun  3 14:53:14 2000
@@ -608,7 +608,7 @@
 	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
 
 	printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
-		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
+		nr_free_pages() << (PAGE_SHIFT-10),
 		max_mapnr << (PAGE_SHIFT-10),
 		codesize >> 10,
 		reservedpages << (PAGE_SHIFT-10),
diff -urN 2.4.0-test1-ac7/fs/buffer.c 2.4.0-test1-ac7-VM-31/fs/buffer.c
--- 2.4.0-test1-ac7/fs/buffer.c	Sat Jun  3 14:52:33 2000
+++ 2.4.0-test1-ac7-VM-31/fs/buffer.c	Sat Jun  3 15:17:06 2000
@@ -495,17 +495,6 @@
 	__remove_from_lru_list(bh, bh->b_list);
 }
 
-static void insert_into_queues(struct buffer_head *bh)
-{
-	struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
-
-	spin_lock(&lru_list_lock);
-	write_lock(&hash_table_lock);
-	__hash_link(bh, head);
-	__insert_into_lru_list(bh, bh->b_list);
-	write_unlock(&hash_table_lock);
-	spin_unlock(&lru_list_lock);
-}
 
 /* This function must only run if there are no other
  * references _anywhere_ to this buffer head.
@@ -537,12 +526,11 @@
  * will force it bad). This shouldn't really happen currently, but
  * the code is ready.
  */
-struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
+static struct buffer_head * __get_hash_table(kdev_t dev, int block, int size,
+						struct buffer_head **head)
 {
-	struct buffer_head **head = &hash(dev, block);
 	struct buffer_head *bh;
 
-	read_lock(&hash_table_lock);
 	for(bh = *head; bh; bh = bh->b_next)
 		if (bh->b_blocknr == block	&&
 		    bh->b_size    == size	&&
@@ -550,11 +538,45 @@
 			break;
 	if (bh)
 		atomic_inc(&bh->b_count);
+
+	return bh;
+}
+
+struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
+{
+	struct buffer_head **head = &hash(dev, block);
+	struct buffer_head *bh;
+
+	read_lock(&hash_table_lock);
+	bh = __get_hash_table(dev, block, size, head);
 	read_unlock(&hash_table_lock);
 
 	return bh;
 }
 
+static int insert_into_queues_unique(struct buffer_head *bh)
+{
+	struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
+	struct buffer_head *alias;
+	int err = 0;
+
+	spin_lock(&lru_list_lock);
+	write_lock(&hash_table_lock);
+
+	alias = __get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size, head);
+	err = 1;
+	if (!alias) {
+		__hash_link(bh, head);
+		__insert_into_lru_list(bh, bh->b_list);
+		err = 0;
+	}
+
+	write_unlock(&hash_table_lock);
+	spin_unlock(&lru_list_lock);
+
+	return err;
+}
+
 unsigned int get_hardblocksize(kdev_t dev)
 {
 	/*
@@ -713,6 +735,7 @@
 static void refill_freelist(int size)
 {
 	if (!grow_buffers(size)) {
+		current->state = TASK_RUNNING;
 		wakeup_bdflush(1);
 		current->policy |= SCHED_YIELD;
 		schedule();
@@ -841,8 +864,16 @@
 		bh->b_blocknr = block;
 		bh->b_state = 1 << BH_Mapped;
 
-		/* Insert the buffer into the regular lists */
-		insert_into_queues(bh);
+		/* Insert the buffer into the regular lists; check that
+		   no one else added it first */
+		
+		if (!insert_into_queues_unique(bh))
+			goto out;
+
+		/* someone added it after we last checked the hash table */
+		put_last_free(bh);
+		goto repeat;
+	
 	out:
 		touch_buffer(bh);
 		return bh;
@@ -858,7 +889,7 @@
 
 /* -1 -> no need to flush
     0 -> async flush
-    1 -> sync flush (wait for I/O completation) */
+    1 -> sync flush (wait for I/O completion) */
 static int balance_dirty_state(kdev_t dev)
 {
 	unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
@@ -1235,7 +1266,7 @@
 	goto try_again;
 }
 
-static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size)
+static void create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size)
 {
 	struct buffer_head *head, *bh, *tail;
 	int block;
@@ -1247,6 +1278,7 @@
 	 * They don't show up in the buffer hash table, but they *are*
 	 * registered in page->buffers.
 	 */
+	/* FIXME: create_buffers should fail if there's not enough memory */
 	head = create_buffers(page, size, 1);
 	if (page->buffers)
 		BUG();
@@ -1266,7 +1298,7 @@
 	tail->b_this_page = head;
 	page_cache_get(page);
 	page->buffers = head;
-	return 0;
+	lru_cache_buf(page, LRU_SWAP_CACHE);
 }
 
 static void unmap_buffer(struct buffer_head * bh)
@@ -1289,44 +1321,33 @@
  * any IO, we are not interested in the contents of the buffer.  This
  * function can block if the buffer is locked.
  */
-static struct buffer_head *discard_buffer(struct buffer_head * bh)
+static inline struct buffer_head *discard_buffer(struct buffer_head * bh)
 {
-	int index = BUFSIZE_INDEX(bh->b_size);
 	struct buffer_head *next;
 
-	/* grab the lru lock here to block bdflush. */
-	atomic_inc(&bh->b_count);
-	lock_buffer(bh);
+	if (bh->b_dev == B_FREE)
+		BUG();
+
 	next = bh->b_this_page;
-	clear_bit(BH_Uptodate, &bh->b_state);
-	clear_bit(BH_Mapped, &bh->b_state);
-	clear_bit(BH_Req, &bh->b_state);
-	clear_bit(BH_New, &bh->b_state);
+
+	unmap_buffer(bh);
 
 	spin_lock(&lru_list_lock);
 	write_lock(&hash_table_lock);
-	spin_lock(&free_list[index].lock);
 	spin_lock(&unused_list_lock);
 
-	if (!atomic_dec_and_test(&bh->b_count))
+	if (atomic_read(&bh->b_count))
 		BUG();
 
 	__hash_unlink(bh);
-	/* The bunffer can be either on the regular
-	 * queues or on the free list..
-	 */
-	if (bh->b_dev != B_FREE)
-		__remove_from_queues(bh);
-	else
-		__remove_from_free_list(bh, index);
-	__put_unused_buffer_head(bh);	
-	spin_unlock(&unused_list_lock);
 	write_unlock(&hash_table_lock);
-	spin_unlock(&free_list[index].lock);
+
+	__remove_from_queues(bh);
 	spin_unlock(&lru_list_lock);
-	/* We can unlock the buffer, we have just returned it.
-	 * Ditto for the counter 
-         */
+
+	__put_unused_buffer_head(bh);	
+	spin_unlock(&unused_list_lock);
+
 	return next;
 }
 
@@ -1400,6 +1421,7 @@
 	/* And free the page */
 	page->buffers = NULL;
 	page_cache_release(page);
+	lru_cache_unbuf(page);
 }
 
 static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
@@ -1421,6 +1443,7 @@
 	tail->b_this_page = head;
 	page->buffers = head;
 	page_cache_get(page);
+	lru_cache_buf(page, LRU_NORMAL_CACHE);
 }
 
 static void unmap_underlying_metadata(struct buffer_head * bh)
@@ -1868,6 +1891,7 @@
 	}
 	
 	spin_unlock(&unused_list_lock);
+	wake_up(&buffer_wait);
 
 	return iosize;
 }
@@ -2004,6 +2028,8 @@
 		__put_unused_buffer_head(bh[bhind]);
 	}
 	spin_unlock(&unused_list_lock);
+	wake_up(&buffer_wait);
+
 	goto finished;
 }
 
@@ -2170,7 +2196,8 @@
 	spin_unlock(&free_list[isize].lock);
 
 	page->buffers = bh;
-	lru_cache_add(page);
+	page->flags &= ~(1 << PG_referenced);
+	lru_cache_add(page, LRU_NORMAL_CACHE);
 	atomic_inc(&buffermem_pages);
 	return 1;
 
@@ -2181,35 +2208,29 @@
 }
 
 /*
- * Sync all the buffers on one page..
- *
- * If we have old buffers that are locked, we'll
- * wait on them, but we won't wait on the new ones
- * we're writing out now.
- *
- * This all is required so that we can free up memory
- * later.
+ * Can the buffer be thrown out?
  */
-static void sync_page_buffers(struct buffer_head *bh, int wait)
+#define BUFFER_BUSY_BITS	((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
+#define buffer_busy(bh)		(atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
+
+static int sync_page_buffers(struct buffer_head * bh)
 {
 	struct buffer_head * tmp = bh;
 
 	do {
-		struct buffer_head *p = tmp;
+		if (buffer_dirty(tmp) && !buffer_locked(tmp))
+			ll_rw_block(WRITE, 1, &tmp);
 		tmp = tmp->b_this_page;
-		if (buffer_locked(p)) {
-			if (wait)
-				__wait_on_buffer(p);
-		} else if (buffer_dirty(p))
-			ll_rw_block(WRITE, 1, &p);
 	} while (tmp != bh);
-}
 
-/*
- * Can the buffer be thrown out?
- */
-#define BUFFER_BUSY_BITS	((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
-#define buffer_busy(bh)		(atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
+	do {
+		if (buffer_busy(tmp))
+			return 1;
+		tmp = tmp->b_this_page;
+	} while (tmp != bh);
+
+	return 0;
+}
 
 /*
  * try_to_free_buffers() checks if all the buffers on this particular page
@@ -2222,21 +2243,20 @@
  *       obtain a reference to a buffer head within a page.  So we must
  *	 lock out all of these paths to cleanly toss the page.
  */
-int try_to_free_buffers(struct page * page, int wait)
+int try_to_free_buffers(struct page * page)
 {
 	struct buffer_head * tmp, * bh = page->buffers;
 	int index = BUFSIZE_INDEX(bh->b_size);
 
+ again:
 	spin_lock(&lru_list_lock);
 	write_lock(&hash_table_lock);
 	spin_lock(&free_list[index].lock);
 	tmp = bh;
 	do {
-		struct buffer_head *p = tmp;
-
-		tmp = tmp->b_this_page;
-		if (buffer_busy(p))
+		if (buffer_busy(tmp))
 			goto busy_buffer_page;
+		tmp = tmp->b_this_page;
 	} while (tmp != bh);
 
 	spin_lock(&unused_list_lock);
@@ -2272,7 +2292,8 @@
 	spin_unlock(&free_list[index].lock);
 	write_unlock(&hash_table_lock);
 	spin_unlock(&lru_list_lock);	
-	sync_page_buffers(bh, wait);
+	if (!sync_page_buffers(bh))
+		goto again;
 	return 0;
 }
 
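The getblk() change above closes a race: two CPUs can both miss the hash
lookup and both allocate a buffer head for the same block, so the insert now
re-checks the hash chain while holding the write lock and tells the caller to
put its copy back on the free list and retry. Below is a minimal userspace
sketch of that check-then-insert pattern, assuming POSIX threads; every name
is invented for illustration and none of this is the kernel code.

/* illustrative sketch only, not from the patch */
#include <pthread.h>
#include <stdlib.h>
#include <stdio.h>

struct entry {
	int key;
	struct entry *next;
};

static struct entry *head;
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

/* must be called with table_lock held */
static struct entry *lookup_locked(int key)
{
	struct entry *e;

	for (e = head; e; e = e->next)
		if (e->key == key)
			return e;
	return NULL;
}

/* returns 0 if inserted, 1 if an alias already existed */
static int insert_unique(struct entry *new)
{
	int err = 1;

	pthread_mutex_lock(&table_lock);
	if (!lookup_locked(new->key)) {
		new->next = head;
		head = new;
		err = 0;
	}
	pthread_mutex_unlock(&table_lock);
	return err;
}

int main(void)
{
	struct entry *a = malloc(sizeof(*a)), *b = malloc(sizeof(*b));

	a->key = b->key = 42;
	printf("first insert:  %d\n", insert_unique(a));	/* 0 */
	printf("second insert: %d\n", insert_unique(b));	/* 1: drop b, retry lookup */
	free(b);
	return 0;
}

The point mirrored from the patch is that the lookup and the insert happen
under one lock acquisition, so a duplicate can never slip in between them.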
diff -urN 2.4.0-test1-ac7/fs/dcache.c 2.4.0-test1-ac7-VM-31/fs/dcache.c
--- 2.4.0-test1-ac7/fs/dcache.c	Fri May 26 22:47:00 2000
+++ 2.4.0-test1-ac7-VM-31/fs/dcache.c	Sat Jun  3 14:53:14 2000
@@ -512,21 +512,22 @@
  *  ...
  *   6 - base-level: try to shrink a bit.
  */
-int shrink_dcache_memory(int priority, unsigned int gfp_mask)
+int shrink_dcache_memory(int priority, unsigned int gfp_mask, zone_t * zone)
 {
-	int count = 0;
-	lock_kernel();
-	if (priority)
-		count = dentry_stat.nr_unused / priority;
-	prune_dcache(count);
-	unlock_kernel();
-	/* FIXME: kmem_cache_shrink here should tell us
-	   the number of pages freed, and it should
-	   work in a __GFP_DMA/__GFP_HIGHMEM behaviour
-	   to free only the interesting pages in
-	   function of the needs of the current allocation. */
-	kmem_cache_shrink(dentry_cache);
-
+	if (gfp_mask & __GFP_IO) {
+		int count = 0;
+		lock_kernel();
+		if (priority)
+			count = dentry_stat.nr_unused / priority;
+		prune_dcache(count);
+		unlock_kernel();
+		/* FIXME: kmem_cache_shrink here should tell us
+		   the number of pages freed, and it should
+		   work in a __GFP_DMA/__GFP_HIGHMEM behaviour
+		   to free only the interesting pages in
+		   function of the needs of the current allocation. */
+		kmem_cache_shrink(dentry_cache);
+	}
 	return 0;
 }
 
diff -urN 2.4.0-test1-ac7/fs/exec.c 2.4.0-test1-ac7-VM-31/fs/exec.c
--- 2.4.0-test1-ac7/fs/exec.c	Sat Jun  3 14:52:34 2000
+++ 2.4.0-test1-ac7-VM-31/fs/exec.c	Sat Jun  3 14:53:14 2000
@@ -266,6 +266,7 @@
 		return;
 	}
 	flush_page_to_ram(page);
+	page_anon_init_map_wmb(page);
 	set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY))));
 /* no need for flush_tlb */
 }
@@ -308,6 +309,7 @@
 		if (bprm->page[i]) {
 			current->mm->rss++;
 			put_dirty_page(current,bprm->page[i],stack_base);
+			bprm->page[i] = NULL;
 		}
 		stack_base += PAGE_SIZE;
 	}
@@ -860,9 +862,11 @@
 
 	/* Assumes that free_page() can take a NULL argument. */ 
 	/* I hope this is ok for all architectures */ 
-	for (i = 0 ; i < MAX_ARG_PAGES ; i++)
-		if (bprm.page[i])
-			__free_page(bprm.page[i]);
+	for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
+		struct page * page = bprm.page[i];
+		if (page)
+			__free_page(page);
+	}
 
 	return retval;
 }
diff -urN 2.4.0-test1-ac7/fs/inode.c 2.4.0-test1-ac7-VM-31/fs/inode.c
--- 2.4.0-test1-ac7/fs/inode.c	Sat Jun  3 14:52:34 2000
+++ 2.4.0-test1-ac7-VM-31/fs/inode.c	Sat Jun  3 14:53:14 2000
@@ -450,20 +450,21 @@
 	dispose_list(freeable);
 }
 
-int shrink_icache_memory(int priority, int gfp_mask)
+int shrink_icache_memory(int priority, int gfp_mask, zone_t *zone)
 {
-	int count = 0;
+	if (gfp_mask & __GFP_IO) {
+		int count = 0;
 		
-	if (priority)
-		count = inodes_stat.nr_unused / priority;
-	prune_icache(count);
-	/* FIXME: kmem_cache_shrink here should tell us
-	   the number of pages freed, and it should
-	   work in a __GFP_DMA/__GFP_HIGHMEM behaviour
-	   to free only the interesting pages in
-	   function of the needs of the current allocation. */
-	kmem_cache_shrink(inode_cachep);
-
+		if (priority)
+			count = inodes_stat.nr_unused / priority;
+		prune_icache(count);
+		/* FIXME: kmem_cache_shrink here should tell us
+		   the number of pages freed, and it should
+		   work in a __GFP_DMA/__GFP_HIGHMEM behaviour
+		   to free only the interesting pages in
+		   function of the needs of the current allocation. */
+		kmem_cache_shrink(inode_cachep);
+	}
 	return 0;
 }
 
diff -urN 2.4.0-test1-ac7/include/linux/cache.h 2.4.0-test1-ac7-VM-31/include/linux/cache.h
--- 2.4.0-test1-ac7/include/linux/cache.h	Sun May 28 20:52:34 2000
+++ 2.4.0-test1-ac7-VM-31/include/linux/cache.h	Sat Jun  3 15:45:16 2000
@@ -1,6 +1,7 @@
 #ifndef __LINUX_CACHE_H
 #define __LINUX_CACHE_H
 
+#include <linux/config.h>
 #include <asm/cache.h>
 
 #ifndef L1_CACHE_ALIGN
@@ -13,6 +14,14 @@
 
 #ifndef ____cacheline_aligned
 #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES)))
+#endif
+
+#ifndef ____cacheline_aligned_in_smp
+#ifdef CONFIG_SMP
+#define ____cacheline_aligned_in_smp ____cacheline_aligned
+#else
+#define ____cacheline_aligned_in_smp
+#endif /* CONFIG_SMP */
 #endif
 
 #ifndef __cacheline_aligned
diff -urN 2.4.0-test1-ac7/include/linux/dcache.h 2.4.0-test1-ac7-VM-31/include/linux/dcache.h
--- 2.4.0-test1-ac7/include/linux/dcache.h	Sun May 28 20:52:34 2000
+++ 2.4.0-test1-ac7-VM-31/include/linux/dcache.h	Sat Jun  3 15:45:15 2000
@@ -150,11 +150,11 @@
 #define shrink_dcache() prune_dcache(0)
 struct zone_struct;
 /* dcache memory management */
-extern int shrink_dcache_memory(int, unsigned int);
+extern int shrink_dcache_memory(int, unsigned int, struct zone_struct *);
 extern void prune_dcache(int);
 
 /* icache memory management (defined in linux/fs/inode.c) */
-extern int shrink_icache_memory(int, int);
+extern int shrink_icache_memory(int, int, struct zone_struct *);
 extern void prune_icache(int);
 
 /* only used at mount-time */
diff -urN 2.4.0-test1-ac7/include/linux/fs.h 2.4.0-test1-ac7-VM-31/include/linux/fs.h
--- 2.4.0-test1-ac7/include/linux/fs.h	Sat Jun  3 14:52:34 2000
+++ 2.4.0-test1-ac7-VM-31/include/linux/fs.h	Sat Jun  3 15:45:16 2000
@@ -910,7 +910,7 @@
 
 extern int fs_may_remount_ro(struct super_block *);
 
-extern int try_to_free_buffers(struct page *, int);
+extern int try_to_free_buffers(struct page *);
 extern void refile_buffer(struct buffer_head * buf);
 
 #define BUF_CLEAN	0
diff -urN 2.4.0-test1-ac7/include/linux/highmem.h 2.4.0-test1-ac7-VM-31/include/linux/highmem.h
--- 2.4.0-test1-ac7/include/linux/highmem.h	Sun May 28 20:52:35 2000
+++ 2.4.0-test1-ac7-VM-31/include/linux/highmem.h	Sat Jun  3 15:45:16 2000
@@ -12,7 +12,7 @@
 
 /* declarations for linux/mm/highmem.c */
 extern unsigned long highmem_mapnr;
-FASTCALL(unsigned int nr_free_highpages(void));
+extern unsigned long nr_free_highpages(void);
 
 extern struct page * prepare_highmem_swapout(struct page *);
 extern struct page * replace_with_highmem(struct page *);
@@ -20,7 +20,7 @@
 
 #else /* CONFIG_HIGHMEM */
 
-extern inline unsigned int nr_free_highpages(void) { return 0; }
+#define nr_free_highpages() 0UL
 #define prepare_highmem_swapout(page) page
 #define replace_with_highmem(page) page
 #define kmap(page) page_address(page)
diff -urN 2.4.0-test1-ac7/include/linux/mm.h 2.4.0-test1-ac7-VM-31/include/linux/mm.h
--- 2.4.0-test1-ac7/include/linux/mm.h	Sat Jun  3 14:52:34 2000
+++ 2.4.0-test1-ac7-VM-31/include/linux/mm.h	Sat Jun  3 15:52:27 2000
@@ -15,7 +15,6 @@
 extern unsigned long num_physpages;
 extern void * high_memory;
 extern int page_cluster;
-extern struct list_head lru_cache;
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -146,6 +145,7 @@
 	unsigned long index;
 	struct page *next_hash;
 	atomic_t count;
+	int map_count;
 	unsigned long flags;	/* atomic flags, some possibly updated asynchronously */
 	struct list_head lru;
 	wait_queue_head_t wait;
@@ -153,11 +153,11 @@
 	struct buffer_head * buffers;
 	unsigned long virtual; /* nonzero if kmapped */
 	struct zone_struct *zone;
-	unsigned int age;
 } mem_map_t;
 
 #define get_page(p)		atomic_inc(&(p)->count)
 #define put_page(p)		__free_page(p)
+#define put_page_raw(p)		atomic_dec(&(p)->count)
 #define put_page_testzero(p) 	atomic_dec_and_test(&(p)->count)
 #define page_count(p)		atomic_read(&(p)->count)
 #define set_page_count(p,v) 	atomic_set(&(p)->count, v)
@@ -169,8 +169,8 @@
 #define PG_uptodate		 3
 #define PG_dirty		 4
 #define PG_decr_after		 5
-#define PG_unused_01		 6
-#define PG_active		 7
+#define PG_out_lru		 6
+#define PG__unused_02		 7
 #define PG_slab			 8
 #define PG_swap_cache		 9
 #define PG_skip			10
@@ -194,9 +194,6 @@
 					clear_bit(PG_locked, &(page)->flags); \
 					wake_up(&page->wait); \
 				} while (0)
-#define PageActive(page)	test_bit(PG_active, &(page)->flags)
-#define SetPageActive(page)	set_bit(PG_active, &(page)->flags)
-#define ClearPageActive(page)	clear_bit(PG_active, &(page)->flags)
 #define PageError(page)		test_bit(PG_error, &(page)->flags)
 #define SetPageError(page)	set_bit(PG_error, &(page)->flags)
 #define ClearPageError(page)	clear_bit(PG_error, &(page)->flags)
@@ -229,6 +226,12 @@
 #define SetPageReserved(page)		set_bit(PG_reserved, &(page)->flags)
 #define ClearPageReserved(page)		clear_bit(PG_reserved, &(page)->flags)
 
+#define PageSetOutLru(page)		set_bit(PG_out_lru, &(page)->flags)
+#define PageClearOutLru(page)		clear_bit(PG_out_lru, &(page)->flags)
+#define PageTestandSetOutLru(page)	test_and_set_bit(PG_out_lru, &(page)->flags)
+#define PageTestandClearOutLru(page)	test_and_clear_bit(PG_out_lru, &(page)->flags)
+#define PageOutLru(page)			test_bit(PG_out_lru, &(page)->flags)
+
 /*
  * Error return values for the *_nopage functions
  */
@@ -314,21 +317,21 @@
  * can allocate highmem pages, the *get*page*() variants return
  * virtual kernel addresses to the allocated page(s).
  */
-extern struct page * FASTCALL(__alloc_pages(zonelist_t *zonelist, unsigned long order));
+extern struct page * FASTCALL(__alloc_pages(gfpmask_zone_t *, unsigned long order));
 extern struct page * alloc_pages_node(int nid, int gfp_mask, unsigned long order);
 
 #ifndef CONFIG_DISCONTIGMEM
-static inline struct page * alloc_pages(int gfp_mask, unsigned long order)
+extern inline struct page * alloc_pages(int gfp_mask, unsigned long order)
 {
 	/*  temporary check. */
-	if (contig_page_data.node_zonelists[gfp_mask].gfp_mask != (gfp_mask))
+	if (contig_page_data.node_gfpmask_zone[gfp_mask].gfp_mask != (gfp_mask))
 		BUG();
 	/*
 	 * Gets optimized away by the compiler.
 	 */
 	if (order >= MAX_ORDER)
 		return NULL;
-	return __alloc_pages(contig_page_data.node_zonelists+(gfp_mask), order);
+	return __alloc_pages(contig_page_data.node_gfpmask_zone+gfp_mask, order);
 }
 #else /* !CONFIG_DISCONTIGMEM */
 extern struct page * alloc_pages(int gfp_mask, unsigned long order);
@@ -460,9 +463,9 @@
 /* filemap.c */
 extern void remove_inode_page(struct page *);
 extern unsigned long page_unuse(struct page *);
-extern int shrink_mmap(int, int);
+extern int shrink_mmap(int, zone_t *);
 extern void truncate_inode_pages(struct address_space *, loff_t);
-extern void truncate_all_inode_pages(struct address_space *);
+#define truncate_all_inode_pages(x) truncate_inode_pages(x, 0)
 
 /* generic vm_area_ops exported for stackable file systems */
 extern int filemap_swapout(struct page * page, struct file *file);
@@ -491,7 +494,7 @@
 #define GFP_ATOMIC	(__GFP_HIGH)
 #define GFP_USER	(__GFP_WAIT | __GFP_IO)
 #define GFP_HIGHUSER	(GFP_USER | __GFP_HIGHMEM)
-#define GFP_KERNEL	(__GFP_HIGH | __GFP_WAIT | __GFP_IO)
+#define GFP_KERNEL	(__GFP_WAIT | __GFP_IO)
 #define GFP_NFS		(__GFP_HIGH | __GFP_WAIT | __GFP_IO)
 #define GFP_KSWAPD	(__GFP_IO)
 
@@ -542,15 +545,183 @@
 
 extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr);
 
-#define buffer_under_min()	(atomic_read(&buffermem_pages) * 100 < \
-				buffer_mem.min_percent * num_physpages)
-#define pgcache_under_min()	(atomic_read(&page_cache_size) * 100 < \
-				page_cache.min_percent * num_physpages)
+#define lru_cache_under_min(lru_pages)	((lru_pages) * 100 < \
+					 lru_cache_mem.min_percent * num_physpages)
 
 #define vmlist_access_lock(mm)		spin_lock(&mm->page_table_lock)
 #define vmlist_access_unlock(mm)	spin_unlock(&mm->page_table_lock)
 #define vmlist_modify_lock(mm)		vmlist_access_lock(mm)
 #define vmlist_modify_unlock(mm)	vmlist_access_unlock(mm)
+
+/*
+ * Helper macros for lru_cache handling.
+ */
+
+/*
+ * lru_cache_add can be run on a page that is already mapped
+ * (precisely this happens when adding an anonymous page shared
+ * by lots of read-only users to the swap cache after a COW).
+ * In the current VM design it never happens that a buffer cache
+ * page is mapped or that we insert into the lru a page that
+ * still has I/O buffers on it.
+ */
+#define	lru_cache_add(page, lru_type)					\
+do {									\
+	lru_cache_t * this_lru = &(page)->zone->zone_pgdat->lru_cache;	\
+	if ((page)->map_count < 0 ||					\
+	    (!(page)->mapping && !(page)->buffers) ||			\
+	    ((page)->map_count && (page)->buffers) ||			\
+	    PageOutLru(page))						\
+		BUG();							\
+	spin_lock(&this_lru->lock);					\
+	if (!(page)->map_count) {					\
+		list_add(&(page)->lru, &this_lru->heads[lru_type]);	\
+		this_lru->nr_cache_pages++;				\
+	} else								\
+		this_lru->nr_map_pages++;				\
+	spin_unlock(&this_lru->lock);					\
+} while (0)
+
+/*
+ * Only pages without overlapping buffer heads can be removed from the
+ * lru, otherwise we could no longer reclaim the buffer head memory
+ * and we would make it unfreeable.
+ *
+ * Locking: if the page isn't under shrink_mmap processing
+ * then page->buffers can't change from under us. If the page is under
+ * shrink_mmap processing, then we can simply update the map count and
+ * shrink_mmap will finish the work later. With this locking protocol we
+ * avoid taking the lock while mapping pages.
+ */
+#define	lru_cache_map(page)						\
+do {									\
+	lru_cache_t * this_lru = &(page)->zone->zone_pgdat->lru_cache;	\
+	if ((page)->map_count < 0 || !(page)->mapping)			\
+		BUG();							\
+	spin_lock(&this_lru->lock);					\
+	if (!(page)->map_count++ && !(page)->buffers &&			\
+	    !PageOutLru(page)) {					\
+		list_del(&(page)->lru);					\
+		this_lru->nr_cache_pages--;				\
+		this_lru->nr_map_pages++;				\
+	}								\
+	spin_unlock(&this_lru->lock);					\
+} while (0)
+
+/* no need of any lock, at this time the page is still local to us */
+#define	page_anon_init_map_wmb(page)		\
+do {						\
+	if ((page)->map_count)			\
+		BUG();				\
+	set_wmb((page)->map_count, 1);		\
+} while (0)
+
+/* dup the mapping for a page known to be just referenced as mapped */
+#define	page_map(page)							\
+do {									\
+	lru_cache_t * this_lru = &(page)->zone->zone_pgdat->lru_cache;	\
+	if ((page)->map_count <= 0)					\
+		BUG();							\
+	spin_lock(&this_lru->lock);					\
+	(page)->map_count++;						\
+	spin_unlock(&this_lru->lock);					\
+} while (0)
+
+/* drop a map reference from an _anonymous_ page mapped in our current MM */
+#define	page_unmap(page)						\
+do {									\
+	lru_cache_t * this_lru = &(page)->zone->zone_pgdat->lru_cache;	\
+	if ((page)->mapping || (page)->map_count <= 0)			\
+		BUG();							\
+	spin_lock(&this_lru->lock);					\
+	(page)->map_count--;						\
+	spin_unlock(&this_lru->lock);					\
+} while (0)
+
+/*
+ * Don't discriminate between lrus: put all pages that were mapped
+ * into the normal lru.
+ *
+ * This function can also be called on all kinds of pages provided
+ * they were accounted in the mapping logic.
+ *
+ * Locking: if the page isn't under shrink_mmap processing
+ * then page->buffers can't change from under us. It is possible that
+ * we did both the map and the unmap while the page was out of the lru;
+ * shrink_mmap will get this case right later too.
+ */
+#define	lru_cache_unmap(page, lru_type)					    \
+do {									    \
+	lru_cache_t * this_lru = &(page)->zone->zone_pgdat->lru_cache;	    \
+	if ((page)->map_count <= 0)					    \
+		BUG();							    \
+	spin_lock(&this_lru->lock);					    \
+	if (!--(page)->map_count && (page)->mapping &&			    \
+	    !(page)->buffers && !PageOutLru(page)) {			    \
+		list_add(&(page)->lru, &this_lru->heads[lru_type]);	    \
+		this_lru->nr_cache_pages++;				    \
+		this_lru->nr_map_pages--;				    \
+	}								    \
+	spin_unlock(&this_lru->lock);					    \
+} while (0)
+
+/*
+ * This puts the page back in the lru in case it was out of the lru,
+ * since we have just overlapped some buffer heads on the page.
+ * We hold the per-page lock here.
+ */
+#define lru_cache_buf(page, lru_type)					    \
+do {									    \
+	lru_cache_t * this_lru = &(page)->zone->zone_pgdat->lru_cache;	    \
+	if (!(page)->mapping || !(page)->buffers || PageOutLru(page))	    \
+		BUG();							    \
+	spin_lock(&this_lru->lock);					    \
+	if ((page)->map_count) {					    \
+		list_add(&(page)->lru, &this_lru->heads[lru_type]);	    \
+		this_lru->nr_cache_pages++;				    \
+		this_lru->nr_map_pages--;				    \
+	}								    \
+	spin_unlock(&this_lru->lock);					    \
+} while (0)
+
+/*
+ * This is called when we drop the buffer heads from the page. We must
+ * remove the page from the lru if it is mapped (unmapped pages stay
+ * on the lru). We hold the per-page lock here.
+ */
+#define lru_cache_unbuf(page)						\
+do {									\
+	lru_cache_t * this_lru = &(page)->zone->zone_pgdat->lru_cache;	\
+	if (!(page)->mapping || (page)->buffers || PageOutLru(page))	\
+		BUG();							\
+	spin_lock(&this_lru->lock);					\
+	if ((page)->map_count) {					\
+		list_del(&(page)->lru);					\
+		this_lru->nr_cache_pages--;				\
+		this_lru->nr_map_pages++;				\
+	}								\
+	spin_unlock(&this_lru->lock);					\
+} while (0)
+
+/*
+ * Needs the lock on the page to be sure the page is in the lru list.
+ * swapoff is the only caller that can drop a mapped cache from the lru
+ * in order to do the swap-cache-page to anonymous-page conversion.
+ */
+#define	lru_cache_del(page)						\
+do {									\
+	lru_cache_t * this_lru = &(page)->zone->zone_pgdat->lru_cache;	\
+	if (!PageLocked(page) || (page)->map_count < 0 ||		\
+	    !(page)->mapping || (page)->buffers || PageOutLru(page))	\
+		BUG();							\
+	spin_lock(&this_lru->lock);					\
+	if (!(page)->map_count) {					\
+		list_del(&(page)->lru);					\
+		this_lru->nr_cache_pages--;				\
+	} else								\
+		this_lru->nr_map_pages--;				\
+	spin_unlock(&this_lru->lock);					\
+} while (0)
 
 #endif /* __KERNEL__ */
 
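The lru_cache_* macros above keep two per-node counters (nr_cache_pages,
nr_map_pages) and maintain one invariant: a page-cache page sits on an lru
list iff it still has buffer heads overlapped on it or is not mapped by any
process. The following standalone model captures just that bookkeeping; the
names are invented, there is no locking, and the real macros' PG_out_lru and
anonymous-page handling is deliberately left out.

/* illustrative sketch only, not from the patch */
#include <assert.h>
#include <stdio.h>

struct model_page {
	int map_count;		/* how many ptes map the page */
	int has_buffers;	/* buffer heads overlapped on the page */
	int on_lru;		/* is the page on an lru list? */
};

static void check(const struct model_page *p)
{
	assert(p->on_lru == (p->has_buffers || !p->map_count));
}

static void map(struct model_page *p)		/* ~ lru_cache_map() */
{
	if (!p->map_count++ && !p->has_buffers)
		p->on_lru = 0;
	check(p);
}

static void unmap(struct model_page *p)		/* ~ lru_cache_unmap() */
{
	if (!--p->map_count && !p->has_buffers)
		p->on_lru = 1;
	check(p);
}

static void buf(struct model_page *p)		/* ~ lru_cache_buf() */
{
	p->has_buffers = 1;
	if (p->map_count)
		p->on_lru = 1;
	check(p);
}

static void unbuf(struct model_page *p)		/* ~ lru_cache_unbuf() */
{
	p->has_buffers = 0;
	if (p->map_count)
		p->on_lru = 0;
	check(p);
}

int main(void)
{
	struct model_page p = { 0, 0, 1 };	/* fresh page-cache page: on the lru */

	map(&p); buf(&p); unbuf(&p); unmap(&p);
	printf("invariant held, page back on lru: %d\n", p.on_lru);
	return 0;
}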
diff -urN 2.4.0-test1-ac7/include/linux/mmzone.h 2.4.0-test1-ac7-VM-31/include/linux/mmzone.h
--- 2.4.0-test1-ac7/include/linux/mmzone.h	Sun May 28 20:52:35 2000
+++ 2.4.0-test1-ac7-VM-31/include/linux/mmzone.h	Sat Jun  3 15:45:16 2000
@@ -21,16 +21,26 @@
 
 struct pglist_data;
 
+/*
+ * Memory balancing internal to the node can work correctly only on a
+ * classzone basis when handling overlapping classzones.
+ */
 typedef struct zone_struct {
 	/*
 	 * Commonly accessed fields:
 	 */
-	spinlock_t		lock;
 	unsigned long		offset;
 	unsigned long		free_pages;
-	char			low_on_memory;
-	char			zone_wake_kswapd;
+
+	/*
+	 * Memory balancing is all classzone based, all the below
+	 * fields refer to the classzone. The classzone includes
+	 * the current zone plus all the lower zones in the MM.
+	 */
+	unsigned long		classzone_free_pages;
 	unsigned long		pages_min, pages_low, pages_high;
+	int			nr_zone;
+	char			zone_wake_kswapd;
 
 	/*
 	 * free areas of different sizes
@@ -57,27 +67,34 @@
 #define MAX_NR_ZONES		3
 
 /*
- * One allocation request operates on a zonelist. A zonelist
- * is a list of zones, the first one is the 'goal' of the
- * allocation, the other zones are fallback zones, in decreasing
- * priority.
- *
- * Right now a zonelist takes up less than a cacheline. We never
- * modify it apart from boot-up, and only a few indices are used,
- * so despite the zonelist table being relatively big, the cache
- * footprint of this construct is very small.
+ * The pgdat->node_gfpmask_zone[] array tells us which classzone
+ * we should allocate from given a certain gfpmask. It translates
+ * the gfpmask to a classzone.
  */
-typedef struct zonelist_struct {
-	zone_t * zones [MAX_NR_ZONES+1]; // NULL delimited
+typedef struct gfpmask_zone_s {
+	zone_t * classzone;
 	int gfp_mask;
-} zonelist_t;
+} gfpmask_zone_t;
 
 #define NR_GFPINDEX		0x100
 
+#define LRU_SWAP_CACHE		0
+#define LRU_NORMAL_CACHE	1
+#define NR_LRU_CACHE		2
+typedef struct lru_cache_s {
+	struct list_head heads[NR_LRU_CACHE];
+	unsigned long nr_cache_pages; /* pages in the lrus */
+	unsigned long nr_map_pages; /* pages temporarily out of the lru */
+	/* keep lock in a separate cacheline to avoid ping pong in SMP */
+	spinlock_t lock ____cacheline_aligned_in_smp;
+} lru_cache_t;
+
 struct bootmem_data;
 typedef struct pglist_data {
+	int nr_zones;
 	zone_t node_zones[MAX_NR_ZONES];
-	zonelist_t node_zonelists[NR_GFPINDEX];
+	gfpmask_zone_t node_gfpmask_zone[NR_GFPINDEX];
+	lru_cache_t lru_cache;
 	struct page *node_mem_map;
 	unsigned long *valid_addr_bitmap;
 	struct bootmem_data *bdata;
@@ -86,14 +103,14 @@
 	unsigned long node_size;
 	int node_id;
 	struct pglist_data *node_next;
+	spinlock_t freelist_lock ____cacheline_aligned_in_smp;
 } pg_data_t;
 
 extern int numnodes;
 extern pg_data_t *pgdat_list;
 
 #define memclass(pgzone, tzone)	(((pgzone)->zone_pgdat == (tzone)->zone_pgdat) \
-			&& (((pgzone) - (pgzone)->zone_pgdat->node_zones) <= \
-			((tzone) - (pgzone)->zone_pgdat->node_zones)))
+			&& ((pgzone) <= (tzone)))
 
 /*
  * The following two are not meant for general usage. They are here as
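The new memclass() above collapses to a pointer comparison because each node
keeps its zones in one array ordered DMA < NORMAL < HIGHMEM, and a classzone
means the target zone plus every lower zone of the same node. Here is a small
illustrative sketch of that idea with invented types; it assumes only that
ordering, not the kernel's actual zone_t/pg_data_t layout.

/* illustrative sketch only, not from the patch */
#include <stdio.h>

struct node;

struct zone {
	const char *name;
	struct node *owner;	/* which node (pgdat) the zone belongs to */
};

struct node {
	struct zone zones[3];	/* ordered: DMA, NORMAL, HIGHMEM */
};

/* same shape as the patched memclass(): same node, lower-or-equal zone */
#define memclass(pgzone, tzone) \
	(((pgzone)->owner == (tzone)->owner) && ((pgzone) <= (tzone)))

int main(void)
{
	struct node n = { { { "DMA", &n }, { "NORMAL", &n }, { "HIGHMEM", &n } } };
	struct zone *dma = &n.zones[0], *normal = &n.zones[1], *high = &n.zones[2];

	/* a DMA page lies inside the NORMAL classzone... */
	printf("%d\n", memclass(dma, normal));		/* prints 1 */
	/* ...but a HIGHMEM page does not */
	printf("%d\n", memclass(high, normal));		/* prints 0 */
	return 0;
}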
diff -urN 2.4.0-test1-ac7/include/linux/pagemap.h 2.4.0-test1-ac7-VM-31/include/linux/pagemap.h
--- 2.4.0-test1-ac7/include/linux/pagemap.h	Sun May 28 20:52:35 2000
+++ 2.4.0-test1-ac7-VM-31/include/linux/pagemap.h	Sat Jun  3 15:52:31 2000
@@ -80,8 +80,7 @@
 
 extern void __add_page_to_hash_queue(struct page * page, struct page **p);
 
-extern void add_to_page_cache(struct page * page, struct address_space *mapping, unsigned long index);
-extern void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index);
+extern void add_to_swap_cache_locked(struct page * page, struct address_space *mapping, unsigned long index);
 
 extern inline void add_page_to_hash_queue(struct page * page, struct inode * inode, unsigned long index)
 {
diff -urN 2.4.0-test1-ac7/include/linux/sched.h 2.4.0-test1-ac7-VM-31/include/linux/sched.h
--- 2.4.0-test1-ac7/include/linux/sched.h	Sun May 28 20:52:35 2000
+++ 2.4.0-test1-ac7-VM-31/include/linux/sched.h	Sat Jun  3 15:45:16 2000
@@ -309,6 +309,7 @@
 	long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS];
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
 	unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
+	int low_on_memory:1;
 	int swappable:1;
 /* process credentials */
 	uid_t uid,euid,suid,fsuid;
diff -urN 2.4.0-test1-ac7/include/linux/swap.h 2.4.0-test1-ac7-VM-31/include/linux/swap.h
--- 2.4.0-test1-ac7/include/linux/swap.h	Sat Jun  3 14:52:35 2000
+++ 2.4.0-test1-ac7-VM-31/include/linux/swap.h	Sat Jun  3 16:23:22 2000
@@ -64,10 +64,8 @@
 };
 
 extern int nr_swap_pages;
-FASTCALL(unsigned int nr_free_pages(void));
-FASTCALL(unsigned int nr_free_buffer_pages(void));
-FASTCALL(unsigned int nr_free_highpages(void));
-extern int nr_lru_pages;
+extern unsigned long nr_free_pages(void);
+extern unsigned long nr_free_buffer_pages(void);
 extern atomic_t nr_async_pages;
 extern struct address_space swapper_space;
 extern atomic_t page_cache_size;
@@ -80,13 +78,13 @@
 
 struct zone_t;
 /* linux/ipc/shm.c */
-extern int shm_swap(int, int);
+extern int shm_swap(int, int, zone_t *);
 
 /* linux/mm/swap.c */
 extern void swap_setup(void);
 
 /* linux/mm/vmscan.c */
-extern int try_to_free_pages(unsigned int gfp_mask);
+extern int try_to_free_pages(unsigned int gfp_mask, zone_t *zone);
 
 /* linux/mm/page_io.c */
 extern void rw_swap_page(int, struct page *, int);
@@ -147,57 +145,15 @@
  * swap IO on this page.  (The page cache _does_ count as another valid
  * reference to the page, however.)
  */
-static inline int is_page_shared(struct page *page)
+static inline int is_swap_cache_shared(struct page *page)
 {
 	unsigned int count;
-	if (PageReserved(page))
-		return 1;
+	if (PageReserved(page) || !PageSwapCache(page) || !PageLocked(page))
+		BUG();
 	count = page_count(page);
-	if (PageSwapCache(page))
-		count += swap_count(page) - 2 - !!page->buffers;
+	count += swap_count(page) - 2 - !!page->buffers;
 	return  count > 1;
 }
-
-extern spinlock_t pagemap_lru_lock;
-
-/*
- * Magic constants for page aging. If the system is programmed
- * right, tweaking these should have almost no effect...
- * The 2.4 code, however, is mostly simple and stable ;)
- */
-#define PG_AGE_MAX	64
-#define PG_AGE_START	5
-#define PG_AGE_ADV	3
-#define PG_AGE_DECL	1
-
-/*
- * Helper macros for lru_pages handling.
- */
-#define	lru_cache_add(page)			\
-do {						\
-	spin_lock(&pagemap_lru_lock);		\
-	list_add(&(page)->lru, &lru_cache);	\
-	nr_lru_pages++;				\
-	page->age = PG_AGE_START;		\
-	SetPageActive(page);			\
-	spin_unlock(&pagemap_lru_lock);		\
-} while (0)
-
-#define	__lru_cache_del(page)			\
-do {						\
-	list_del(&(page)->lru);			\
-	ClearPageActive(page);			\
-	nr_lru_pages--;				\
-} while (0)
-
-#define	lru_cache_del(page)			\
-do {						\
-	if (!PageLocked(page))			\
-		BUG();				\
-	spin_lock(&pagemap_lru_lock);		\
-	__lru_cache_del(page);			\
-	spin_unlock(&pagemap_lru_lock);		\
-} while (0)
 
 extern spinlock_t swaplock;
 
diff -urN 2.4.0-test1-ac7/include/linux/swapctl.h 2.4.0-test1-ac7-VM-31/include/linux/swapctl.h
--- 2.4.0-test1-ac7/include/linux/swapctl.h	Sun May 28 20:52:35 2000
+++ 2.4.0-test1-ac7-VM-31/include/linux/swapctl.h	Sat Jun  3 15:45:16 2000
@@ -11,8 +11,7 @@
 	unsigned int	max_percent;
 } buffer_mem_v1;
 typedef buffer_mem_v1 buffer_mem_t;
-extern buffer_mem_t buffer_mem;
-extern buffer_mem_t page_cache;
+extern buffer_mem_t lru_cache_mem;
 
 typedef struct freepages_v1
 {
diff -urN 2.4.0-test1-ac7/include/linux/sysctl.h 2.4.0-test1-ac7-VM-31/include/linux/sysctl.h
--- 2.4.0-test1-ac7/include/linux/sysctl.h	Sat May 13 10:15:20 2000
+++ 2.4.0-test1-ac7-VM-31/include/linux/sysctl.h	Sat Jun  3 14:53:15 2000
@@ -119,15 +119,18 @@
 enum
 {
 	VM_SWAPCTL=1,		/* struct: Set vm swapping control */
-	VM_SWAPOUT=2,		/* int: Linear or sqrt() swapout for hogs */
+	VM_SWAPOUT=2,		/* int: Background pageout interval */
 	VM_FREEPG=3,		/* struct: Set free page thresholds */
 	VM_BDFLUSH=4,		/* struct: Control buffer cache flushing */
 	VM_OVERCOMMIT_MEMORY=5,	/* Turn off the virtual memory safety limit */
+#if 0 /* obsolete but don't reuse */
 	VM_BUFFERMEM=6,		/* struct: Set buffer memory thresholds */
 	VM_PAGECACHE=7,		/* struct: Set cache memory thresholds */
+#endif
 	VM_PAGERDAEMON=8,	/* struct: Control kswapd behaviour */
 	VM_PGT_CACHE=9,		/* struct: Set page table cache parameters */
-	VM_PAGE_CLUSTER=10	/* int: set number of pages to swap together */
+	VM_PAGE_CLUSTER=10,	/* int: set number of pages to swap together */
+	VM_LRU_CACHE=11,	/* struct: Set lru cache memory thresholds */
 };
 
 
diff -urN 2.4.0-test1-ac7/ipc/shm.c 2.4.0-test1-ac7-VM-31/ipc/shm.c
--- 2.4.0-test1-ac7/ipc/shm.c	Fri May 26 22:47:09 2000
+++ 2.4.0-test1-ac7-VM-31/ipc/shm.c	Sat Jun  3 14:53:15 2000
@@ -132,7 +132,7 @@
 static int sysvipc_shm_read_proc(char *buffer, char **start, off_t offset, int length, int *eof, void *data);
 #endif
 
-static void zshm_swap (int prio, int gfp_mask);
+static void zshm_swap (int prio, zone_t *zone);
 static void zmap_unuse(swp_entry_t entry, struct page *page);
 static void shmzero_open(struct vm_area_struct *shmd);
 static void shmzero_close(struct vm_area_struct *shmd);
@@ -1411,7 +1411,7 @@
 #define RETRY	1
 #define FAILED	2
 
-static int shm_swap_core(struct shmid_kernel *shp, unsigned long idx, swp_entry_t swap_entry, int *counter, struct page **outpage)
+static int shm_swap_core(struct shmid_kernel *shp, unsigned long idx, swp_entry_t swap_entry, zone_t *zone, int *counter, struct page **outpage)
 {
 	pte_t page;
 	struct page *page_map;
@@ -1420,7 +1420,7 @@
 	if (!pte_present(page))
 		return RETRY;
 	page_map = pte_page(page);
-	if (page_map->zone->free_pages > page_map->zone->pages_high)
+	if (!memclass(page_map->zone, zone))
 		return RETRY;
 	if (shp->id != zero_id) swap_attempts++;
 
@@ -1473,22 +1473,26 @@
 static unsigned long swap_id; /* currently being swapped */
 static unsigned long swap_idx; /* next to swap */
 
-int shm_swap (int prio, int gfp_mask)
+int shm_swap (int prio, int gfp_mask, zone_t *zone)
 {
 	struct shmid_kernel *shp;
 	swp_entry_t swap_entry;
 	unsigned long id, idx;
-	int loop = 0;
+	int loop;
 	int counter;
 	struct page * page_map;
 
-	zshm_swap(prio, gfp_mask);
+	if (!(gfp_mask & __GFP_IO))
+		return 0;
+
+	zshm_swap(prio, zone);
 	counter = shm_rss / (prio + 1);
 	if (!counter)
 		return 0;
 	if (shm_swap_preop(&swap_entry))
 		return 0;
 
+	loop = 0;
 	shm_lockall();
 check_id:
 	shp = shm_get(swap_id);
@@ -1514,7 +1518,7 @@
 	if (idx >= shp->shm_npages)
 		goto next_id;
 
-	switch (shm_swap_core(shp, idx, swap_entry, &counter, &page_map)) {
+	switch (shm_swap_core(shp, idx, swap_entry, zone, &counter, &page_map)) {
 		case RETRY: goto check_table;
 		case FAILED: goto failed;
 	}
@@ -1800,7 +1804,7 @@
 	spin_unlock(&zmap_list_lock);
 }
 
-static void zshm_swap (int prio, int gfp_mask)
+static void zshm_swap (int prio, zone_t *zone)
 {
 	struct shmid_kernel *shp;
 	swp_entry_t swap_entry;
@@ -1845,7 +1849,7 @@
 		goto next_id;
 	}
 
-	switch (shm_swap_core(shp, idx, swap_entry, &counter, &page_map)) {
+	switch (shm_swap_core(shp, idx, swap_entry, zone, &counter, &page_map)) {
 		case RETRY: goto check_table;
 		case FAILED: goto failed;
 	}
diff -urN 2.4.0-test1-ac7/ipc/util.c 2.4.0-test1-ac7-VM-31/ipc/util.c
--- 2.4.0-test1-ac7/ipc/util.c	Fri May 26 22:47:09 2000
+++ 2.4.0-test1-ac7-VM-31/ipc/util.c	Sat Jun  3 14:53:15 2000
@@ -243,7 +243,7 @@
     return;
 }
 
-int shm_swap (int prio, int gfp_mask)
+int shm_swap (int prio, int gfp_mask, zone_t *zone)
 {
     return 0;
 }
diff -urN 2.4.0-test1-ac7/kernel/sysctl.c 2.4.0-test1-ac7-VM-31/kernel/sysctl.c
--- 2.4.0-test1-ac7/kernel/sysctl.c	Sat Jun  3 14:52:35 2000
+++ 2.4.0-test1-ac7-VM-31/kernel/sysctl.c	Sat Jun  3 14:53:15 2000
@@ -233,16 +233,14 @@
 	 &bdflush_min, &bdflush_max},
 	{VM_OVERCOMMIT_MEMORY, "overcommit_memory", &sysctl_overcommit_memory,
 	 sizeof(sysctl_overcommit_memory), 0644, NULL, &proc_dointvec},
-	{VM_BUFFERMEM, "buffermem",
-	 &buffer_mem, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec},
-	{VM_PAGECACHE, "pagecache",
-	 &page_cache, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec},
 	{VM_PAGERDAEMON, "kswapd",
 	 &pager_daemon, sizeof(pager_daemon_t), 0644, NULL, &proc_dointvec},
 	{VM_PGT_CACHE, "pagetable_cache", 
 	 &pgt_cache_water, 2*sizeof(int), 0644, NULL, &proc_dointvec},
 	{VM_PAGE_CLUSTER, "page-cluster", 
 	 &page_cluster, sizeof(int), 0644, NULL, &proc_dointvec},
+	{VM_LRU_CACHE, "lru_cache",
+	 &lru_cache_mem, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec},
 	{0}
 };
 
diff -urN 2.4.0-test1-ac7/mm/filemap.c 2.4.0-test1-ac7-VM-31/mm/filemap.c
--- 2.4.0-test1-ac7/mm/filemap.c	Sat Jun  3 14:52:35 2000
+++ 2.4.0-test1-ac7-VM-31/mm/filemap.c	Sat Jun  3 15:11:13 2000
@@ -44,20 +44,12 @@
 atomic_t page_cache_size = ATOMIC_INIT(0);
 unsigned int page_hash_bits;
 struct page **page_hash_table;
-struct list_head lru_cache;
 
 static spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
-/*
- * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
- *       the pagemap_lru_lock held.
- */
-spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;
 
 #define CLUSTER_PAGES		(1 << page_cluster)
 #define CLUSTER_OFFSET(x)	(((x) >> page_cluster) << page_cluster)
 
-#define min(a,b)		((a < b) ? a : b)
-
 void __add_page_to_hash_queue(struct page * page, struct page **p)
 {
 	atomic_inc(&page_cache_size);
@@ -69,7 +61,7 @@
 		PAGE_BUG(page);
 }
 
-static inline void remove_page_from_hash_queue(struct page * page)
+static void remove_page_from_hash_queue(struct page * page)
 {
 	if(page->pprev_hash) {
 		if(page->next_hash)
@@ -127,9 +119,7 @@
 	struct page * page;
 
 	head = &inode->i_mapping->pages;
-repeat:
 	spin_lock(&pagecache_lock);
-	spin_lock(&pagemap_lru_lock);
 	curr = head->next;
 
 	while (curr != head) {
@@ -139,53 +129,19 @@
 		/* We cannot invalidate a locked page */
 		if (TryLockPage(page))
 			continue;
-		if (page->buffers) {
-			page_cache_get(page);
-			spin_unlock(&pagemap_lru_lock);
-			spin_unlock(&pagecache_lock);			
-			block_destroy_buffers(page);
-			remove_inode_page(page);
-			lru_cache_del(page);
-			page_cache_release(page);
-			UnlockPage(page);
-			page_cache_release(page);
-			goto repeat;
-		}
-		__remove_inode_page(page);
-		__lru_cache_del(page);
+
+		lru_cache_del(page);
+
+		remove_page_from_inode_queue(page);
+		remove_page_from_hash_queue(page);
+		page->mapping = NULL;
+
 		UnlockPage(page);
 		page_cache_release(page);
 	}
-	spin_unlock(&pagemap_lru_lock);
 	spin_unlock(&pagecache_lock);
 }
 
-static inline void truncate_partial_page(struct page *page, unsigned partial)
-{
-	memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
-				
-	if (page->buffers)
-		block_flushpage(page, partial);
-
-}
-
-static inline void truncate_complete_page(struct page *page)
-{
-	if (page->buffers)
-		block_destroy_buffers(page);
-	lru_cache_del(page);
-	
-	/*
-	 * We remove the page from the page cache _after_ we have
-	 * destroyed all buffer-cache references to it. Otherwise some
-	 * other process might think this inode page is not in the
-	 * page cache and creates a buffer-cache alias to it causing
-	 * all sorts of fun problems ...  
-	 */
-	remove_inode_page(page);
-	page_cache_release(page);
-}
-
 /**
  * truncate_inode_pages - truncate *all* the pages from an offset
  * @mapping: mapping to truncate
@@ -201,39 +157,47 @@
 	struct page * page;
 	unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
 	unsigned long start;
+	/*
+	 * Only one truncate can run at a time, so we can stash
+	 * processed pages on our local dispose list to decrease
+	 * the complexity of the `repeat` path.
+	 */
+	LIST_HEAD(dispose);
 
 	start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
-repeat:
 	head = &mapping->pages;
+repeat:
 	spin_lock(&pagecache_lock);
-	curr = head->next;
-	while (curr != head) {
+	while ((curr = head->next) != head) {
 		unsigned long offset;
 
 		page = list_entry(curr, struct page, list);
-		curr = curr->next;
 		offset = page->index;
 
-		/* Is one of the pages to truncate? */
-		if ((offset >= start) || (partial && (offset + 1) == start)) {
-			if (TryLockPage(page)) {
-				page_cache_get(page);
-				spin_unlock(&pagecache_lock);
-				wait_on_page(page);
-				page_cache_release(page);
-				goto repeat;
-			}
-			page_cache_get(page);
+		/* page wholly truncated - free it */
+		if (offset >= start) {
+			get_page(page);
+			if (TryLockPage(page))
+				goto wait_unlock;
 			spin_unlock(&pagecache_lock);
 
-			if (partial && (offset + 1) == start) {
-				truncate_partial_page(page, partial);
-				partial = 0;
-			} else 
-				truncate_complete_page(page);
+			if (page->buffers)
+				block_destroy_buffers(page);
+			lru_cache_del(page);
+
+			/*
+			 * We remove the page from the page cache
+			 * _after_ we have destroyed all buffer-cache
+			 * references to it. Otherwise some other process
+			 * might think this inode page is not in the
+			 * page cache and creates a buffer-cache alias
+			 * to it causing all sorts of fun problems ...
+			 */
+			remove_inode_page(page);
 
 			UnlockPage(page);
+			put_page_raw(page);
 			page_cache_release(page);
 
 			/*
@@ -245,128 +209,134 @@
 			 */
 			goto repeat;
 		}
-	}
-	spin_unlock(&pagecache_lock);
-}
 
-/**
- * truncate_all_inode_pages - truncate *all* the pages
- * @mapping: mapping to truncate
- *
- * Truncate all the inode pages.  If any page is locked we wait for it
- * to become unlocked. This function can block.
- */
-void truncate_all_inode_pages(struct address_space * mapping)
-{
-	struct list_head *head, *curr;
-	struct page * page;
+		/*
+		 * there is only one partial page possible and it's the
+		 * one preceding the first wholly truncated page.
+		 */
+		if (!partial || (offset + 1) != start) {
+			list_del(curr); list_add(curr, &dispose);
+			continue;
+		}
 
-	head = &mapping->pages;
-repeat:
-	spin_lock(&pagecache_lock);
-	spin_lock(&pagemap_lru_lock);
-	curr = head->next;
+		/* partial truncate, clear end of page */
+		get_page(page);
+		if (TryLockPage(page))
+			goto wait_unlock;
+		list_del(curr); /* page cache can grow under truncate */
+		spin_unlock(&pagecache_lock);
 
-	while (curr != head) {
-		page = list_entry(curr, struct page, list);
-		curr = curr->next;
+		/*
+		 * Nobody can try to list_del() the page pointed to by `curr'
+		 * from under us (we hold a reference on the page) and
+		 * so we don't need the lock held while adding the page
+		 * to the local dispose list. We only need to insert curr
+		 * into our internal dispose list before releasing our
+		 * reference on the page.
+		 */
+#if 1
+		curr->next = curr->prev = NULL; /*
+						 * Trigger an oops if somebody
+						 * tries to unlink the page
+						 * being processed from the
+						 * cache.
+						 */
+#endif
 
-		if (TryLockPage(page)) {
-			page_cache_get(page);
-			spin_unlock(&pagemap_lru_lock);
-			spin_unlock(&pagecache_lock);
-			wait_on_page(page);
-			page_cache_release(page);
-			goto repeat;
-		}
-		if (page->buffers) {
-			page_cache_get(page);
-			spin_unlock(&pagemap_lru_lock);
-			spin_unlock(&pagecache_lock);
-			block_destroy_buffers(page);
-			remove_inode_page(page);
-			lru_cache_del(page);
-			page_cache_release(page);
-			UnlockPage(page);
-			page_cache_release(page);
-			goto repeat;
-		}
-		__lru_cache_del(page);
-		__remove_inode_page(page);
+		memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
+		if (page->buffers)
+			block_flushpage(page, partial);
+
+		partial = 0;
+
+		/*
+		 * we have dropped the spinlock so we have to
+		 * restart.
+		 */
 		UnlockPage(page);
+		/*
+		 * This is the time to add `curr' to a valid list to allow
+		 * somebody else to unlink the page later.
+		 */
+		list_add(curr, &dispose);
 		page_cache_release(page);
-	}
+		goto repeat;
 
-	spin_unlock(&pagemap_lru_lock);
+	wait_unlock:
+		spin_unlock(&pagecache_lock);
+		___wait_on_page(page);
+		put_page(page);
+		goto repeat;
+	}
+	list_splice(&dispose, head);
 	spin_unlock(&pagecache_lock);
 }
 
-/*
- * nr_dirty represents the number of dirty pages that we will write async
- * before doing sync writes.  We can only do sync writes if we can
- * wait for IO (__GFP_IO set).
- */
-int shrink_mmap(int priority, int gfp_mask)
-{
-	int ret = 0, count, nr_dirty;
-	struct list_head * page_lru;
-	struct page * page = NULL;
-	
-	count = nr_lru_pages / (priority + 1);
-	nr_dirty = priority;
+static int FASTCALL(__shrink_mmap(int priority, zone_t *zone,
+				  unsigned long * __count,
+				  lru_cache_t * this_lru,
+				  int lru_type));
+static int __shrink_mmap(int priority, zone_t *zone,
+			 unsigned long * __count,
+			 lru_cache_t * this_lru,
+			 int lru_type)
+{
+	int ret = 0;
+	unsigned long count = *__count;
+	LIST_HEAD(young);
+	LIST_HEAD(old);
+	LIST_HEAD(forget);
+	struct list_head * page_lru, * dispose;
+	struct page * page;
+	spinlock_t * lru_lock = &this_lru->lock;
+	struct list_head * lru_head = &this_lru->heads[lru_type];
 
-	/* we need pagemap_lru_lock for list_del() ... subtle code below */
-	spin_lock(&pagemap_lru_lock);
-	while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
+	spin_lock(lru_lock);
+
+	while (count > 0 && (page_lru = lru_head->prev) != lru_head) {
 		page = list_entry(page_lru, struct page, lru);
 		list_del(page_lru);
 
-		if (PageTestandClearReferenced(page)) {
-			page->age += PG_AGE_ADV;
-			if (page->age > PG_AGE_MAX)
-				page->age = PG_AGE_MAX;
-			goto dispose_continue;
-		}
-		page->age -= min(PG_AGE_DECL, page->age);
-
-		if (page->age)
+		dispose = &old;
+		/* don't account passes over pages outside this classzone */
+		if (!memclass(page->zone, zone))
 			goto dispose_continue;
 
 		count--;
-		/*
-		 * Page is from a zone we don't care about.
-		 * Don't drop page cache entries in vain.
-		 */
-		if (page->zone->free_pages > page->zone->pages_high)
+
+		dispose = lru_head;
+		if (PageTestandClearReferenced(page))
+			/* Roll the page to the top of the lru list;
+			 * we could also be more aggressive and put
+			 * the page in the young dispose list, so as
+			 * to avoid freeing young pages in each pass.
+			 */
 			goto dispose_continue;
 
-		/*
-		 * Avoid unscalable SMP locking for pages we can
-		 * immediate tell are untouchable..
-		 */
+		dispose = &young;
+
+		/* avoid unscalable SMP locking */
 		if (!page->buffers && page_count(page) > 1)
 			goto dispose_continue;
 
 		if (TryLockPage(page))
 			goto dispose_continue;
 
-		/* Release the pagemap_lru lock even if the page is not yet
-		   queued in any lru queue since we have just locked down
-		   the page so nobody else may SMP race with us running
-		   a lru_cache_del() (lru_cache_del() always run with the
-		   page locked down ;). */
-		spin_unlock(&pagemap_lru_lock);
+		if (PageTestandSetOutLru(page))
+			BUG();
+		/*
+		 * We can release the lru_cache lock even if the page is not
+		 * queued in any list because we have just locked down
+		 * the page and marked the page as out of the lru list.
+		 */
+		spin_unlock(lru_lock);
 
 		/* avoid freeing the page while it's locked */
-		page_cache_get(page);
+		get_page(page);
 
-		/*
-		 * Is it a buffer page? Try to clean it up regardless
-		 * of zone - it's old.
-		 */
+		/* Is it a buffer page? */
 		if (page->buffers) {
-			int wait = ((gfp_mask & __GFP_IO) && (nr_dirty-- < 0));
-			if (!try_to_free_buffers(page, wait))
+			if (!try_to_free_buffers(page))
 				goto unlock_continue;
 			/* page was locked, inode can't go away under us */
 			if (!page->mapping) {
@@ -394,45 +364,45 @@
 		 * were to be marked referenced..
 		 */
 		if (PageSwapCache(page)) {
-			if (!PageDirty(page)) {
-				spin_unlock(&pagecache_lock);
-				__delete_from_swap_cache(page);
-				goto made_inode_progress;
-			}
-			/* PageDeferswap -> we swap out the page now. */
-			if (gfp_mask & __GFP_IO)
-				goto async_swap_continue;
-			goto cache_unlock_continue;
-		}
+			spin_unlock(&pagecache_lock);
+			__delete_from_swap_cache(page);
+			goto made_inode_progress;
+		}	
 
 		/* is it a page-cache page? */
 		if (page->mapping) {
-			if (!PageDirty(page) && !pgcache_under_min()) {
-				__remove_inode_page(page);
+			if (!PageDirty(page)) {
+				remove_page_from_inode_queue(page);
+				remove_page_from_hash_queue(page);
+				page->mapping = NULL;
 				spin_unlock(&pagecache_lock);
 				goto made_inode_progress;
 			}
 			goto cache_unlock_continue;
 		}
 
+		dispose = &forget;
 		printk(KERN_ERR "shrink_mmap: unknown LRU page!\n");
 
 cache_unlock_continue:
 		spin_unlock(&pagecache_lock);
 unlock_continue:
-		spin_lock(&pagemap_lru_lock);
+		spin_lock(lru_lock);
 		UnlockPage(page);
-		page_cache_release(page);
-		goto dispose_continue;
-async_swap_continue:
-		spin_unlock(&pagecache_lock);
-		/* Do NOT unlock the page ... that is done after IO. */
-		ClearPageDirty(page);
-		rw_swap_page(WRITE, page, 0);
-		spin_lock(&pagemap_lru_lock);
-		page_cache_release(page);
+		put_page(page);
+
+		if (!page->map_count || page->buffers)
+			list_add(page_lru, dispose);
+		else {
+			this_lru->nr_cache_pages--;
+			this_lru->nr_map_pages++;
+		}
+		if (!PageTestandClearOutLru(page))
+			BUG();
+		continue;
+
 dispose_continue:
-		list_add(page_lru, &lru_cache);
+		list_add(page_lru, dispose);
 	}
 	goto out;
 
@@ -440,18 +410,44 @@
 	page_cache_release(page);
 made_buffer_progress:
 	UnlockPage(page);
-	page_cache_release(page);
+	if (!PageTestandClearOutLru(page))
+		BUG();
+	put_page(page);
 	ret = 1;
-	spin_lock(&pagemap_lru_lock);
-	/* nr_lru_pages needs the spinlock */
-	nr_lru_pages--;
+	spin_lock(lru_lock);
+	/* nr_cache_pages needs the spinlock */
+	this_lru->nr_cache_pages--;
 
 out:
-	spin_unlock(&pagemap_lru_lock);
+	list_splice(&young, lru_head);
+	list_splice(&old, lru_head->prev);
+
+	spin_unlock(lru_lock);
 
+	*__count = count;
 	return ret;
 }
 
+int shrink_mmap(int priority, zone_t *zone)
+{
+	lru_cache_t * this_lru;
+	unsigned long count;
+	int i;
+
+	this_lru = &zone->zone_pgdat->lru_cache;
+
+	count = this_lru->nr_cache_pages;
+	if (lru_cache_under_min(count))
+		return 0;
+
+	count /= priority + 1;
+
+	for (i = 0; i < NR_LRU_CACHE; i++)
+		if (__shrink_mmap(priority, zone, &count, this_lru, i))
+			return 1;
+	return 0;
+}
+
 static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
 {
 	goto inside;
@@ -526,7 +522,7 @@
 		if (page->index < start)
 			continue;
 
-		page_cache_get(page);
+		get_page(page);
 		spin_unlock(&pagecache_lock);
 		lock_page(page);
 
@@ -563,18 +559,18 @@
  * The caller must have locked the page and 
  * set all the page flags correctly..
  */
-void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
+void add_to_swap_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
 {
 	if (!PageLocked(page))
 		BUG();
 
-	page_cache_get(page);
-	spin_lock(&pagecache_lock);
+	get_page(page);
 	page->index = index;
+	spin_lock(&pagecache_lock);
 	add_page_to_inode_queue(mapping, page);
 	__add_page_to_hash_queue(page, page_hash(mapping, index));
-	lru_cache_add(page);
 	spin_unlock(&pagecache_lock);
+	lru_cache_add(page, LRU_SWAP_CACHE);
 }
 
 /*
@@ -591,25 +587,18 @@
 	if (PageLocked(page))
 		BUG();
 
-	flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty));
-	page->flags = flags | (1 << PG_locked) | (1 << PG_referenced);
-	page_cache_get(page);
+	flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced));
+	page->flags = flags | (1 << PG_locked);
+	get_page(page);
 	page->index = offset;
 	add_page_to_inode_queue(mapping, page);
 	__add_page_to_hash_queue(page, hash);
-	lru_cache_add(page);
+	lru_cache_add(page, LRU_NORMAL_CACHE);
 	alias = __find_page_nolock(mapping, offset, *hash);
 	if (alias != page)
 		BUG();
 }
 
-void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
-{
-	spin_lock(&pagecache_lock);
-	__add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
-	spin_unlock(&pagecache_lock);
-}
-
 static int add_to_page_cache_unique(struct page * page,
 	struct address_space *mapping, unsigned long offset,
 	struct page **hash)
@@ -735,7 +724,7 @@
 	spin_lock(&pagecache_lock);
 	page = __find_page_nolock(mapping, offset, *hash);
 	if (page)
-		page_cache_get(page);
+		get_page(page);
 	spin_unlock(&pagecache_lock);
 
 	/* Found the page, sleep if locked. */
@@ -785,7 +774,7 @@
 	spin_lock(&pagecache_lock);
 	page = __find_page_nolock(mapping, offset, *hash);
 	if (page)
-		page_cache_get(page);
+		get_page(page);
 	spin_unlock(&pagecache_lock);
 
 	/* Found the page, sleep if locked. */
@@ -1143,7 +1132,7 @@
 		if (!page)
 			goto no_cached_page;
 found_page:
-		page_cache_get(page);
+		get_page(page);
 		spin_unlock(&pagecache_lock);
 
 		if (!Page_Uptodate(page))
@@ -1521,6 +1510,7 @@
 		struct page *new_page = page_cache_alloc();
 
 		if (new_page) {
+			page_anon_init_map_wmb(new_page);
 			copy_user_highpage(new_page, old_page, address);
 			flush_page_to_ram(new_page);
 		} else
@@ -1530,6 +1520,7 @@
 	}
 
 	flush_page_to_ram(old_page);
+	lru_cache_map(old_page);
 	return old_page;
 
 no_cached_page:
@@ -1646,7 +1637,8 @@
 		set_pte(ptep, pte_mkclean(pte));
 		flush_tlb_page(vma, address);
 		page = pte_page(pte);
-		page_cache_get(page);
+		page_map(page);
+		get_page(page);
 	} else {
 		if (pte_none(pte))
 			return 0;
@@ -1659,6 +1651,7 @@
 		}
 		page = pte_page(pte);
 		if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
+			lru_cache_unmap(page, LRU_NORMAL_CACHE);
 			page_cache_free(page);
 			return 0;
 		}
@@ -1672,6 +1665,7 @@
 	lock_page(page);
 	error = filemap_write_page(vma->vm_file, page, 1);
 	UnlockPage(page);
+	lru_cache_unmap(page, LRU_NORMAL_CACHE);
 	page_cache_free(page);
 	return error;
 }
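__shrink_mmap() above scans each per-node lru from the tail, parking every
page it examines on a local young or old list and splicing them back at the
end (young at the head, old at the tail), so one pass never revisits a page.
Below is a tiny userspace sketch of that scan-and-splice pattern with its own
minimal list helpers; the names are invented and the real function's locking,
classzone checks and freeing paths are left out.

/* illustrative sketch only, not from the patch */
#include <stdio.h>

struct list { struct list *next, *prev; };

static void list_init(struct list *h) { h->next = h->prev = h; }

static void list_add(struct list *n, struct list *h)	/* add right after h */
{
	n->next = h->next; n->prev = h;
	h->next->prev = n; h->next = n;
}

static void list_del(struct list *n)
{
	n->prev->next = n->next; n->next->prev = n->prev;
}

static void list_splice(struct list *from, struct list *at)	/* insert after at */
{
	if (from->next != from) {		/* non-empty */
		struct list *first = from->next, *last = from->prev;
		first->prev = at; last->next = at->next;
		at->next->prev = last; at->next = first;
	}
}

struct item { struct list lru; int referenced; char name; };

int main(void)
{
	struct item pages[4] = {
		{ .referenced = 1, .name = 'A' }, { .referenced = 0, .name = 'B' },
		{ .referenced = 1, .name = 'C' }, { .referenced = 0, .name = 'D' },
	};
	struct list lru, young, old, *pos;
	int i;

	list_init(&lru); list_init(&young); list_init(&old);
	for (i = 0; i < 4; i++)
		list_add(&pages[i].lru, &lru);	/* lru head-to-tail: D C B A */

	/* one pass: scan from the tail, park everything on the local lists */
	while ((pos = lru.prev) != &lru) {
		struct item *page = (struct item *)pos;	/* lru is the first member */

		list_del(pos);
		list_add(pos, page->referenced ? &young : &old);
	}

	list_splice(&young, &lru);		/* young pages back to the head */
	list_splice(&old, lru.prev);		/* old pages back to the tail */

	for (pos = lru.next; pos != &lru; pos = pos->next)
		printf("%c ", ((struct item *)pos)->name);	/* C A D B */
	printf("\n");
	return 0;
}

Note the final splice calls take the same shape as the patch's
list_splice(&young, lru_head) / list_splice(&old, lru_head->prev) pair.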
diff -urN 2.4.0-test1-ac7/mm/highmem.c 2.4.0-test1-ac7-VM-31/mm/highmem.c
--- 2.4.0-test1-ac7/mm/highmem.c	Fri May 26 22:47:10 2000
+++ 2.4.0-test1-ac7-VM-31/mm/highmem.c	Sat Jun  3 14:53:15 2000
@@ -29,8 +29,7 @@
  */
 struct page * prepare_highmem_swapout(struct page * page)
 {
-	struct page *new_page;
-	unsigned long regular_page;
+	struct page * regular_page;
 	unsigned long vaddr;
 	/*
 	 * If this is a highmem page so it can't be swapped out directly
@@ -48,22 +47,25 @@
 	 * across a fork().
 	 */
 	UnlockPage(page);
-	regular_page = __get_free_page(GFP_ATOMIC);
+	regular_page = alloc_page(GFP_ATOMIC);
 	if (!regular_page)
 		return NULL;
 
 	vaddr = kmap(page);
-	copy_page((void *)regular_page, (void *)vaddr);
+	copy_page((void *)page_address(regular_page), (void *)vaddr);
 	kunmap(page);
 
 	/*
 	 * ok, we can just forget about our highmem page since 
 	 * we stored its data into the new regular_page.
 	 */
+	if (page->map_count) {
+		regular_page->map_count = 1;
+		page_unmap(page);
+	}
 	page_cache_release(page);
-	new_page = mem_map + MAP_NR(regular_page);
-	LockPage(new_page);
-	return new_page;
+	LockPage(regular_page);
+	return regular_page;
 }
 
 struct page * replace_with_highmem(struct page * page)
@@ -86,9 +88,8 @@
 	copy_page((void *)vaddr, (void *)page_address(page));
 	kunmap(highpage);
 
-	/* Preserve the caching of the swap_entry. */
-	highpage->index = page->index;
-	highpage->mapping = page->mapping;
+	if (page->mapping)
+		BUG();
 
 	/*
 	 * We can just forget the old page since 
diff -urN 2.4.0-test1-ac7/mm/memory.c 2.4.0-test1-ac7-VM-31/mm/memory.c
--- 2.4.0-test1-ac7/mm/memory.c	Sat Jun  3 14:52:35 2000
+++ 2.4.0-test1-ac7-VM-31/mm/memory.c	Sat Jun  3 15:27:14 2000
@@ -156,6 +156,7 @@
 	unsigned long address = vma->vm_start;
 	unsigned long end = vma->vm_end;
 	unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
+	spinlock_t * pte_lock = &vma->vm_mm->page_table_lock;
 
 	src_pgd = pgd_offset(src, address)-1;
 	dst_pgd = pgd_offset(dst, address)-1;
@@ -208,9 +209,16 @@
 			src_pte = pte_offset(src_pmd, address);
 			dst_pte = pte_offset(dst_pmd, address);
 			
+			/*
+			 * This spin_lock isn't strictly necessary right now,
+			 * since everybody holds the kernel lock, but
+			 * it will become necessary later.
+			 */
+			spin_lock(pte_lock);
 			do {
 				pte_t pte = *src_pte;
 				unsigned long page_nr;
+				struct page * page;
 				
 				/* copy_one_pte */
 
@@ -235,15 +243,19 @@
 				/* If it's a shared mapping, mark it clean in the child */
 				if (vma->vm_flags & VM_SHARED)
 					pte = pte_mkclean(pte);
+				page = &mem_map[page_nr];
+				if (page->map_count)
+					page_map(page);
+				get_page(page);
 				set_pte(dst_pte, pte_mkold(pte));
-				get_page(mem_map + page_nr);
 			
 cont_copy_pte_range:		address += PAGE_SIZE;
 				if (address >= end)
-					goto out;
+					goto out_unlock;
 				src_pte++;
 				dst_pte++;
 			} while ((unsigned long)src_pte & PTE_TABLE_MASK);
+			spin_unlock(pte_lock);
 		
 cont_copy_pmd_range:	src_pmd++;
 			dst_pmd++;
@@ -251,6 +263,9 @@
 	}
 out:
 	return 0;
+out_unlock:
+	spin_unlock(pte_lock);
+	return 0;
 
 nomem:
 	return -ENOMEM;
@@ -259,20 +274,21 @@
 /*
  * Return indicates whether a page was freed so caller can adjust rss
  */
-static inline int free_pte(pte_t page)
+static inline int free_pte(pte_t pte)
 {
-	if (pte_present(page)) {
-		unsigned long nr = pte_pagenr(page);
-		if (nr >= max_mapnr || PageReserved(mem_map+nr))
+	if (pte_present(pte)) {
+		unsigned long nr = pte_pagenr(pte);
+		struct page * page = mem_map + nr;
+		if (nr >= max_mapnr || PageReserved(page))
 			return 0;
 		/* 
 		 * free_page() used to be able to clear swap cache
 		 * entries.  We may now have to do it manually.  
 		 */
-		free_page_and_swap_cache(mem_map+nr);
+		free_page_and_swap_cache(page);
 		return 1;
 	}
-	swap_free(pte_to_swp_entry(page));
+	swap_free(pte_to_swp_entry(pte));
 	return 0;
 }
 
@@ -781,8 +797,14 @@
  */
 static inline void establish_pte(struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t entry)
 {
-	flush_tlb_page(vma, address);
 	set_pte(page_table, entry);
+	/*
+	 * Sorry, but with the current linux VM design the tlb flush has
+	 * to happen after setting the pte, or threads will break on SMP
+	 * for everybody (yes, even on architectures that can't flush
+	 * the tlb without a valid pte entry in place).
+	 */
+	flush_tlb_page(vma, address);
 	update_mmu_cache(vma, address, entry);
 }
 
@@ -792,7 +814,7 @@
 	copy_cow_page(old_page,new_page,address);
 	flush_page_to_ram(new_page);
 	flush_cache_page(vma, address);
-	establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
+	establish_pte(vma, address, page_table, pte_mkyoung(pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))));
 }
 
 /*
@@ -836,6 +858,7 @@
 	 */
 	switch (page_count(old_page)) {
 	case 2:
+	case 3:
 		/*
 		 * Lock the page so that no one can look it up from
 		 * the swap cache, grab a reference and start using it.
@@ -843,14 +866,20 @@
 		 */
 		if (!PageSwapCache(old_page) || TryLockPage(old_page))
 			break;
-		if (is_page_shared(old_page)) {
+		if (is_swap_cache_shared(old_page)) {
 			UnlockPage(old_page);
 			break;
 		}
-		SetPageDirty(old_page);
+		lru_cache_unmap(old_page, LRU_SWAP_CACHE);
+		delete_from_swap_cache_nolock(old_page);
 		UnlockPage(old_page);
+		page_anon_init_map_wmb(old_page);
 		/* FallThrough */
 	case 1:
+		if (PageReserved(old_page))
+			break;
+		if (old_page->map_count != 1)
+			BUG();
 		flush_cache_page(vma, address);
 		establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte))));
 		spin_unlock(&mm->page_table_lock);
@@ -870,8 +899,20 @@
 	 * Re-check the pte - we dropped the lock
 	 */
 	if (pte_val(*page_table) == pte_val(pte)) {
-		if (PageReserved(old_page))
+		if (!PageReserved(old_page)) {
+			/* SHM memory is never write protected */
+			if (!old_page->map_count)
+				BUG();
+			lru_cache_unmap(old_page,
+					!PageSwapCache(old_page) ?
+					LRU_NORMAL_CACHE :
+					LRU_SWAP_CACHE);
+		} else {
+			if (old_page->map_count)
+				BUG();
 			++mm->rss;
+		}
+		page_anon_init_map_wmb(new_page);
 		break_cow(vma, old_page, new_page, address, page_table);
 
 		/* Free the old page.. */
@@ -1058,14 +1099,22 @@
 	 */
 	lock_page(page);
 	swap_free(entry);
-	if (write_access && !is_page_shared(page) && nr_free_highpages()) {
+	if (write_access && !is_swap_cache_shared(page)) {
 		delete_from_swap_cache_nolock(page);
 		UnlockPage(page);
 		page = replace_with_highmem(page);
+		page_anon_init_map_wmb(page);
 		pte = mk_pte(page, vma->vm_page_prot);
 		pte = pte_mkwrite(pte_mkdirty(pte));
-	} else
+	} else {
 		UnlockPage(page);
+		/*
+		 * No need for the page lock: PG_out_lru keeps us
+		 * from doing a list_del on a page that isn't in the
+		 * lru.
+		 */
+		lru_cache_map(page);
+	}
 
 	set_pte(page_table, pte);
 	/* No need to invalidate - it was non-present before */
@@ -1078,15 +1127,13 @@
  */
 static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr)
 {
-	int high = 0;
 	struct page *page = NULL;
 	pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
 	if (write_access) {
 		page = alloc_page(GFP_HIGHUSER);
 		if (!page)
 			return -1;
-		if (PageHighMem(page))
-			high = 1;
+		page_anon_init_map_wmb(page);
 		clear_user_highpage(page, addr);
 		entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 		mm->rss++;
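
Note on the new map_count handling in mm/memory.c: alongside the usual page count the patch keeps a mapping count, initialised with page_anon_init_map_wmb() for fresh anonymous pages, raised with page_map() when another pagetable starts referencing an already-accounted page (the copy_page_range hunk), and dropped with page_unmap(). A minimal user-space sketch of that dual counting; the helper bodies are invented here, since the real ones live in headers not included in this patch:

#include <assert.h>
#include <stdio.h>

struct page {
	int count;	/* ordinary references: get_page()/put_page() */
	int map_count;	/* pagetable mappings: page_map()/page_unmap() */
};

static void get_page(struct page *p) { p->count++; }
static void put_page(struct page *p) { assert(p->count > 0); p->count--; }

/* fresh anonymous page: starts out with exactly one mapping
 * (the wmb in the real name is about publishing the counter, modelled away here) */
static void page_anon_init_map_wmb(struct page *p) { p->map_count = 1; }

static void page_map(struct page *p)   { p->map_count++; }
static void page_unmap(struct page *p) { assert(p->map_count > 0); p->map_count--; }

/* what the copy_page_range hunk does per copied pte */
static void copy_one_pte(struct page *p)
{
	if (p->map_count)	/* only pages that take part in the map accounting */
		page_map(p);
	get_page(p);
}

int main(void)
{
	struct page anon = { .count = 1 };

	page_anon_init_map_wmb(&anon);	/* do_anonymous_page() path */
	copy_one_pte(&anon);		/* fork() duplicates the mapping */
	printf("count=%d map_count=%d\n", anon.count, anon.map_count);

	page_unmap(&anon);		/* child tears its mapping down */
	put_page(&anon);
	return 0;
}

The BUG() checks added in do_wp_page() (old_page->map_count != 1 on the fast path, !old_page->map_count before unmapping) are assertions that these two counters stay in sync with the ptes.
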
diff -urN 2.4.0-test1-ac7/mm/mmap.c 2.4.0-test1-ac7-VM-31/mm/mmap.c
--- 2.4.0-test1-ac7/mm/mmap.c	Thu Apr 27 08:56:45 2000
+++ 2.4.0-test1-ac7-VM-31/mm/mmap.c	Sat Jun  3 14:53:15 2000
@@ -56,7 +56,7 @@
 	 * of num_physpages for safety margin.
 	 */
 
-	long free;
+	unsigned long free;
 	
         /* Sometimes we want to use more memory than we have. */
 	if (sysctl_overcommit_memory)
diff -urN 2.4.0-test1-ac7/mm/numa.c 2.4.0-test1-ac7-VM-31/mm/numa.c
--- 2.4.0-test1-ac7/mm/numa.c	Tue Apr 18 07:11:42 2000
+++ 2.4.0-test1-ac7-VM-31/mm/numa.c	Sat Jun  3 14:53:15 2000
@@ -33,7 +33,7 @@
 
 struct page * alloc_pages_node(int nid, int gfp_mask, unsigned long order)
 {
-	return __alloc_pages(NODE_DATA(nid)->node_zonelists + gfp_mask, order);
+	return __alloc_pages(NODE_DATA(nid)->node_gfpmask_zone + gfp_mask, order);
 }
 
 #ifdef CONFIG_DISCONTIGMEM
diff -urN 2.4.0-test1-ac7/mm/page_alloc.c 2.4.0-test1-ac7-VM-31/mm/page_alloc.c
--- 2.4.0-test1-ac7/mm/page_alloc.c	Sat Jun  3 14:52:35 2000
+++ 2.4.0-test1-ac7-VM-31/mm/page_alloc.c	Sat Jun  3 16:26:49 2000
@@ -25,11 +25,10 @@
 #endif
 
 int nr_swap_pages;
-int nr_lru_pages;
 pg_data_t *pgdat_list;
 
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
-static int zone_balance_ratio[MAX_NR_ZONES] = { 128, 128, 512, };
+static int zone_balance_ratio[MAX_NR_ZONES] = { 128, 128, 128, };
 static int zone_balance_min[MAX_NR_ZONES] = { 10 , 10, 10, };
 static int zone_balance_max[MAX_NR_ZONES] = { 255 , 255, 255, };
 
@@ -70,6 +69,8 @@
 	free_area_t *area;
 	struct page *base;
 	zone_t *zone;
+	spinlock_t * freelist_lock;
+	pg_data_t * pgdat;
 
 	/*
 	 * Subtle. We do not want to test this in the inlined part of
@@ -95,6 +96,10 @@
 		BUG();
 	if (PageDirty(page))
 		BUG();
+	if (PageOutLru(page))
+		BUG();
+	if (page->map_count)
+		BUG();
 
 	zone = page->zone;
 
@@ -107,10 +112,25 @@
 
 	area = zone->free_area + order;
 
-	spin_lock_irqsave(&zone->lock, flags);
+	pgdat = zone->zone_pgdat;
+	freelist_lock = &pgdat->freelist_lock;
+	spin_lock_irqsave(freelist_lock, flags);
 
 	zone->free_pages -= mask;
 
+	/* update the classzone */
+	{
+		int nr_zone = zone->nr_zone;
+		register zone_t * z = zone;
+		do {
+			z->classzone_free_pages -= mask;
+			if (z->zone_wake_kswapd &&
+			    z->classzone_free_pages > z->pages_high)
+				z->zone_wake_kswapd = 0;
+			z++;
+		} while (++nr_zone < pgdat->nr_zones);
+	}
+
 	while (mask + (1 << (MAX_ORDER-1))) {
 		struct page *buddy1, *buddy2;
 
@@ -138,16 +158,7 @@
 		page_idx &= mask;
 	}
 	memlist_add_head(&(base + page_idx)->list, &area->free_list);
-
-	spin_unlock_irqrestore(&zone->lock, flags);
-
-	if (zone->free_pages >= zone->pages_low) {
-		zone->low_on_memory = 0;
-	}
-
-	if (zone->free_pages >= zone->pages_high) {
-		zone->zone_wake_kswapd = 0;
-	}
+	spin_unlock_irqrestore(freelist_lock, flags);
 }
 
 #define MARK_USED(index, order, area) \
@@ -174,16 +185,14 @@
 	return page;
 }
 
-static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order));
-static struct page * rmqueue(zone_t *zone, unsigned long order)
+static inline struct page * rmqueue(zone_t *zone, unsigned long order, unsigned long flags)
 {
 	free_area_t * area = zone->free_area + order;
 	unsigned long curr_order = order;
 	struct list_head *head, *curr;
-	unsigned long flags;
 	struct page *page;
+	pg_data_t * pgdat;
 
-	spin_lock_irqsave(&zone->lock, flags);
 	do {
 		head = &area->free_list;
 		curr = memlist_next(head);
@@ -197,10 +206,21 @@
 			memlist_del(curr);
 			index = (page - mem_map) - zone->offset;
 			MARK_USED(index, curr_order, area);
-			zone->free_pages -= 1 << order;
+
+			zone->free_pages -= 1UL << order;
+			pgdat = zone->zone_pgdat;
+			/* update the classzone */
+			{
+				int nr_zone = zone->nr_zone;
+				register zone_t * z = zone;
+				do {
+					z->classzone_free_pages -= 1UL<<order;
+					z++;
+				} while (++nr_zone < pgdat->nr_zones);
+			}
 
 			page = expand(zone, page, index, order, curr_order, area);
-			spin_unlock_irqrestore(&zone->lock, flags);
+			spin_unlock_irqrestore(&pgdat->freelist_lock, flags);
 
 			set_page_count(page, 1);
 			if (BAD_RANGE(zone,page))
@@ -210,7 +230,6 @@
 		curr_order++;
 		area++;
 	} while (curr_order < MAX_ORDER);
-	spin_unlock_irqrestore(&zone->lock, flags);
 
 	return NULL;
 }
@@ -218,141 +237,130 @@
 /*
  * This is the 'heart' of the zoned buddy allocator:
  */
-struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
+struct page * __alloc_pages(gfpmask_zone_t * gfpmask_zone, unsigned long order)
 {
-	zone_t **zone = zonelist->zones;
-	extern wait_queue_head_t kswapd_wait;
+	zone_t * classzone = gfpmask_zone->classzone;
+	pg_data_t * pgdat = classzone->zone_pgdat;
+	int freed;
+	spinlock_t * freelist_lock = &pgdat->freelist_lock;
+	long flags;
+	long free_pages;
+	unsigned long size = 1UL << order;
+
+	spin_lock_irqsave(freelist_lock, flags);
 
 	/*
-	 * (If anyone calls gfp from interrupts nonatomically then it
-	 * will sooner or later tripped up by a schedule().)
-	 *
-	 * We are falling back to lower-level zones if allocation
-	 * in a higher zone fails.
+	 * If this is a recursive call, we'd better
+	 * do our best to just allocate things without
+	 * further thought.
 	 */
-	for (;;) {
-		zone_t *z = *(zone++);
-		if (!z)
-			break;
-		if (!z->size)
-			BUG();
+	if (current->flags & PF_MEMALLOC)
+		goto allocate_ok;
 
-		/* Are we supposed to free memory? Don't make it worse.. */
-		if (!z->zone_wake_kswapd) {
-			struct page *page = rmqueue(z, order);
-			if (z->free_pages < z->pages_low) {
-				z->zone_wake_kswapd = 1;
+	/* classzone based memory balancing */
+	free_pages = classzone->classzone_free_pages;
+	if (!current->low_on_memory &&
+	    free_pages > classzone->pages_low) {
+		int nr_zone;
+		zone_t * z;
+
+	allocate_ok:
+		z = classzone;
+		for (nr_zone = classzone->nr_zone;
+		     nr_zone >= 0;
+		     nr_zone--, z--) {
+			if (z->free_pages >= size) {
+				struct page *page = rmqueue(z, order, flags);
+				if (page)
+					return page;
 			}
-			if (page)
-				return page;
 		}
-	}
+	} else {
+		extern wait_queue_head_t kswapd_wait;
 
-	/* All zones are in need of kswapd. */
-	if (waitqueue_active(&kswapd_wait))
-		wake_up_interruptible(&kswapd_wait);
+		if (free_pages > classzone->pages_low) {
+		high_mem:
+			if (current->low_on_memory)
+				current->low_on_memory = 0;
+			goto allocate_ok;
+		}
 
-	/*
-	 * Ok, we don't have any zones that don't need some
-	 * balancing.. See if we have any that aren't critical..
-	 */
-	zone = zonelist->zones;
-	for (;;) {
-		zone_t *z = *(zone++);
-		if (!z)
-			break;
-		if (!z->low_on_memory) {
-			struct page *page = rmqueue(z, order);
-			if (z->free_pages < (z->pages_min + z->pages_low) / 2)
-				z->low_on_memory = 1;
-			if (page)
-				return page;
+		if (!classzone->zone_wake_kswapd) {
+			classzone->zone_wake_kswapd = 1;
+			wake_up_interruptible(&kswapd_wait);
 		}
-	}
 
-	/*
-	 * Uhhuh. All the zones have been critical, which means that
-	 * we'd better do some synchronous swap-out. kswapd has not
-	 * been able to cope..
-	 */
-	if (!(current->flags & PF_MEMALLOC)) {
-		int gfp_mask = zonelist->gfp_mask;
-		if (!try_to_free_pages(gfp_mask)) {
-			if (!(gfp_mask & __GFP_HIGH))
-				goto fail;
+		/* Are we reaching the critical stage? */
+		if (!current->low_on_memory) {
+			/* Not yet critical, so let kswapd handle it.. */
+			if (free_pages > classzone->pages_min)
+				goto allocate_ok;
+			current->low_on_memory = 1;
 		}
-	}
 
-	/*
-	 * We freed something, so we're allowed to allocate anything we can!
-	 */
-	zone = zonelist->zones;
-	for (;;) {
-		struct page *page;
+		spin_unlock_irqrestore(freelist_lock, flags);
+		freed = try_to_free_pages(gfpmask_zone->gfp_mask, classzone);
+		spin_lock_irq(freelist_lock);
 
-		zone_t *z = *(zone++);
-		if (!z)
-			break;
-		page = rmqueue(z, order);
-		if (page)
-			return page;
-	}
+		if (freed || gfpmask_zone->gfp_mask & __GFP_HIGH)
+			goto allocate_ok;
 
-fail:
-	/* Last try, zone->low_on_memory isn't reset until we hit pages_low */
-	zone = zonelist->zones;
-	for (;;) {
-		zone_t *z = *(zone++);
-		int gfp_mask = zonelist->gfp_mask;
-		if (!z)
-			break;
-		if (z->free_pages > z->pages_min) {
-			struct page *page = rmqueue(z, order);
-			if (page)
-				return page;
-		}
+		/*
+		 * Re-check that we're low on memory with the spinlock
+		 * still held before failing. Somebody may have released
+		 * lots of memory from under us while we were trying
+		 * to free the pages. We check against pages_high
+		 * to be sure we only succeed if lots of memory has
+		 * been released.
+		 */
+		free_pages = classzone->classzone_free_pages;
+		if (free_pages > classzone->pages_high)
+			goto high_mem;
 	}
-	/* No luck.. */
+	spin_unlock_irqrestore(freelist_lock, flags);
 	return NULL;
 }
 
 /*
  * Total amount of free (allocatable) RAM:
  */
-unsigned int nr_free_pages (void)
+unsigned long nr_free_pages (void)
 {
-	unsigned int sum;
-	zone_t *zone;
+	unsigned long sum;
 	int i;
 
 	sum = 0;
-	for (i = 0; i < NUMNODES; i++)
-		for (zone = NODE_DATA(i)->node_zones; zone < NODE_DATA(i)->node_zones + MAX_NR_ZONES; zone++)
-			sum += zone->free_pages;
+	for (i = 0; i < NUMNODES; i++) {
+		pg_data_t * pgdat = NODE_DATA(i);
+		zone_t * node_zones = pgdat->node_zones;
+		sum += node_zones[pgdat->nr_zones-1].classzone_free_pages;
+	}
 	return sum;
 }
 
 /*
  * Amount of free RAM allocatable as buffer memory:
  */
-unsigned int nr_free_buffer_pages (void)
+unsigned long nr_free_buffer_pages (void)
 {
-	unsigned int sum;
-	zone_t *zone;
+	unsigned long sum = 0;
 	int i;
 
-	sum = nr_lru_pages;
-	for (i = 0; i < NUMNODES; i++)
-		for (zone = NODE_DATA(i)->node_zones; zone <= NODE_DATA(i)->node_zones+ZONE_NORMAL; zone++)
-			sum += zone->free_pages;
+	for (i = 0; i < NUMNODES; i++) {
+		pg_data_t * pgdat = NODE_DATA(i);
+		zone_t * node_zones = pgdat->node_zones;
+		int higher_zone = pgdat->nr_zones-1;
+		sum += pgdat->lru_cache.nr_cache_pages;
+		sum += node_zones[higher_zone <= ZONE_NORMAL ? higher_zone : ZONE_NORMAL].classzone_free_pages;
+	}
 	return sum;
 }
 
 #if CONFIG_HIGHMEM
-unsigned int nr_free_highpages (void)
+unsigned long nr_free_highpages (void)
 {
 	int i;
-	unsigned int pages = 0;
+	unsigned long pages = 0;
 
 	for (i = 0; i < NUMNODES; i++)
 		pages += NODE_DATA(i)->node_zones[ZONE_HIGHMEM].free_pages;
@@ -367,30 +375,33 @@
  */
 void show_free_areas_core(int nid)
 {
- 	unsigned long order;
+ 	unsigned long order, flags;
 	unsigned type;
+	pg_data_t * pgdat = NODE_DATA(nid);
+	spinlock_t * freelist_lock = &pgdat->freelist_lock;
 
-	printk("Free pages:      %6dkB (%6dkB HighMem)\n",
+	printk("Free pages:      %6lukB (%6lukB HighMem)\n",
 		nr_free_pages() << (PAGE_SHIFT-10),
 		nr_free_highpages() << (PAGE_SHIFT-10));
 
-	printk("( Free: %d, lru_cache: %d (%d %d %d) )\n",
+	printk("( Free: %lu, cache: %lu map: %lu (%d %d %d) )\n",
 		nr_free_pages(),
-		nr_lru_pages,
+		NODE_DATA(nid)->lru_cache.nr_cache_pages,
+		NODE_DATA(nid)->lru_cache.nr_map_pages,
 		freepages.min,
 		freepages.low,
 		freepages.high);
 
+	spin_lock_irqsave(freelist_lock, flags);
 	for (type = 0; type < MAX_NR_ZONES; type++) {
 		struct list_head *head, *curr;
-		zone_t *zone = NODE_DATA(nid)->node_zones + type;
- 		unsigned long nr, total, flags;
+		zone_t *zone = pgdat->node_zones + type;
+ 		unsigned long nr, total;
 
-		printk("  %s: ", zone->name);
+		printk("%s: ", zone->name);
 
 		total = 0;
 		if (zone->size) {
-			spin_lock_irqsave(&zone->lock, flags);
 		 	for (order = 0; order < MAX_ORDER; order++) {
 				head = &(zone->free_area + order)->free_list;
 				curr = head;
@@ -405,10 +416,15 @@
 				printk("%lu*%lukB ", nr,
 						(PAGE_SIZE>>10) << order);
 			}
-			spin_unlock_irqrestore(&zone->lock, flags);
+			if (total != zone->free_pages)
+				printk("error %lu ",
+				       zone->free_pages * (PAGE_SIZE>>10));
 		}
-		printk("= %lukB)\n", total * (PAGE_SIZE>>10));
+		printk("= %lukB", total * (PAGE_SIZE>>10));
+		printk(" class %ldkB\n",
+		       zone->classzone_free_pages * (PAGE_SIZE>>10));
 	}
+	spin_unlock_irqrestore(freelist_lock, flags);
 
 #ifdef SWAP_CACHE_INFO
 	show_swap_cache_info();
@@ -423,18 +439,17 @@
 /*
  * Builds allocation fallback zone lists.
  */
-static inline void build_zonelists(pg_data_t *pgdat)
+static void __init build_gfpmask_zone(pg_data_t *pgdat)
 {
 	int i, j, k;
 
 	for (i = 0; i < NR_GFPINDEX; i++) {
-		zonelist_t *zonelist;
+		gfpmask_zone_t * gfpmask_zone;
 		zone_t *zone;
 
-		zonelist = pgdat->node_zonelists + i;
-		memset(zonelist, 0, sizeof(*zonelist));
+		gfpmask_zone = pgdat->node_gfpmask_zone + i;
 
-		zonelist->gfp_mask = i;
+		gfpmask_zone->gfp_mask = i;
 		j = 0;
 		k = ZONE_NORMAL;
 		if (i & __GFP_HIGHMEM)
@@ -454,21 +469,37 @@
 #ifndef CONFIG_HIGHMEM
 					BUG();
 #endif
-					zonelist->zones[j++] = zone;
+					gfpmask_zone->classzone = zone;
+					break;
 				}
 			case ZONE_NORMAL:
 				zone = pgdat->node_zones + ZONE_NORMAL;
-				if (zone->size)
-					zonelist->zones[j++] = zone;
+				if (zone->size) {
+					gfpmask_zone->classzone = zone;
+					break;
+				}
 			case ZONE_DMA:
 				zone = pgdat->node_zones + ZONE_DMA;
-				if (zone->size)
-					zonelist->zones[j++] = zone;
+				if (zone->size) {
+					gfpmask_zone->classzone = zone;
+					break;
+				}
 		}
-		zonelist->zones[j++] = NULL;
 	} 
 }
 
+static void __init lru_cache_init(pg_data_t * pgdat)
+{
+	int i;
+	lru_cache_t * this_lru = &pgdat->lru_cache;
+
+	for (i = 0; i < NR_LRU_CACHE; i++)
+		INIT_LIST_HEAD(&this_lru->heads[i]);
+	this_lru->nr_cache_pages = 0;
+	this_lru->nr_map_pages = 0;
+	spin_lock_init(&this_lru->lock);
+}
+
 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
 
 /*
@@ -485,7 +516,7 @@
 	unsigned long i, j;
 	unsigned long map_size;
 	unsigned long totalpages, offset, realtotalpages;
-	unsigned int cumulative = 0;
+	unsigned long classzonepages;
 
 	pgdat->node_next = pgdat_list;
 	pgdat_list = pgdat;
@@ -517,7 +548,6 @@
 	freepages.min += i;
 	freepages.low += i * 2;
 	freepages.high += i * 3;
-	memlist_init(&lru_cache);
 
 	/*
 	 * Some architectures (with lots of mem and discontinous memory
@@ -534,6 +564,8 @@
 	pgdat->node_size = totalpages;
 	pgdat->node_start_paddr = zone_start_paddr;
 	pgdat->node_start_mapnr = (lmem_map - mem_map);
+	pgdat->nr_zones = 0;
+	spin_lock_init(&pgdat->freelist_lock);
 
 	/*
 	 * Initially all pages are reserved - free ones are freed
@@ -548,6 +580,7 @@
 	}
 
 	offset = lmem_map - mem_map;	
+	classzonepages = 0;
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		zone_t *zone = pgdat->node_zones + j;
 		unsigned long mask;
@@ -556,19 +589,22 @@
 		realsize = size = zones_size[j];
 		if (zholes_size)
 			realsize -= zholes_size[j];
+		classzonepages += realsize;
 
 		printk("zone(%lu): %lu pages.\n", j, size);
 		zone->size = size;
 		zone->name = zone_names[j];
-		zone->lock = SPIN_LOCK_UNLOCKED;
 		zone->zone_pgdat = pgdat;
+		zone->nr_zone = j;
 		zone->free_pages = 0;
+		zone->zone_wake_kswapd = 0;
+		zone->classzone_free_pages = 0;
 		if (!size)
 			continue;
+		pgdat->nr_zones = j+1;
 
 		zone->offset = offset;
-		cumulative += size;
-		mask = (realsize / zone_balance_ratio[j]);
+		mask = (classzonepages / zone_balance_ratio[j]);
 		if (mask < zone_balance_min[j])
 			mask = zone_balance_min[j];
 		else if (mask > zone_balance_max[j])
@@ -576,8 +612,6 @@
 		zone->pages_min = mask;
 		zone->pages_low = mask*2;
 		zone->pages_high = mask*3;
-		zone->low_on_memory = 0;
-		zone->zone_wake_kswapd = 0;
 		zone->zone_mem_map = mem_map + offset;
 		zone->zone_start_mapnr = offset;
 		zone->zone_start_paddr = zone_start_paddr;
@@ -606,7 +640,8 @@
 			  (unsigned int *) alloc_bootmem_node(nid, bitmap_size);
 		}
 	}
-	build_zonelists(pgdat);
+	build_gfpmask_zone(pgdat);
+	lru_cache_init(pgdat);
 }
 
 void __init free_area_init(unsigned long *zones_size)
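
Note on the classzone accounting introduced in mm/page_alloc.c above: per-zone zone->lock and low_on_memory are gone, and balancing is keyed off classzone_free_pages, where each zone's value covers its own free pages plus those of every lower zone in the node, so __alloc_pages() can compare a single number against the classzone's pages_low/pages_min/pages_high watermarks. A compilable user-space sketch of just that bookkeeping (simplified structs, single node, no freelist_lock):

#include <stdio.h>

#define MAX_NR_ZONES 3

typedef struct zone {
	int nr_zone;				/* index within the node */
	unsigned long free_pages;		/* free pages in this zone only */
	unsigned long classzone_free_pages;	/* this zone plus all lower zones */
	unsigned long pages_low, pages_high;
	int zone_wake_kswapd;
} zone_t;

typedef struct pg_data {
	int nr_zones;
	zone_t node_zones[MAX_NR_ZONES];
} pg_data_t;

/* mirrors the "update the classzone" loops in __free_pages_ok()/rmqueue() */
static void classzone_account(pg_data_t *pgdat, zone_t *zone, long nr_pages)
{
	zone->free_pages += nr_pages;
	for (int nr_zone = zone->nr_zone; nr_zone < pgdat->nr_zones; nr_zone++) {
		zone_t *z = &pgdat->node_zones[nr_zone];
		z->classzone_free_pages += nr_pages;
		/* freeing enough memory lets the zone stop nagging kswapd */
		if (nr_pages > 0 && z->zone_wake_kswapd &&
		    z->classzone_free_pages > z->pages_high)
			z->zone_wake_kswapd = 0;
	}
}

int main(void)
{
	pg_data_t pgdat = { .nr_zones = 3 };

	for (int i = 0; i < 3; i++) {
		pgdat.node_zones[i].nr_zone = i;
		pgdat.node_zones[i].pages_low = 20;
		pgdat.node_zones[i].pages_high = 30;
	}

	classzone_account(&pgdat, &pgdat.node_zones[0], 64);	/* free 64 DMA pages */
	classzone_account(&pgdat, &pgdat.node_zones[1], 32);	/* free 32 normal pages */
	classzone_account(&pgdat, &pgdat.node_zones[1], -8);	/* allocate 8 normal pages */

	for (int i = 0; i < 3; i++)
		printf("zone %d: free=%lu classzone_free=%lu\n", i,
		       pgdat.node_zones[i].free_pages,
		       pgdat.node_zones[i].classzone_free_pages);
	return 0;
}

Freeing 1<<order pages in a zone credits that zone and every zone above it in the node, and rmqueue() debits the same way; that is all the "update the classzone" loops in the hunks above do.
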
diff -urN 2.4.0-test1-ac7/mm/swap.c 2.4.0-test1-ac7-VM-31/mm/swap.c
--- 2.4.0-test1-ac7/mm/swap.c	Tue Dec  7 15:05:28 1999
+++ 2.4.0-test1-ac7-VM-31/mm/swap.c	Sat Jun  3 14:53:15 2000
@@ -46,13 +46,7 @@
    out, so that we don't try to swap TOO many pages out at once */
 atomic_t nr_async_pages = ATOMIC_INIT(0);
 
-buffer_mem_t buffer_mem = {
-	2,	/* minimum percent buffer */
-	10,	/* borrow percent buffer */
-	60	/* maximum percent buffer */
-};
-
-buffer_mem_t page_cache = {
+buffer_mem_t lru_cache_mem = {
 	2,	/* minimum percent page cache */
 	15,	/* borrow percent page cache */
 	75	/* maximum */
diff -urN 2.4.0-test1-ac7/mm/swap_state.c 2.4.0-test1-ac7-VM-31/mm/swap_state.c
--- 2.4.0-test1-ac7/mm/swap_state.c	Sat Jun  3 14:52:35 2000
+++ 2.4.0-test1-ac7-VM-31/mm/swap_state.c	Sat Jun  3 15:24:36 2000
@@ -47,8 +47,6 @@
 
 void add_to_swap_cache(struct page *page, swp_entry_t entry)
 {
-	unsigned long flags;
-
 #ifdef SWAP_CACHE_INFO
 	swap_cache_add_total++;
 #endif
@@ -58,9 +56,8 @@
 		BUG();
 	if (page->mapping)
 		BUG();
-	flags = page->flags & ~((1 << PG_error) | (1 << PG_dirty));
-	page->flags = flags | (1 << PG_referenced) | (1 << PG_uptodate);
-	add_to_page_cache_locked(page, &swapper_space, entry.val);
+	page->flags &= ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced));
+	add_to_swap_cache_locked(page, &swapper_space, entry.val);
 }
 
 static inline void remove_from_swap_cache(struct page *page)
@@ -73,7 +70,6 @@
 		PAGE_BUG(page);
 
 	PageClearSwapCache(page);
-	ClearPageDirty(page);
 	remove_inode_page(page);
 }
 
@@ -132,13 +128,22 @@
 	/* 
 	 * If we are the only user, then try to free up the swap cache. 
 	 */
-	if (PageSwapCache(page) && !TryLockPage(page)) {
-		if (!is_page_shared(page)) {
-			delete_from_swap_cache_nolock(page);
+	if (!PageSwapCache(page)) {
+		if (page->map_count)
+			lru_cache_unmap(page, LRU_NORMAL_CACHE);
+	} else {
+		if (page->map_count <= 0)
+			BUG();
+		lru_cache_unmap(page, LRU_SWAP_CACHE);
+
+		if (!TryLockPage(page)) {
+			if (!is_swap_cache_shared(page))
+				delete_from_swap_cache_nolock(page);
+			UnlockPage(page);
 		}
-		UnlockPage(page);
 	}
-	page_cache_release(page);
+
+	__free_page(page);
 }
 
 
@@ -205,7 +210,6 @@
 struct page * read_swap_cache_async(swp_entry_t entry, int wait)
 {
 	struct page *found_page = 0, *new_page;
-	unsigned long new_page_addr;
 	
 	/*
 	 * Make sure the swap entry is still in use.
@@ -219,10 +223,9 @@
 	if (found_page)
 		goto out_free_swap;
 
-	new_page_addr = __get_free_page(GFP_USER);
-	if (!new_page_addr)
+	new_page = alloc_page(GFP_USER);
+	if (!new_page)
 		goto out_free_swap;	/* Out of memory */
-	new_page = mem_map + MAP_NR(new_page_addr);
 
 	/*
 	 * Check the swap cache again, in case we stalled above.
@@ -233,7 +236,8 @@
 	/* 
 	 * Add it to the swap cache and read its contents.
 	 */
-	lock_page(new_page);
+	if (TryLockPage(new_page))
+		BUG();
 	add_to_swap_cache(new_page, entry);
 	rw_swap_page(READ, new_page, wait);
 	return new_page;
diff -urN 2.4.0-test1-ac7/mm/swapfile.c 2.4.0-test1-ac7-VM-31/mm/swapfile.c
--- 2.4.0-test1-ac7/mm/swapfile.c	Fri May 26 22:47:10 2000
+++ 2.4.0-test1-ac7-VM-31/mm/swapfile.c	Sat Jun  3 14:53:15 2000
@@ -230,6 +230,7 @@
 		return;
 	set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 	swap_free(entry);
+	lru_cache_map(page);
 	get_page(page);
 	++vma->vm_mm->rss;
 }
@@ -315,10 +316,20 @@
 	 */
 	if (!mm)
 		return;
+	/*
+	 * Keep the vmas from going away from under us
+	 * and also keep the task from playing with its
+	 * pagetables under do_wp_page(). If the
+	 * vmlist_modify_lock didn't acquire the
+	 * mm->page_table_lock spinlock we would have to
+	 * acquire it by hand here.
+	 */
+	vmlist_access_lock(mm);
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		pgd_t * pgd = pgd_offset(mm, vma->vm_start);
 		unuse_vma(vma, pgd, entry, page);
 	}
+	vmlist_access_unlock(mm);
 	return;
 }
 
diff -urN 2.4.0-test1-ac7/mm/vmscan.c 2.4.0-test1-ac7-VM-31/mm/vmscan.c
--- 2.4.0-test1-ac7/mm/vmscan.c	Sat Jun  3 14:52:35 2000
+++ 2.4.0-test1-ac7-VM-31/mm/vmscan.c	Sat Jun  3 15:23:41 2000
@@ -34,7 +34,7 @@
  * using a process that no longer actually exists (it might
  * have died while we slept).
  */
-static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
+static int try_to_swap_out(struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
 {
 	pte_t pte;
 	swp_entry_t entry;
@@ -48,9 +48,6 @@
 	if ((page-mem_map >= max_mapnr) || PageReserved(page))
 		goto out_failed;
 
-	if (mm->swap_cnt)
-		mm->swap_cnt--;
-
 	/* Don't look at this pte if it's been accessed recently. */
 	if (pte_young(pte)) {
 		/*
@@ -62,10 +59,6 @@
 		goto out_failed;
 	}
 
-	/* Can only do this if we age all active pages. */
-	if (PageActive(page) && page->age > 1)
-		goto out_failed;
-
 	if (TryLockPage(page))
 		goto out_failed;
 
@@ -78,10 +71,9 @@
 	 * memory, and we should just continue our scan.
 	 */
 	if (PageSwapCache(page)) {
-		if (pte_dirty(pte))
-			SetPageDirty(page);
 		entry.val = page->index;
 		swap_duplicate(entry);
+		lru_cache_unmap(page, LRU_SWAP_CACHE);
 		set_pte(page_table, swp_entry_to_pte(entry));
 drop_pte:
 		UnlockPage(page);
@@ -106,6 +98,8 @@
 	 */
 	if (!pte_dirty(pte)) {
 		flush_cache_page(vma, address);
+		if (page->map_count)
+			lru_cache_unmap(page, LRU_NORMAL_CACHE);
 		pte_clear(page_table);
 		goto drop_pte;
 	}
@@ -119,13 +113,6 @@
 		goto out_unlock;
 
 	/*
-	 * Don't do any of the expensive stuff if
-	 * we're not really interested in this zone.
-	 */
-	if (page->zone->free_pages > page->zone->pages_high)
-		goto out_unlock;
-
-	/*
 	 * Ok, it's really dirty. That means that
 	 * we should either create a new swap cache
 	 * entry for it, or we should write it back
@@ -148,6 +135,8 @@
 	if (vma->vm_ops && (swapout = vma->vm_ops->swapout)) {
 		int error;
 		struct file *file = vma->vm_file;
+		if (page->map_count)
+			lru_cache_unmap(page, LRU_NORMAL_CACHE);
 		if (file) get_file(file);
 		pte_clear(page_table);
 		vma->vm_mm->rss--;
@@ -174,6 +163,11 @@
 
 	if (!(page = prepare_highmem_swapout(page)))
 		goto out_swap_free;
+	if (page->map_count <= 0) {
+		printk("not mapped anonymous page, please report to andrea@suse.de: mapping %p, index %lu, flags %lx, count %d, map_count %d, buffers %p\n", page->mapping, page->index, page->flags, page_count(page), page->map_count, page->buffers);
+		BUG();
+	}
+	page_unmap(page);
 
 	swap_duplicate(entry);	/* One for the process, one for the swap cache */
 
@@ -187,10 +181,7 @@
 	vmlist_access_unlock(vma->vm_mm);
 
 	/* OK, do a physical asynchronous write to swap.  */
-	// rw_swap_page(WRITE, page, 0);
-	/* Let shrink_mmap handle this swapout. */
-	SetPageDirty(page);
-	UnlockPage(page);
+	rw_swap_page(WRITE, page, 0);
 
 out_free_success:
 	page_cache_release(page);
@@ -218,7 +209,7 @@
  * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
  */
 
-static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+static inline int swap_out_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
 {
 	pte_t * pte;
 	unsigned long pmd_end;
@@ -240,18 +231,16 @@
 	do {
 		int result;
 		vma->vm_mm->swap_address = address + PAGE_SIZE;
-		result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
+		result = try_to_swap_out(vma, address, pte, gfp_mask);
 		if (result)
 			return result;
-		if (!mm->swap_cnt)
-			return 0;
 		address += PAGE_SIZE;
 		pte++;
 	} while (address && (address < end));
 	return 0;
 }
 
-static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+static inline int swap_out_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
 {
 	pmd_t * pmd;
 	unsigned long pgd_end;
@@ -271,18 +260,16 @@
 		end = pgd_end;
 	
 	do {
-		int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask);
+		int result = swap_out_pmd(vma, pmd, address, end, gfp_mask);
 		if (result)
 			return result;
-		if (!mm->swap_cnt)
-			return 0;
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
 	} while (address && (address < end));
 	return 0;
 }
 
-static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask)
+static int swap_out_vma(struct vm_area_struct * vma, unsigned long address, int gfp_mask)
 {
 	pgd_t *pgdir;
 	unsigned long end;
@@ -297,11 +284,9 @@
 	if (address >= end)
 		BUG();
 	do {
-		int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask);
+		int result = swap_out_pgd(vma, pgdir, address, end, gfp_mask);
 		if (result)
 			return result;
-		if (!mm->swap_cnt)
-			return 0;
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		pgdir++;
 	} while (address && (address < end));
@@ -329,7 +314,7 @@
 			address = vma->vm_start;
 
 		for (;;) {
-			int result = swap_out_vma(mm, vma, address, gfp_mask);
+			int result = swap_out_vma(vma, address, gfp_mask);
 			if (result)
 				return result;
 			vma = vma->vm_next;
@@ -356,6 +341,7 @@
 	struct task_struct * p;
 	int counter;
 	int __ret = 0;
+	int assign = 0;
 
 	lock_kernel();
 	/* 
@@ -372,7 +358,7 @@
 	 * Think of swap_cnt as a "shadow rss" - it tells us which process
 	 * we want to page out (always try largest first).
 	 */
-	counter = (nr_threads << 2) >> (priority >> 2);
+	counter = nr_threads / (priority+1);
 	if (counter < 1)
 		counter = 1;
 
@@ -380,7 +366,6 @@
 		unsigned long max_cnt = 0;
 		struct mm_struct *best = NULL;
 		int pid = 0;
-		int assign = 0;
 	select:
 		read_lock(&tasklist_lock);
 		p = init_task.next_task;
@@ -400,6 +385,8 @@
 			}
 		}
 		read_unlock(&tasklist_lock);
+		if (assign == 1)
+			assign = 2;
 		if (!best) {
 			if (!assign) {
 				assign = 1;
@@ -432,75 +419,118 @@
  * now we need this so that we can do page allocations
  * without holding the kernel lock etc.
  *
- * We want to try to free "count" pages, and we want to 
- * cluster them so that we get good swap-out behaviour.
- *
- * Don't try _too_ hard, though. We don't want to have bad
- * latency.
+ * We want to try to free "count" pages, and we need to 
+ * cluster them so that we get good swap-out behaviour. See
+ * the "free_memory()" macro for details.
  */
-#define FREE_COUNT	8
-#define SWAP_COUNT	16
-static int do_try_to_free_pages(unsigned int gfp_mask)
+static int do_try_to_free_pages(unsigned int gfp_mask, zone_t *zone)
 {
 	int priority;
-	int count = FREE_COUNT;
-	int swap_count;
+	int count = SWAP_CLUSTER_MAX;
 
 	/* Always trim SLAB caches when memory gets low. */
 	kmem_cache_reap(gfp_mask);
 
-	priority = 64;
+	priority = 6;
 	do {
-		while (shrink_mmap(priority, gfp_mask)) {
+		while (shrink_mmap(priority, zone)) {
 			if (!--count)
 				goto done;
 		}
 
 
+		/*
+		 * don't go too easy on the d/i cache, since
+		 * shrink_mmap() almost never fails when there's
+		 * really plenty of memory free.
+		 */
+		count -= shrink_dcache_memory(priority, gfp_mask, zone);
+		count -= shrink_icache_memory(priority, gfp_mask, zone);
+		if (count <= 0)
+			goto done;
+
 		/* Try to get rid of some shared memory pages.. */
-		if (gfp_mask & __GFP_IO) {
-			/*
-			 * don't be too light against the d/i cache since
-		   	 * shrink_mmap() almost never fail when there's
-		   	 * really plenty of memory free. 
-			 */
-			count -= shrink_dcache_memory(priority, gfp_mask);
-			count -= shrink_icache_memory(priority, gfp_mask);
-			if (count <= 0)
+		while (shm_swap(priority, gfp_mask, zone)) {
+			if (!--count)
 				goto done;
-			while (shm_swap(priority, gfp_mask)) {
-				if (!--count)
-					goto done;
-			}
 		}
 
-		/*
-		 * Then, try to page stuff out..
-		 *
-		 * This will not actually free any pages (they get
-		 * put in the swap cache), so we must not count this
-		 * as a "count" success.
-		 */
-		swap_count = SWAP_COUNT;
-		while (swap_out(priority, gfp_mask))
-			if (--swap_count < 0)
-				break;
-
+		/* Then, try to page stuff out.. */
+		while (swap_out(priority, gfp_mask)) {
+			if (!--count)
+				goto done;
+		}
 	} while (--priority >= 0);
+done:
 
-	/* Always end on a shrink_mmap.. */
-	while (shrink_mmap(0, gfp_mask)) {
-		if (!--count)
-			goto done;
+	return priority >= 0;
+}
+
+DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
+
+static int kswapd_work_pgdat(pg_data_t * pgdat)
+{
+	int worked = 0, i;
+	zone_t * zone;
+
+	for (i = pgdat->nr_zones-1; i >= 0; i--) {
+		zone = pgdat->node_zones + i;
+		if (current->need_resched)
+			schedule();
+		if (!zone->zone_wake_kswapd)
+			continue;
+		if (!do_try_to_free_pages(GFP_KSWAPD, zone)) {
+			zone->zone_wake_kswapd = 0;
+			continue;
+		}
+		worked = 1;
+	}
+
+	return worked;
+}
+
+static void kswapd_work(void)
+{
+	int worked;
+	pg_data_t * pgdat;
+
+	do {
+		worked = 0;
+		pgdat = pgdat_list;
+		do
+			worked |= kswapd_work_pgdat(pgdat);
+		while ((pgdat = pgdat->node_next));
+	} while (worked);
+}
+
+static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
+{
+	zone_t * zone;
+	int i;
+
+	for (i = pgdat->nr_zones-1; i >= 0; i--) {
+		zone = pgdat->node_zones + i;
+		if (!zone->zone_wake_kswapd)
+			continue;
+		return 0;
 	}
-	/* We return 1 if we are freed some page */
-	return (count != FREE_COUNT);
 
-done:
 	return 1;
 }
 
-DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
+static int kswapd_can_sleep(void)
+{
+	pg_data_t * pgdat;
+
+	pgdat = pgdat_list;
+	do {
+		if (kswapd_can_sleep_pgdat(pgdat))
+			continue;
+		return 0;
+	} while ((pgdat = pgdat->node_next));
+
+	return 1;
+}
 
 /*
  * The background pageout daemon, started as a kernel thread
@@ -518,11 +548,13 @@
 int kswapd(void *unused)
 {
 	struct task_struct *tsk = current;
+	wait_queue_t wait;
 
 	tsk->session = 1;
 	tsk->pgrp = 1;
 	strcpy(tsk->comm, "kswapd");
 	sigfillset(&tsk->blocked);
+	init_waitqueue_entry(&wait, tsk);
 	
 	/*
 	 * Tell the memory management that we're a "memory allocator",
@@ -538,30 +570,23 @@
 	 */
 	tsk->flags |= PF_MEMALLOC;
 
-	for (;;) {
-		pg_data_t *pgdat;
-		int something_to_do = 0;
+	while (1) {
+		/*
+		 * If we actually get into a low-memory situation,
+		 * the processes needing more memory will wake us
+		 * up on a more timely basis.
+		 */
+		kswapd_work();
+		run_task_queue(&tq_disk);
 
-		pgdat = pgdat_list;
-		do {
-			int i;
-			for(i = 0; i < MAX_NR_ZONES; i++) {
-				zone_t *zone = pgdat->node_zones+ i;
-				if (tsk->need_resched)
-					schedule();
-				if (!zone->size || !zone->zone_wake_kswapd)
-					continue;
-				if (zone->free_pages < zone->pages_low)
-					something_to_do = 1;
-				do_try_to_free_pages(GFP_KSWAPD);
-			}
-			pgdat = pgdat->node_next;
-		} while (pgdat);
+		__set_current_state(TASK_INTERRUPTIBLE);
+		add_wait_queue(&kswapd_wait, &wait);
 
-		if (!something_to_do) {
-			tsk->state = TASK_INTERRUPTIBLE;
-			interruptible_sleep_on(&kswapd_wait);
-		}
+		if (kswapd_can_sleep())
+			schedule();
+
+		__set_current_state(TASK_RUNNING);
+		remove_wait_queue(&kswapd_wait, &wait);
 	}
 }
 
@@ -580,13 +605,13 @@
  * can be done by just dropping cached pages without having
  * any deadlock issues.
  */
-int try_to_free_pages(unsigned int gfp_mask)
+int try_to_free_pages(unsigned int gfp_mask, zone_t *zone)
 {
 	int retval = 1;
 
 	if (gfp_mask & __GFP_WAIT) {
 		current->flags |= PF_MEMALLOC;
-		retval = do_try_to_free_pages(gfp_mask);
+		retval = do_try_to_free_pages(gfp_mask, zone);
 		current->flags &= ~PF_MEMALLOC;
 	}
 	return retval;
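
Note on the reworked do_try_to_free_pages(): the FREE_COUNT/SWAP_COUNT pair is replaced by a single SWAP_CLUSTER_MAX budget that all reclaim sources (shrink_mmap, d/i cache shrinking, shm_swap, swap_out) consume at decreasing priority, and the function reports success only if the budget was used up before priority hit -1. A stand-alone sketch of just that control flow, with stub shrinkers in place of the real ones:

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32

/* stub reclaim sources: pretend the page cache has a limited amount of easy work */
static int easy_cache_pages = 40;

static int stub_shrink_mmap(int priority)
{
	(void)priority;
	if (easy_cache_pages > 0) {
		easy_cache_pages--;
		return 1;		/* freed one page */
	}
	return 0;
}

static int stub_shrink_caches(int priority) { return priority < 3 ? 2 : 0; }
static int stub_swap_out(int priority)      { (void)priority; return 0; }

/* same shape as the patched do_try_to_free_pages(): one budget, falling priority */
static int try_to_free_pages_model(void)
{
	int count = SWAP_CLUSTER_MAX;
	int priority = 6;

	do {
		while (stub_shrink_mmap(priority))
			if (!--count)
				goto done;

		count -= stub_shrink_caches(priority);
		if (count <= 0)
			goto done;

		while (stub_swap_out(priority))
			if (!--count)
				goto done;
	} while (--priority >= 0);
done:
	return priority >= 0;	/* 1 = freed the full cluster, 0 = gave up */
}

int main(void)
{
	printf("freed full cluster: %d\n", try_to_free_pages_model());
	return 0;
}

kswapd_work_pgdat() relies on exactly this return value: as long as do_try_to_free_pages() keeps returning 1 the zone stays marked, and the first failure clears zone_wake_kswapd.
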