4 files changed, 884 insertions, 0 deletions
diff --git a/arch/alpha/mm/Makefile b/arch/alpha/mm/Makefile
new file mode 100644
index 000000000..5a9807936
--- /dev/null
+++ b/arch/alpha/mm/Makefile
@@ -0,0 +1,9 @@
+#
+# Makefile for the linux alpha-specific parts of the memory manager.
+#
+
+ccflags-y := -Werror
+
+obj-y	:= init.o fault.o
+
+obj-$(CONFIG_DISCONTIGMEM) += numa.o
diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c
new file mode 100644
index 000000000..188fc9256
--- /dev/null
+++ b/arch/alpha/mm/fault.c
@@ -0,0 +1,252 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  linux/arch/alpha/mm/fault.c
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ */
+
+#include <linux/sched/signal.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <asm/io.h>
+
+#define __EXTERN_INLINE inline
+#include <asm/mmu_context.h>
+#include <asm/tlbflush.h>
+#undef  __EXTERN_INLINE
+
+#include <linux/signal.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/extable.h>
+#include <linux/uaccess.h>
+
+extern void die_if_kernel(char *,struct pt_regs *,long, unsigned long *);
+
+
+/*
+ * Force a new ASN for a task.
+ */
+
+#ifndef CONFIG_SMP
+unsigned long last_asn = ASN_FIRST_VERSION;
+#endif
+
+void
+__load_new_mm_context(struct mm_struct *next_mm)
+{
+	unsigned long mmc;
+	struct pcb_struct *pcb;
+
+	mmc = __get_new_mm_context(next_mm, smp_processor_id());
+	next_mm->context[smp_processor_id()] = mmc;
+
+	pcb = &current_thread_info()->pcb;
+	pcb->asn = mmc & HARDWARE_ASN_MASK;
+	pcb->ptbr = ((unsigned long) next_mm->pgd - IDENT_ADDR) >> PAGE_SHIFT;
+
+	__reload_thread(pcb);
+}
+
+
+/*
+ * This routine handles page faults.  It determines the address,
+ * and the problem, and then passes it off to handle_mm_fault().
+ *
+ * mmcsr:
+ *	0 = translation not valid
+ *	1 = access violation
+ *	2 = fault-on-read
+ *	3 = fault-on-execute
+ *	4 = fault-on-write
+ *
+ * cause:
+ *	-1 = instruction fetch
+ *	0 = load
+ *	1 = store
+ *
+ * Registers $9 through $15 are saved in a block just prior to `regs' and
+ * are saved and restored around the call to allow exception code to
+ * modify them.
+ */
+
+/* Macro for exception fixup code to access integer registers.  */
+#define dpf_reg(r)							\
+	(((unsigned long *)regs)[(r) <= 8 ? (r) : (r) <= 15 ? (r)-16 :	\
+				 (r) <= 18 ? (r)+10 : (r)-10])
+
+asmlinkage void
+do_page_fault(unsigned long address, unsigned long mmcsr,
+	      long cause, struct pt_regs *regs)
+{
+	struct vm_area_struct * vma;
+	struct mm_struct *mm = current->mm;
+	const struct exception_table_entry *fixup;
+	int si_code = SEGV_MAPERR;
+	vm_fault_t fault;
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+
+	/* As of EV6, a load into $31/$f31 is a prefetch, and never faults
+	   (or is suppressed by the PALcode).  Support that for older CPUs
+	   by ignoring such an instruction.  */
+	if (cause == 0) {
+		unsigned int insn;
+		__get_user(insn, (unsigned int __user *)regs->pc);
+		if ((insn >> 21 & 0x1f) == 0x1f &&
+		    /* ldq ldl ldt lds ldg ldf ldwu ldbu */
+		    (1ul << (insn >> 26) & 0x30f00001400ul)) {
+			regs->pc += 4;
+			return;
+		}
+	}
+
+	/* If we're in an interrupt context, or have no user context,
+	   we must not take the fault.  */
+	if (!mm || faulthandler_disabled())
+		goto no_context;
+
+#ifdef CONFIG_ALPHA_LARGE_VMALLOC
+	if (address >= TASK_SIZE)
+		goto vmalloc_fault;
+#endif
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
+retry:
+	down_read(&mm->mmap_sem);
+	vma = find_vma(mm, address);
+	if (!vma)
+		goto bad_area;
+	if (vma->vm_start <= address)
+		goto good_area;
+	if (!(vma->vm_flags & VM_GROWSDOWN))
+		goto bad_area;
+	if (expand_stack(vma, address))
+		goto bad_area;
+
+	/* Ok, we have a good vm_area for this memory access, so
+	   we can handle it.  */
+ good_area:
+	si_code = SEGV_ACCERR;
+	if (cause < 0) {
+		if (!(vma->vm_flags & VM_EXEC))
+			goto bad_area;
+	} else if (!cause) {
+		/* Allow reads even for write-only mappings */
+		if (!(vma->vm_flags & (VM_READ | VM_WRITE)))
+			goto bad_area;
+	} else {
+		if (!(vma->vm_flags & VM_WRITE))
+			goto bad_area;
+		flags |= FAULT_FLAG_WRITE;
+	}
+
+	/* If for any reason at all we couldn't handle the fault,
+	   make sure we exit gracefully rather than endlessly redo
+	   the fault.  */
+	fault = handle_mm_fault(vma, address, flags);
+
+	if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
+		return;
+
+	if (unlikely(fault & VM_FAULT_ERROR)) {
+		if (fault & VM_FAULT_OOM)
+			goto out_of_memory;
+		else if (fault & VM_FAULT_SIGSEGV)
+			goto bad_area;
+		else if (fault & VM_FAULT_SIGBUS)
+			goto do_sigbus;
+		BUG();
+	}
+
+	if (flags & FAULT_FLAG_ALLOW_RETRY) {
+		if (fault & VM_FAULT_MAJOR)
+			current->maj_flt++;
+		else
+			current->min_flt++;
+		if (fault & VM_FAULT_RETRY) {
+			flags &= ~FAULT_FLAG_ALLOW_RETRY;
+
+			 /* No need to up_read(&mm->mmap_sem) as we would
+			 * have already released it in __lock_page_or_retry
+			 * in mm/filemap.c.
+			 */
+
+			goto retry;
+		}
+	}
+
+	up_read(&mm->mmap_sem);
+
+	return;
+
+	/* Something tried to access memory that isn't in our memory map.
+	   Fix it, but check if it's kernel or user first.  */
+ bad_area:
+	up_read(&mm->mmap_sem);
+
+	if (user_mode(regs))
+		goto do_sigsegv;
+
+ no_context:
+	/* Are we prepared to handle this fault as an exception?  */
+	if ((fixup = search_exception_tables(regs->pc)) != 0) {
+		unsigned long newpc;
+		newpc = fixup_exception(dpf_reg, fixup, regs->pc);
+		regs->pc = newpc;
+		return;
+	}
+
+	/* Oops. The kernel tried to access some bad page. We'll have to
+	   terminate things with extreme prejudice.  */
+	printk(KERN_ALERT "Unable to handle kernel paging request at "
+	       "virtual address %016lx\n", address);
+	die_if_kernel("Oops", regs, cause, (unsigned long*)regs - 16);
+	do_exit(SIGKILL);
+
+	/* We ran out of memory, or some other thing happened to us that
+	   made us unable to handle the page fault gracefully.  */
+ out_of_memory:
+	up_read(&mm->mmap_sem);
+	if (!user_mode(regs))
+		goto no_context;
+	pagefault_out_of_memory();
+	return;
+
+ do_sigbus:
+	up_read(&mm->mmap_sem);
+	/* Send a sigbus, regardless of whether we were in kernel
+	   or user mode.  */
+	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *) address, 0, current);
+	if (!user_mode(regs))
+		goto no_context;
+	return;
+
+ do_sigsegv:
+	force_sig_fault(SIGSEGV, si_code, (void __user *) address, 0, current);
+	return;
+
+#ifdef CONFIG_ALPHA_LARGE_VMALLOC
+ vmalloc_fault:
+	if (user_mode(regs))
+		goto do_sigsegv;
+	else {
+		/* Synchronize this task's top level page-table
+		   with the "reference" page table from init.  */
+		long index = pgd_index(address);
+		pgd_t *pgd, *pgd_k;
+
+		pgd = current->active_mm->pgd + index;
+		pgd_k = swapper_pg_dir + index;
+		if (!pgd_present(*pgd) && pgd_present(*pgd_k)) {
+			pgd_val(*pgd) = pgd_val(*pgd_k);
+			return;
+		}
+		goto no_context;
+	}
+#endif
+}
diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c
new file mode 100644
index 000000000..9d7452029
--- /dev/null
+++ b/arch/alpha/mm/init.c
@@ -0,0 +1,301 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  linux/arch/alpha/mm/init.c
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ */
+
+/* 2.3.x zone allocator, 1999 Andrea Arcangeli <andrea@suse.de> */
+
+#include <linux/pagemap.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/init.h>
+#include <linux/bootmem.h> /* max_low_pfn */
+#include <linux/vmalloc.h>
+#include <linux/gfp.h>
+
+#include <linux/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/hwrpb.h>
+#include <asm/dma.h>
+#include <asm/mmu_context.h>
+#include <asm/console.h>
+#include <asm/tlb.h>
+#include <asm/setup.h>
+#include <asm/sections.h>
+
+extern void die_if_kernel(char *,struct pt_regs *,long);
+
+static struct pcb_struct original_pcb;
+
+pgd_t *
+pgd_alloc(struct mm_struct *mm)
+{
+	pgd_t *ret, *init;
+
+	ret = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+	init = pgd_offset(&init_mm, 0UL);
+	if (ret) {
+#ifdef CONFIG_ALPHA_LARGE_VMALLOC
+		memcpy (ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD,
+			(PTRS_PER_PGD - USER_PTRS_PER_PGD - 1)*sizeof(pgd_t));
+#else
+		pgd_val(ret[PTRS_PER_PGD-2]) = pgd_val(init[PTRS_PER_PGD-2]);
+#endif
+
+		/* The last PGD entry is the VPTB self-map.  */
+		pgd_val(ret[PTRS_PER_PGD-1])
+		  = pte_val(mk_pte(virt_to_page(ret), PAGE_KERNEL));
+	}
+	return ret;
+}
+
+
+/*
+ * BAD_PAGE is the page that is used for page faults when linux
+ * is out-of-memory. Older versions of linux just did a
+ * do_exit(), but using this instead means there is less risk
+ * for a process dying in kernel mode, possibly leaving an inode
+ * unused etc..
+ *
+ * BAD_PAGETABLE is the accompanying page-table: it is initialized
+ * to point to BAD_PAGE entries.
+ *
+ * ZERO_PAGE is a special page that is used for zero-initialized
+ * data and COW.
+ */
+pmd_t *
+__bad_pagetable(void)
+{
+	memset((void *) EMPTY_PGT, 0, PAGE_SIZE);
+	return (pmd_t *) EMPTY_PGT;
+}
+
+pte_t
+__bad_page(void)
+{
+	memset((void *) EMPTY_PGE, 0, PAGE_SIZE);
+	return pte_mkdirty(mk_pte(virt_to_page(EMPTY_PGE), PAGE_SHARED));
+}
+
+static inline unsigned long
+load_PCB(struct pcb_struct *pcb)
+{
+	register unsigned long sp __asm__("$30");
+	pcb->ksp = sp;
+	return __reload_thread(pcb);
+}
+
+/* Set up initial PCB, VPTB, and other such nicities.  */
+
+static inline void
+switch_to_system_map(void)
+{
+	unsigned long newptbr;
+	unsigned long original_pcb_ptr;
+
+	/* Initialize the kernel's page tables.  Linux puts the vptb in
+	   the last slot of the L1 page table.  */
+	memset(swapper_pg_dir, 0, PAGE_SIZE);
+	newptbr = ((unsigned long) swapper_pg_dir - PAGE_OFFSET) >> PAGE_SHIFT;
+	pgd_val(swapper_pg_dir[1023]) =
+		(newptbr << 32) | pgprot_val(PAGE_KERNEL);
+
+	/* Set the vptb.  This is often done by the bootloader, but 
+	   shouldn't be required.  */
+	if (hwrpb->vptb != 0xfffffffe00000000UL) {
+		wrvptptr(0xfffffffe00000000UL);
+		hwrpb->vptb = 0xfffffffe00000000UL;
+		hwrpb_update_checksum(hwrpb);
+	}
+
+	/* Also set up the real kernel PCB while we're at it.  */
+	init_thread_info.pcb.ptbr = newptbr;
+	init_thread_info.pcb.flags = 1;	/* set FEN, clear everything else */
+	original_pcb_ptr = load_PCB(&init_thread_info.pcb);
+	tbia();
+
+	/* Save off the contents of the original PCB so that we can
+	   restore the original console's page tables for a clean reboot.
+
+	   Note that the PCB is supposed to be a physical address, but
+	   since KSEG values also happen to work, folks get confused.
+	   Check this here.  */
+
+	if (original_pcb_ptr < PAGE_OFFSET) {
+		original_pcb_ptr = (unsigned long)
+			phys_to_virt(original_pcb_ptr);
+	}
+	original_pcb = *(struct pcb_struct *) original_pcb_ptr;
+}
+
+int callback_init_done;
+
+void * __init
+callback_init(void * kernel_end)
+{
+	struct crb_struct * crb;
+	pgd_t *pgd;
+	pmd_t *pmd;
+	void *two_pages;
+
+	/* Starting at the HWRPB, locate the CRB. */
+	crb = (struct crb_struct *)((char *)hwrpb + hwrpb->crb_offset);
+
+	if (alpha_using_srm) {
+		/* Tell the console whither it is to be remapped. */
+		if (srm_fixup(VMALLOC_START, (unsigned long)hwrpb))
+			__halt();		/* "We're boned."  --Bender */
+
+		/* Edit the procedure descriptors for DISPATCH and FIXUP. */
+		crb->dispatch_va = (struct procdesc_struct *)
+			(VMALLOC_START + (unsigned long)crb->dispatch_va
+			 - crb->map[0].va);
+		crb->fixup_va = (struct procdesc_struct *)
+			(VMALLOC_START + (unsigned long)crb->fixup_va
+			 - crb->map[0].va);
+	}
+
+	switch_to_system_map();
+
+	/* Allocate one PGD and one PMD.  In the case of SRM, we'll need
+	   these to actually remap the console.  There is an assumption
+	   here that only one of each is needed, and this allows for 8MB.
+	   On systems with larger consoles, additional pages will be
+	   allocated as needed during the mapping process.
+
+	   In the case of not SRM, but not CONFIG_ALPHA_LARGE_VMALLOC,
+	   we need to allocate the PGD we use for vmalloc before we start
+	   forking other tasks.  */
+
+	two_pages = (void *)
+	  (((unsigned long)kernel_end + ~PAGE_MASK) & PAGE_MASK);
+	kernel_end = two_pages + 2*PAGE_SIZE;
+	memset(two_pages, 0, 2*PAGE_SIZE);
+
+	pgd = pgd_offset_k(VMALLOC_START);
+	pgd_set(pgd, (pmd_t *)two_pages);
+	pmd = pmd_offset(pgd, VMALLOC_START);
+	pmd_set(pmd, (pte_t *)(two_pages + PAGE_SIZE));
+
+	if (alpha_using_srm) {
+		static struct vm_struct console_remap_vm;
+		unsigned long nr_pages = 0;
+		unsigned long vaddr;
+		unsigned long i, j;
+
+		/* calculate needed size */
+		for (i = 0; i < crb->map_entries; ++i)
+			nr_pages += crb->map[i].count;
+
+		/* register the vm area */
+		console_remap_vm.flags = VM_ALLOC;
+		console_remap_vm.size = nr_pages << PAGE_SHIFT;
+		vm_area_register_early(&console_remap_vm, PAGE_SIZE);
+
+		vaddr = (unsigned long)console_remap_vm.addr;
+
+		/* Set up the third level PTEs and update the virtual
+		   addresses of the CRB entries.  */
+		for (i = 0; i < crb->map_entries; ++i) {
+			unsigned long pfn = crb->map[i].pa >> PAGE_SHIFT;
+			crb->map[i].va = vaddr;
+			for (j = 0; j < crb->map[i].count; ++j) {
+				/* Newer consoles (especially on larger
+				   systems) may require more pages of
+				   PTEs. Grab additional pages as needed. */
+				if (pmd != pmd_offset(pgd, vaddr)) {
+					memset(kernel_end, 0, PAGE_SIZE);
+					pmd = pmd_offset(pgd, vaddr);
+					pmd_set(pmd, (pte_t *)kernel_end);
+					kernel_end += PAGE_SIZE;
+				}
+				set_pte(pte_offset_kernel(pmd, vaddr),
+					pfn_pte(pfn, PAGE_KERNEL));
+				pfn++;
+				vaddr += PAGE_SIZE;
+			}
+		}
+	}
+
+	callback_init_done = 1;
+	return kernel_end;
+}
+
+
+#ifndef CONFIG_DISCONTIGMEM
+/*
+ * paging_init() sets up the memory map.
+ */
+void __init paging_init(void)
+{
+	unsigned long zones_size[MAX_NR_ZONES] = {0, };
+	unsigned long dma_pfn, high_pfn;
+
+	dma_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+	high_pfn = max_pfn = max_low_pfn;
+
+	if (dma_pfn >= high_pfn)
+		zones_size[ZONE_DMA] = high_pfn;
+	else {
+		zones_size[ZONE_DMA] = dma_pfn;
+		zones_size[ZONE_NORMAL] = high_pfn - dma_pfn;
+	}
+
+	/* Initialize mem_map[].  */
+	free_area_init(zones_size);
+
+	/* Initialize the kernel's ZERO_PGE. */
+	memset((void *)ZERO_PGE, 0, PAGE_SIZE);
+}
+#endif /* CONFIG_DISCONTIGMEM */
+
+#if defined(CONFIG_ALPHA_GENERIC) || defined(CONFIG_ALPHA_SRM)
+void
+srm_paging_stop (void)
+{
+	/* Move the vptb back to where the SRM console expects it.  */
+	swapper_pg_dir[1] = swapper_pg_dir[1023];
+	tbia();
+	wrvptptr(0x200000000UL);
+	hwrpb->vptb = 0x200000000UL;
+	hwrpb_update_checksum(hwrpb);
+
+	/* Reload the page tables that the console had in use.  */
+	load_PCB(&original_pcb);
+	tbia();
+}
+#endif
+
+void __init
+mem_init(void)
+{
+	set_max_mapnr(max_low_pfn);
+	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
+	free_all_bootmem();
+	mem_init_print_info(NULL);
+}
+
+void
+free_initmem(void)
+{
+	free_initmem_default(-1);
+}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+void
+free_initrd_mem(unsigned long start, unsigned long end)
+{
+	free_reserved_area((void *)start, (void *)end, -1, "initrd");
+}
+#endif
diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c
new file mode 100644
index 000000000..a9e86475f
--- /dev/null
+++ b/arch/alpha/mm/numa.c
@@ -0,0 +1,322 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  linux/arch/alpha/mm/numa.c
+ *
+ *  DISCONTIGMEM NUMA alpha support.
+ *
+ *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/bootmem.h>
+#include <linux/swap.h>
+#include <linux/initrd.h>
+#include <linux/pfn.h>
+#include <linux/module.h>
+
+#include <asm/hwrpb.h>
+#include <asm/pgalloc.h>
+#include <asm/sections.h>
+
+pg_data_t node_data[MAX_NUMNODES];
+EXPORT_SYMBOL(node_data);
+
+#undef DEBUG_DISCONTIG
+#ifdef DEBUG_DISCONTIG
+#define DBGDCONT(args...) printk(args)
+#else
+#define DBGDCONT(args...)
+#endif
+
+#define for_each_mem_cluster(memdesc, _cluster, i)		\
+	for ((_cluster) = (memdesc)->cluster, (i) = 0;		\
+	     (i) < (memdesc)->numclusters; (i)++, (_cluster)++)
+
+static void __init show_mem_layout(void)
+{
+	struct memclust_struct * cluster;
+	struct memdesc_struct * memdesc;
+	int i;
+
+	/* Find free clusters, and init and free the bootmem accordingly.  */
+	memdesc = (struct memdesc_struct *)
+	  (hwrpb->mddt_offset + (unsigned long) hwrpb);
+
+	printk("Raw memory layout:\n");
+	for_each_mem_cluster(memdesc, cluster, i) {
+		printk(" memcluster %2d, usage %1lx, start %8lu, end %8lu\n",
+		       i, cluster->usage, cluster->start_pfn,
+		       cluster->start_pfn + cluster->numpages);
+	}
+}
+
+static void __init
+setup_memory_node(int nid, void *kernel_end)
+{
+	extern unsigned long mem_size_limit;
+	struct memclust_struct * cluster;
+	struct memdesc_struct * memdesc;
+	unsigned long start_kernel_pfn, end_kernel_pfn;
+	unsigned long bootmap_size, bootmap_pages, bootmap_start;
+	unsigned long start, end;
+	unsigned long node_pfn_start, node_pfn_end;
+	unsigned long node_min_pfn, node_max_pfn;
+	int i;
+	unsigned long node_datasz = PFN_UP(sizeof(pg_data_t));
+	int show_init = 0;
+
+	/* Find the bounds of current node */
+	node_pfn_start = (node_mem_start(nid)) >> PAGE_SHIFT;
+	node_pfn_end = node_pfn_start + (node_mem_size(nid) >> PAGE_SHIFT);
+	
+	/* Find free clusters, and init and free the bootmem accordingly.  */
+	memdesc = (struct memdesc_struct *)
+	  (hwrpb->mddt_offset + (unsigned long) hwrpb);
+
+	/* find the bounds of this node (node_min_pfn/node_max_pfn) */
+	node_min_pfn = ~0UL;
+	node_max_pfn = 0UL;
+	for_each_mem_cluster(memdesc, cluster, i) {
+		/* Bit 0 is console/PALcode reserved.  Bit 1 is
+		   non-volatile memory -- we might want to mark
+		   this for later.  */
+		if (cluster->usage & 3)
+			continue;
+
+		start = cluster->start_pfn;
+		end = start + cluster->numpages;
+
+		if (start >= node_pfn_end || end <= node_pfn_start)
+			continue;
+
+		if (!show_init) {
+			show_init = 1;
+			printk("Initializing bootmem allocator on Node ID %d\n", nid);
+		}
+		printk(" memcluster %2d, usage %1lx, start %8lu, end %8lu\n",
+		       i, cluster->usage, cluster->start_pfn,
+		       cluster->start_pfn + cluster->numpages);
+
+		if (start < node_pfn_start)
+			start = node_pfn_start;
+		if (end > node_pfn_end)
+			end = node_pfn_end;
+
+		if (start < node_min_pfn)
+			node_min_pfn = start;
+		if (end > node_max_pfn)
+			node_max_pfn = end;
+	}
+
+	if (mem_size_limit && node_max_pfn > mem_size_limit) {
+		static int msg_shown = 0;
+		if (!msg_shown) {
+			msg_shown = 1;
+			printk("setup: forcing memory size to %ldK (from %ldK).\n",
+			       mem_size_limit << (PAGE_SHIFT - 10),
+			       node_max_pfn    << (PAGE_SHIFT - 10));
+		}
+		node_max_pfn = mem_size_limit;
+	}
+
+	if (node_min_pfn >= node_max_pfn)
+		return;
+
+	/* Update global {min,max}_low_pfn from node information. */
+	if (node_min_pfn < min_low_pfn)
+		min_low_pfn = node_min_pfn;
+	if (node_max_pfn > max_low_pfn)
+		max_pfn = max_low_pfn = node_max_pfn;
+
+#if 0 /* we'll try this one again in a little while */
+	/* Cute trick to make sure our local node data is on local memory */
+	node_data[nid] = (pg_data_t *)(__va(node_min_pfn << PAGE_SHIFT));
+#endif
+	/* Quasi-mark the pg_data_t as in-use */
+	node_min_pfn += node_datasz;
+	if (node_min_pfn >= node_max_pfn) {
+		printk(" not enough mem to reserve NODE_DATA");
+		return;
+	}
+	NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
+
+	printk(" Detected node memory:   start %8lu, end %8lu\n",
+	       node_min_pfn, node_max_pfn);
+
+	DBGDCONT(" DISCONTIG: node_data[%d]   is at 0x%p\n", nid, NODE_DATA(nid));
+	DBGDCONT(" DISCONTIG: NODE_DATA(%d)->bdata is at 0x%p\n", nid, NODE_DATA(nid)->bdata);
+
+	/* Find the bounds of kernel memory.  */
+	start_kernel_pfn = PFN_DOWN(KERNEL_START_PHYS);
+	end_kernel_pfn = PFN_UP(virt_to_phys(kernel_end));
+	bootmap_start = -1;
+
+	if (!nid && (node_max_pfn < end_kernel_pfn || node_min_pfn > start_kernel_pfn))
+		panic("kernel loaded out of ram");
+
+	/* Zone start phys-addr must be 2^(MAX_ORDER-1) aligned.
+	   Note that we round this down, not up - node memory
+	   has much larger alignment than 8Mb, so it's safe. */
+	node_min_pfn &= ~((1UL << (MAX_ORDER-1))-1);
+
+	/* We need to know how many physically contiguous pages
+	   we'll need for the bootmap.  */
+	bootmap_pages = bootmem_bootmap_pages(node_max_pfn-node_min_pfn);
+
+	/* Now find a good region where to allocate the bootmap.  */
+	for_each_mem_cluster(memdesc, cluster, i) {
+		if (cluster->usage & 3)
+			continue;
+
+		start = cluster->start_pfn;
+		end = start + cluster->numpages;
+
+		if (start >= node_max_pfn || end <= node_min_pfn)
+			continue;
+
+		if (end > node_max_pfn)
+			end = node_max_pfn;
+		if (start < node_min_pfn)
+			start = node_min_pfn;
+
+		if (start < start_kernel_pfn) {
+			if (end > end_kernel_pfn
+			    && end - end_kernel_pfn >= bootmap_pages) {
+				bootmap_start = end_kernel_pfn;
+				break;
+			} else if (end > start_kernel_pfn)
+				end = start_kernel_pfn;
+		} else if (start < end_kernel_pfn)
+			start = end_kernel_pfn;
+		if (end - start >= bootmap_pages) {
+			bootmap_start = start;
+			break;
+		}
+	}
+
+	if (bootmap_start == -1)
+		panic("couldn't find a contiguous place for the bootmap");
+
+	/* Allocate the bootmap and mark the whole MM as reserved.  */
+	bootmap_size = init_bootmem_node(NODE_DATA(nid), bootmap_start,
+					 node_min_pfn, node_max_pfn);
+	DBGDCONT(" bootmap_start %lu, bootmap_size %lu, bootmap_pages %lu\n",
+		 bootmap_start, bootmap_size, bootmap_pages);
+
+	/* Mark the free regions.  */
+	for_each_mem_cluster(memdesc, cluster, i) {
+		if (cluster->usage & 3)
+			continue;
+
+		start = cluster->start_pfn;
+		end = cluster->start_pfn + cluster->numpages;
+
+		if (start >= node_max_pfn || end <= node_min_pfn)
+			continue;
+
+		if (end > node_max_pfn)
+			end = node_max_pfn;
+		if (start < node_min_pfn)
+			start = node_min_pfn;
+
+		if (start < start_kernel_pfn) {
+			if (end > end_kernel_pfn) {
+				free_bootmem_node(NODE_DATA(nid), PFN_PHYS(start),
+					     (PFN_PHYS(start_kernel_pfn)
+					      - PFN_PHYS(start)));
+				printk(" freeing pages %ld:%ld\n",
+				       start, start_kernel_pfn);
+				start = end_kernel_pfn;
+			} else if (end > start_kernel_pfn)
+				end = start_kernel_pfn;
+		} else if (start < end_kernel_pfn)
+			start = end_kernel_pfn;
+		if (start >= end)
+			continue;
+
+		free_bootmem_node(NODE_DATA(nid), PFN_PHYS(start), PFN_PHYS(end) - PFN_PHYS(start));
+		printk(" freeing pages %ld:%ld\n", start, end);
+	}
+
+	/* Reserve the bootmap memory.  */
+	reserve_bootmem_node(NODE_DATA(nid), PFN_PHYS(bootmap_start),
+			bootmap_size, BOOTMEM_DEFAULT);
+	printk(" reserving pages %ld:%ld\n", bootmap_start, bootmap_start+PFN_UP(bootmap_size));
+
+	node_set_online(nid);
+}
+
+void __init
+setup_memory(void *kernel_end)
+{
+	int nid;
+
+	show_mem_layout();
+
+	nodes_clear(node_online_map);
+
+	min_low_pfn = ~0UL;
+	max_low_pfn = 0UL;
+	for (nid = 0; nid < MAX_NUMNODES; nid++)
+		setup_memory_node(nid, kernel_end);
+
+#ifdef CONFIG_BLK_DEV_INITRD
+	initrd_start = INITRD_START;
+	if (initrd_start) {
+		extern void *move_initrd(unsigned long);
+
+		initrd_end = initrd_start+INITRD_SIZE;
+		printk("Initial ramdisk at: 0x%p (%lu bytes)\n",
+		       (void *) initrd_start, INITRD_SIZE);
+
+		if ((void *)initrd_end > phys_to_virt(PFN_PHYS(max_low_pfn))) {
+			if (!move_initrd(PFN_PHYS(max_low_pfn)))
+				printk("initrd extends beyond end of memory "
+				       "(0x%08lx > 0x%p)\ndisabling initrd\n",
+				       initrd_end,
+				       phys_to_virt(PFN_PHYS(max_low_pfn)));
+		} else {
+			nid = kvaddr_to_nid(initrd_start);
+			reserve_bootmem_node(NODE_DATA(nid),
+					     virt_to_phys((void *)initrd_start),
+					     INITRD_SIZE, BOOTMEM_DEFAULT);
+		}
+	}
+#endif /* CONFIG_BLK_DEV_INITRD */
+}
+
+void __init paging_init(void)
+{
+	unsigned int    nid;
+	unsigned long   zones_size[MAX_NR_ZONES] = {0, };
+	unsigned long	dma_local_pfn;
+
+	/*
+	 * The old global MAX_DMA_ADDRESS per-arch API doesn't fit
+	 * in the NUMA model, for now we convert it to a pfn and
+	 * we interpret this pfn as a local per-node information.
+	 * This issue isn't very important since none of these machines
+	 * have legacy ISA slots anyways.
+	 */
+	dma_local_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+
+	for_each_online_node(nid) {
+		bootmem_data_t *bdata = &bootmem_node_data[nid];
+		unsigned long start_pfn = bdata->node_min_pfn;
+		unsigned long end_pfn = bdata->node_low_pfn;
+
+		if (dma_local_pfn >= end_pfn - start_pfn)
+			zones_size[ZONE_DMA] = end_pfn - start_pfn;
+		else {
+			zones_size[ZONE_DMA] = dma_local_pfn;
+			zones_size[ZONE_NORMAL] = (end_pfn - start_pfn) - dma_local_pfn;
+		}
+		node_set_state(nid, N_NORMAL_MEMORY);
+		free_area_init_node(nid, zones_size, start_pfn, NULL);
+	}
+
+	/* Initialize the kernel's ZERO_PGE. */
+	memset((void *)ZERO_PGE, 0, PAGE_SIZE);
+}