Module Name:    src
Committed By:   ryo
Date:           Sun Aug 26 18:15:50 UTC 2018

Modified Files:
        src/sys/arch/aarch64/aarch64: aarch64_machdep.c cpu.c cpufunc.c
            genassym.cf locore.S
        src/sys/arch/aarch64/include: cpu.h cpufunc.h
        src/sys/arch/arm/broadcom: bcm283x_platform.c
        src/sys/arch/arm/fdt: cpu_fdt.c psci_fdt.c

Log Message:
add support for multiple CPU clusters.
* pass cpu index as an argument to secondary processors when hatching.
* keep CPU cache configuration per CPU cluster.

Hello big.LITTLE!


To generate a diff of this commit:
cvs rdiff -u -r1.10 -r1.11 src/sys/arch/aarch64/aarch64/aarch64_machdep.c
cvs rdiff -u -r1.5 -r1.6 src/sys/arch/aarch64/aarch64/cpu.c
cvs rdiff -u -r1.2 -r1.3 src/sys/arch/aarch64/aarch64/cpufunc.c
cvs rdiff -u -r1.6 -r1.7 src/sys/arch/aarch64/aarch64/genassym.cf
cvs rdiff -u -r1.19 -r1.20 src/sys/arch/aarch64/aarch64/locore.S
cvs rdiff -u -r1.6 -r1.7 src/sys/arch/aarch64/include/cpu.h
cvs rdiff -u -r1.2 -r1.3 src/sys/arch/aarch64/include/cpufunc.h
cvs rdiff -u -r1.14 -r1.15 src/sys/arch/arm/broadcom/bcm283x_platform.c
cvs rdiff -u -r1.8 -r1.9 src/sys/arch/arm/fdt/cpu_fdt.c
cvs rdiff -u -r1.14 -r1.15 src/sys/arch/arm/fdt/psci_fdt.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/arch/aarch64/aarch64/aarch64_machdep.c
diff -u src/sys/arch/aarch64/aarch64/aarch64_machdep.c:1.10 src/sys/arch/aarch64/aarch64/aarch64_machdep.c:1.11
--- src/sys/arch/aarch64/aarch64/aarch64_machdep.c:1.10	Fri Aug 24 01:59:40 2018
+++ src/sys/arch/aarch64/aarch64/aarch64_machdep.c	Sun Aug 26 18:15:49 2018
@@ -1,4 +1,4 @@
-/* $NetBSD: aarch64_machdep.c,v 1.10 2018/08/24 01:59:40 jmcneill Exp $ */
+/* $NetBSD: aarch64_machdep.c,v 1.11 2018/08/26 18:15:49 ryo Exp $ */
 
 /*-
  * Copyright (c) 2014 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: aarch64_machdep.c,v 1.10 2018/08/24 01:59:40 jmcneill Exp $");
+__KERNEL_RCSID(1, "$NetBSD: aarch64_machdep.c,v 1.11 2018/08/26 18:15:49 ryo Exp $");
 
 #include "opt_arm_debug.h"
 #include "opt_ddb.h"
@@ -168,8 +168,6 @@ initarm_common(vaddr_t kvm_base, vsize_t
 	vaddr_t kernelvmstart;
 	int i;
 
-	aarch64_getcacheinfo();
-
 	cputype = cpu_idnum();	/* for compatible arm */
 
 	kernstart = trunc_page((vaddr_t)__kernel_text);

Index: src/sys/arch/aarch64/aarch64/cpu.c
diff -u src/sys/arch/aarch64/aarch64/cpu.c:1.5 src/sys/arch/aarch64/aarch64/cpu.c:1.6
--- src/sys/arch/aarch64/aarch64/cpu.c:1.5	Mon Aug 20 18:13:56 2018
+++ src/sys/arch/aarch64/aarch64/cpu.c	Sun Aug 26 18:15:49 2018
@@ -1,4 +1,4 @@
-/* $NetBSD: cpu.c,v 1.5 2018/08/20 18:13:56 jmcneill Exp $ */
+/* $NetBSD: cpu.c,v 1.6 2018/08/26 18:15:49 ryo Exp $ */
 
 /*
  * Copyright (c) 2017 Ryo Shimizu <r...@nerv.org>
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: cpu.c,v 1.5 2018/08/20 18:13:56 jmcneill Exp $");
+__KERNEL_RCSID(1, "$NetBSD: cpu.c,v 1.6 2018/08/26 18:15:49 ryo Exp $");
 
 #include "locators.h"
 #include "opt_arm_debug.h"
@@ -58,70 +58,72 @@ __KERNEL_RCSID(1, "$NetBSD: cpu.c,v 1.5 
 
 void cpu_attach(device_t, cpuid_t);
 static void identify_aarch64_model(uint32_t, char *, size_t);
-static void cpu_identify(device_t self, struct cpu_info *, uint32_t, uint64_t);
+static void cpu_identify(device_t self, struct cpu_info *);
 static void cpu_identify1(device_t self, struct cpu_info *);
 static void cpu_identify2(device_t self, struct cpu_info *);
 
 #ifdef MULTIPROCESSOR
 volatile u_int arm_cpu_hatched __cacheline_aligned = 0;
+volatile u_int arm_cpu_hatch_arg __cacheline_aligned;
 volatile uint32_t arm_cpu_mbox __cacheline_aligned = 0;
 u_int arm_cpu_max = 1;
 
-/* stored by secondary processors (available when arm_cpu_hatched) */
-uint32_t cpus_midr[MAXCPUS];
-uint64_t cpus_mpidr[MAXCPUS];
-
 static kmutex_t cpu_hatch_lock;
 #endif /* MULTIPROCESSOR */
 
-/* Our exported CPU info; we can have only one. */
-struct cpu_info cpu_info_store __cacheline_aligned = {
-	.ci_cpl = IPL_HIGH,
-	.ci_curlwp = &lwp0
-};
-
 #ifdef MULTIPROCESSOR
 #define NCPUINFO	MAXCPUS
 #else
 #define NCPUINFO	1
 #endif /* MULTIPROCESSOR */
 
-struct cpu_info *cpu_info[NCPUINFO] = {
-	[0] = &cpu_info_store
+/*
+ * Our exported CPU info;
+ * these will be refered from secondary cpus in the middle of hatching.
+ */
+struct cpu_info cpu_info_store[NCPUINFO] = {
+	[0] = {
+		.ci_cpl = IPL_HIGH,
+		.ci_curlwp = &lwp0
+	}
+};
+
+struct cpu_info *cpu_info[NCPUINFO] __read_mostly = {
+	[0] = &cpu_info_store[0]
 };
 
 void
 cpu_attach(device_t dv, cpuid_t id)
 {
 	struct cpu_info *ci;
+	const int unit = device_unit(dv);
 	uint64_t mpidr;
-	uint32_t midr;
 
-	if (id == 0) {
+	if (unit == 0) {
 		ci = curcpu();
-		midr = reg_midr_el1_read();
-		mpidr = reg_mpidr_el1_read();
+		ci->ci_cpuid = id;
+		cpu_info_store[unit].ci_midr = reg_midr_el1_read();
+		cpu_info_store[unit].ci_mpidr = reg_mpidr_el1_read();
 	} else {
 #ifdef MULTIPROCESSOR
-		KASSERT(cpu_info[id] == NULL);
-		ci = kmem_zalloc(sizeof(*ci), KM_SLEEP);
+		KASSERT(unit < MAXCPUS);
+		ci = &cpu_info_store[unit];
+
 		ci->ci_cpl = IPL_HIGH;
 		ci->ci_cpuid = id;
+		ci->ci_data.cpu_cc_freq = cpu_info_store[0].ci_data.cpu_cc_freq;
+		/* ci_{midr,mpidr} are stored by own cpus when hatching */
 
-		ci->ci_data.cpu_cc_freq = cpu_info[0]->ci_data.cpu_cc_freq;
-		cpu_info[ci->ci_cpuid] = ci;
-		if ((arm_cpu_hatched & (1 << id)) == 0) {
+		cpu_info[ncpu] = ci;
+		if ((arm_cpu_hatched & __BIT(unit)) == 0) {
 			ci->ci_dev = dv;
 			dv->dv_private = ci;
+			ci->ci_index = -1;
 
 			aprint_naive(": disabled\n");
 			aprint_normal(": disabled (unresponsive)\n");
 			return;
 		}
-
-		/* cpus_{midr,mpidr}[id] is stored by secondary processor */
-		midr = cpus_midr[id];
-		mpidr = cpus_mpidr[id];
 #else /* MULTIPROCESSOR */
 		aprint_naive(": disabled\n");
 		aprint_normal(": disabled (uniprocessor kernel)\n");
@@ -129,6 +131,7 @@ cpu_attach(device_t dv, cpuid_t id)
 #endif /* MULTIPROCESSOR */
 	}
 
+	mpidr = ci->ci_mpidr;
 	if (mpidr & MPIDR_MT) {
 		ci->ci_data.cpu_smt_id = __SHIFTOUT(mpidr, MPIDR_AFF0);
 		ci->ci_data.cpu_core_id = __SHIFTOUT(mpidr, MPIDR_AFF1);
@@ -141,9 +144,9 @@ cpu_attach(device_t dv, cpuid_t id)
 	ci->ci_dev = dv;
 	dv->dv_private = ci;
 
-	cpu_identify(ci->ci_dev, ci, midr, mpidr);
+	cpu_identify(ci->ci_dev, ci);
 #ifdef MULTIPROCESSOR
-	if (id != 0) {
+	if (unit != 0) {
 		mi_cpu_attach(ci);
 		return;
 	}
@@ -152,6 +155,8 @@ cpu_attach(device_t dv, cpuid_t id)
 	fpu_attach(ci);
 
 	cpu_identify1(dv, ci);
+	aarch64_getcacheinfo();
+	aarch64_printcacheinfo(dv);
 	cpu_identify2(dv, ci);
 }
 
@@ -196,91 +201,13 @@ identify_aarch64_model(uint32_t cpuid, c
 	snprintf(buf, len, "unknown CPU (ID = 0x%08x)", cpuid);
 }
 
-static int
-prt_cache(device_t self, int level)
-{
-	struct aarch64_cache_info *cinfo;
-	struct aarch64_cache_unit *cunit;
-	u_int purging;
-	int i;
-	const char *cacheable, *cachetype;
-
-	cinfo = &aarch64_cache_info[level];
-
-	if (cinfo->cacheable == CACHE_CACHEABLE_NONE)
-		return -1;
-
-	for (i = 0; i < 2; i++) {
-		switch (cinfo->cacheable) {
-		case CACHE_CACHEABLE_ICACHE:
-			cunit = &cinfo->icache;
-			cacheable = "Instruction";
-			break;
-		case CACHE_CACHEABLE_DCACHE:
-			cunit = &cinfo->dcache;
-			cacheable = "Data";
-			break;
-		case CACHE_CACHEABLE_IDCACHE:
-			if (i == 0) {
-				cunit = &cinfo->icache;
-				cacheable = "Instruction";
-			} else {
-				cunit = &cinfo->dcache;
-				cacheable = "Data";
-			}
-			break;
-		case CACHE_CACHEABLE_UNIFIED:
-			cunit = &cinfo->dcache;
-			cacheable = "Unified";
-			break;
-		default:
-			cunit = &cinfo->dcache;
-			cacheable = "*UNK*";
-			break;
-		}
-
-		switch (cunit->cache_type) {
-		case CACHE_TYPE_VIVT:
-			cachetype = "VIVT";
-			break;
-		case CACHE_TYPE_VIPT:
-			cachetype = "VIPT";
-			break;
-		case CACHE_TYPE_PIPT:
-			cachetype = "PIPT";
-			break;
-		default:
-			cachetype = "*UNK*";
-			break;
-		}
-
-		purging = cunit->cache_purging;
-		aprint_normal_dev(self,
-		    "L%d %dKB/%dB %d-way%s%s%s%s %s %s cache\n",
-		    level + 1,
-		    cunit->cache_size / 1024,
-		    cunit->cache_line_size,
-		    cunit->cache_ways,
-		    (purging & CACHE_PURGING_WT) ? " write-through" : "",
-		    (purging & CACHE_PURGING_WB) ? " write-back" : "",
-		    (purging & CACHE_PURGING_RA) ? " read-allocate" : "",
-		    (purging & CACHE_PURGING_WA) ? " write-allocate" : "",
-		    cachetype, cacheable);
-
-		if (cinfo->cacheable != CACHE_CACHEABLE_IDCACHE)
-			break;
-	}
-
-	return 0;
-}
-
 static void
-cpu_identify(device_t self, struct cpu_info *ci, uint32_t midr, uint64_t mpidr)
+cpu_identify(device_t self, struct cpu_info *ci)
 {
 	char model[128];
 
-	identify_aarch64_model(midr, model, sizeof(model));
-	if (ci->ci_cpuid == 0)
+	identify_aarch64_model(ci->ci_midr, model, sizeof(model));
+	if (ci->ci_index == 0)
 		cpu_setmodel("%s", model);
 
 	aprint_naive("\n");
@@ -292,7 +219,6 @@ cpu_identify(device_t self, struct cpu_i
 static void
 cpu_identify1(device_t self, struct cpu_info *ci)
 {
-	int level;
 	uint32_t ctr, sctlr;	/* for cache */
 
 	/* SCTLR - System Control Register */
@@ -339,11 +265,6 @@ cpu_identify1(device_t self, struct cpu_
 	aprint_normal_dev(self, "Dcache line %ld, Icache line %ld\n",
 	    sizeof(int) << __SHIFTOUT(ctr, CTR_EL0_DMIN_LINE),
 	    sizeof(int) << __SHIFTOUT(ctr, CTR_EL0_IMIN_LINE));
-
-	for (level = 0; level < MAX_CACHE_LEVEL; level++) {
-		if (prt_cache(self, level) < 0)
-			break;
-	}
 }
 
 
@@ -508,7 +429,7 @@ cpu_boot_secondary_processors(void)
 	__asm __volatile ("sev; sev; sev");
 
 	/* wait all cpus have done cpu_hatch() */
-	while (arm_cpu_mbox) {
+	while (membar_consumer(), arm_cpu_mbox & arm_cpu_hatched) {
 		__asm __volatile ("wfe");
 	}
 
@@ -531,6 +452,8 @@ cpu_hatch(struct cpu_info *ci)
 	fpu_attach(ci);
 
 	cpu_identify1(ci->ci_dev, ci);
+	aarch64_getcacheinfo();
+	aarch64_printcacheinfo(ci->ci_dev);
 	cpu_identify2(ci->ci_dev, ci);
 
 	mutex_exit(&cpu_hatch_lock);
@@ -544,8 +467,13 @@ cpu_hatch(struct cpu_info *ci)
 	MD_CPU_HATCH(ci);	/* for non-fdt arch? */
 #endif
 
-	/* clear my bit of arm_cpu_mbox to tell cpu_boot_secondary_processors() */
-	atomic_and_32(&arm_cpu_mbox, ~(1 << ci->ci_cpuid));
+	/*
+	 * clear my bit of arm_cpu_mbox to tell cpu_boot_secondary_processors().
+	 * there are cpu0,1,2,3, and if cpu2 is unresponsive,
+	 * ci_index are each cpu0=0, cpu1=1, cpu2=undef, cpu3=2.
+	 * therefore we have to use device_unit instead of ci_index for mbox.
+	 */
+	atomic_and_32(&arm_cpu_mbox, ~__BIT(device_unit(ci->ci_dev)));
 	__asm __volatile ("sev; sev; sev");
 }
 #endif /* MULTIPROCESSOR */

Index: src/sys/arch/aarch64/aarch64/cpufunc.c
diff -u src/sys/arch/aarch64/aarch64/cpufunc.c:1.2 src/sys/arch/aarch64/aarch64/cpufunc.c:1.3
--- src/sys/arch/aarch64/aarch64/cpufunc.c:1.2	Tue Jul 17 00:30:34 2018
+++ src/sys/arch/aarch64/aarch64/cpufunc.c	Sun Aug 26 18:15:49 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: cpufunc.c,v 1.2 2018/07/17 00:30:34 christos Exp $	*/
+/*	$NetBSD: cpufunc.c,v 1.3 2018/08/26 18:15:49 ryo Exp $	*/
 
 /*
  * Copyright (c) 2017 Ryo Shimizu <r...@nerv.org>
@@ -27,27 +27,31 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: cpufunc.c,v 1.2 2018/07/17 00:30:34 christos Exp $");
+__KERNEL_RCSID(0, "$NetBSD: cpufunc.c,v 1.3 2018/08/26 18:15:49 ryo Exp $");
 
-#include <sys/types.h>
 #include <sys/param.h>
-#include <sys/systm.h>
-#include <aarch64/armreg.h>
+#include <sys/types.h>
+#include <sys/kmem.h>
+
+#include <aarch64/cpu.h>
 #include <aarch64/cpufunc.h>
 
-u_int cputype;	/* compat arm */
+u_int cputype;			/* compat arm */
+u_int arm_dcache_align;		/* compat arm */
+u_int arm_dcache_align_mask;	/* compat arm */
+u_int arm_dcache_maxline;
 
-/* L1-L8 cache info */
-struct aarch64_cache_info aarch64_cache_info[MAX_CACHE_LEVEL];
 u_int aarch64_cache_vindexsize;
 u_int aarch64_cache_prefer_mask;
 
-u_int arm_dcache_minline;
-u_int arm_dcache_align;
-u_int arm_dcache_align_mask;
+/* cache info per cluster. the same cluster has the same cache configuration? */
+#define MAXCPUPACKAGES	MAXCPUS		/* maximum of ci->ci_package_id */
+static struct aarch64_cache_info *aarch64_cacheinfo[MAXCPUPACKAGES];
+
 
 static void
-extract_cacheunit(int level, bool insn, int cachetype)
+extract_cacheunit(int level, bool insn, int cachetype,
+    struct aarch64_cache_info *cacheinfo)
 {
 	struct aarch64_cache_unit *cunit;
 	uint32_t ccsidr;
@@ -60,9 +64,9 @@ extract_cacheunit(int level, bool insn, 
 	ccsidr = reg_ccsidr_el1_read();
 
 	if (insn)
-		cunit = &aarch64_cache_info[level].icache;
+		cunit = &cacheinfo[level].icache;
 	else
-		cunit = &aarch64_cache_info[level].dcache;
+		cunit = &cacheinfo[level].dcache;
 
 	cunit->cache_type = cachetype;
 
@@ -81,13 +85,30 @@ extract_cacheunit(int level, bool insn, 
 	cunit->cache_purging |= (ccsidr & CCSIDR_WA) ? CACHE_PURGING_WA : 0;
 }
 
-int
+void
 aarch64_getcacheinfo(void)
 {
 	uint32_t clidr, ctr;
 	int level, cachetype;
+	struct aarch64_cache_info *cinfo;
+
+	if (cputype == 0)
+		cputype = aarch64_cpuid();
+
+	/* already extract about this cluster? */
+	KASSERT(curcpu()->ci_package_id < MAXCPUPACKAGES);
+	cinfo = aarch64_cacheinfo[curcpu()->ci_package_id];
+	if (cinfo != NULL) {
+		curcpu()->ci_cacheinfo = cinfo;
+		return;
+	}
+
+	cinfo = aarch64_cacheinfo[curcpu()->ci_package_id] =
+	    kmem_zalloc(sizeof(struct aarch64_cache_info) * MAX_CACHE_LEVEL,
+	    KM_NOSLEEP);
+	KASSERT(cinfo != NULL);
+	curcpu()->ci_cacheinfo = cinfo;
 
-	cputype = aarch64_cpuid();
 
 	/*
 	 * CTR - Cache Type Register
@@ -108,9 +129,12 @@ aarch64_getcacheinfo(void)
 		break;
 	}
 
-	arm_dcache_minline = __SHIFTOUT(ctr, CTR_EL0_DMIN_LINE);
-	arm_dcache_align = sizeof(int) << arm_dcache_minline;
-	arm_dcache_align_mask = arm_dcache_align - 1;
+	/* remember maximum alignment */
+	if (arm_dcache_maxline < __SHIFTOUT(ctr, CTR_EL0_DMIN_LINE)) {
+		arm_dcache_maxline = __SHIFTOUT(ctr, CTR_EL0_DMIN_LINE);
+		arm_dcache_align = sizeof(int) << arm_dcache_maxline;
+		arm_dcache_align_mask = arm_dcache_align - 1;
+	}
 
 	/*
 	 * CLIDR -  Cache Level ID Register
@@ -130,27 +154,27 @@ aarch64_getcacheinfo(void)
 			break;
 		case CLIDR_TYPE_ICACHE:
 			cacheable = CACHE_CACHEABLE_ICACHE;
-			extract_cacheunit(level, true, cachetype);
+			extract_cacheunit(level, true, cachetype, cinfo);
 			break;
 		case CLIDR_TYPE_DCACHE:
 			cacheable = CACHE_CACHEABLE_DCACHE;
-			extract_cacheunit(level, false, CACHE_TYPE_PIPT);
+			extract_cacheunit(level, false, CACHE_TYPE_PIPT, cinfo);
 			break;
 		case CLIDR_TYPE_IDCACHE:
 			cacheable = CACHE_CACHEABLE_IDCACHE;
-			extract_cacheunit(level, true, cachetype);
-			extract_cacheunit(level, false, CACHE_TYPE_PIPT);
+			extract_cacheunit(level, true, cachetype, cinfo);
+			extract_cacheunit(level, false, CACHE_TYPE_PIPT, cinfo);
 			break;
 		case CLIDR_TYPE_UNIFIEDCACHE:
 			cacheable = CACHE_CACHEABLE_UNIFIED;
-			extract_cacheunit(level, false, CACHE_TYPE_PIPT);
+			extract_cacheunit(level, false, CACHE_TYPE_PIPT, cinfo);
 			break;
 		default:
 			cacheable = CACHE_CACHEABLE_NONE;
 			break;
 		}
 
-		aarch64_cache_info[level].cacheable = cacheable;
+		cinfo[level].cacheable = cacheable;
 		if (cacheable == CACHE_CACHEABLE_NONE) {
 			/* no more level */
 			break;
@@ -164,24 +188,112 @@ aarch64_getcacheinfo(void)
 	}
 
 	/* calculate L1 icache virtual index size */
-	if (((aarch64_cache_info[0].icache.cache_type == CACHE_TYPE_VIVT) ||
-	     (aarch64_cache_info[0].icache.cache_type == CACHE_TYPE_VIPT)) &&
-	    ((aarch64_cache_info[0].cacheable == CACHE_CACHEABLE_ICACHE) ||
-	     (aarch64_cache_info[0].cacheable == CACHE_CACHEABLE_IDCACHE))) {
+	if (((cinfo[0].icache.cache_type == CACHE_TYPE_VIVT) ||
+	     (cinfo[0].icache.cache_type == CACHE_TYPE_VIPT)) &&
+	    ((cinfo[0].cacheable == CACHE_CACHEABLE_ICACHE) ||
+	     (cinfo[0].cacheable == CACHE_CACHEABLE_IDCACHE))) {
 
 		aarch64_cache_vindexsize =
-		    aarch64_cache_info[0].icache.cache_size /
-		    aarch64_cache_info[0].icache.cache_ways;
+		    cinfo[0].icache.cache_size /
+		    cinfo[0].icache.cache_ways;
 
 		KASSERT(aarch64_cache_vindexsize != 0);
 		aarch64_cache_prefer_mask = aarch64_cache_vindexsize - 1;
 	} else {
 		aarch64_cache_vindexsize = 0;
 	}
+}
+
+static int
+prt_cache(device_t self, struct aarch64_cache_info *cinfo, int level)
+{
+	struct aarch64_cache_unit *cunit;
+	u_int purging;
+	int i;
+	const char *cacheable, *cachetype;
+
+	if (cinfo[level].cacheable == CACHE_CACHEABLE_NONE)
+		return -1;
+
+	for (i = 0; i < 2; i++) {
+		switch (cinfo[level].cacheable) {
+		case CACHE_CACHEABLE_ICACHE:
+			cunit = &cinfo[level].icache;
+			cacheable = "Instruction";
+			break;
+		case CACHE_CACHEABLE_DCACHE:
+			cunit = &cinfo[level].dcache;
+			cacheable = "Data";
+			break;
+		case CACHE_CACHEABLE_IDCACHE:
+			if (i == 0) {
+				cunit = &cinfo[level].icache;
+				cacheable = "Instruction";
+			} else {
+				cunit = &cinfo[level].dcache;
+				cacheable = "Data";
+			}
+			break;
+		case CACHE_CACHEABLE_UNIFIED:
+			cunit = &cinfo[level].dcache;
+			cacheable = "Unified";
+			break;
+		default:
+			cunit = &cinfo[level].dcache;
+			cacheable = "*UNK*";
+			break;
+		}
+
+		switch (cunit->cache_type) {
+		case CACHE_TYPE_VIVT:
+			cachetype = "VIVT";
+			break;
+		case CACHE_TYPE_VIPT:
+			cachetype = "VIPT";
+			break;
+		case CACHE_TYPE_PIPT:
+			cachetype = "PIPT";
+			break;
+		default:
+			cachetype = "*UNK*";
+			break;
+		}
+
+		purging = cunit->cache_purging;
+		aprint_normal_dev(self,
+		    "L%d %dKB/%dB %d-way%s%s%s%s %s %s cache\n",
+		    level + 1,
+		    cunit->cache_size / 1024,
+		    cunit->cache_line_size,
+		    cunit->cache_ways,
+		    (purging & CACHE_PURGING_WT) ? " write-through" : "",
+		    (purging & CACHE_PURGING_WB) ? " write-back" : "",
+		    (purging & CACHE_PURGING_RA) ? " read-allocate" : "",
+		    (purging & CACHE_PURGING_WA) ? " write-allocate" : "",
+		    cachetype, cacheable);
+
+		if (cinfo[level].cacheable != CACHE_CACHEABLE_IDCACHE)
+			break;
+	}
 
 	return 0;
 }
 
+void
+aarch64_printcacheinfo(device_t dev)
+{
+	struct aarch64_cache_info *cinfo;
+	int level;
+
+	cinfo = curcpu()->ci_cacheinfo;
+
+	for (level = 0; level < MAX_CACHE_LEVEL; level++)
+		if (prt_cache(dev, cinfo, level) < 0)
+			break;
+}
+
+
+
 static inline void
 ln_dcache_wb_all(int level, struct aarch64_cache_unit *cunit)
 {
@@ -239,14 +351,17 @@ ln_dcache_inv_all(int level, struct aarc
 void
 aarch64_dcache_wbinv_all(void)
 {
+	struct aarch64_cache_info *cinfo;
 	int level;
 
+	cinfo = curcpu()->ci_cacheinfo;
+
 	for (level = 0; level < MAX_CACHE_LEVEL; level++) {
-		if (aarch64_cache_info[level].cacheable == CACHE_CACHEABLE_NONE)
+		if (cinfo[level].cacheable == CACHE_CACHEABLE_NONE)
 			break;
 
 		__asm __volatile ("dsb ish");
-		ln_dcache_wbinv_all(level, &aarch64_cache_info[level].dcache);
+		ln_dcache_wbinv_all(level, &cinfo[level].dcache);
 	}
 	__asm __volatile ("dsb ish");
 }
@@ -254,14 +369,17 @@ aarch64_dcache_wbinv_all(void)
 void
 aarch64_dcache_inv_all(void)
 {
+	struct aarch64_cache_info *cinfo;
 	int level;
 
+	cinfo = curcpu()->ci_cacheinfo;
+
 	for (level = 0; level < MAX_CACHE_LEVEL; level++) {
-		if (aarch64_cache_info[level].cacheable == CACHE_CACHEABLE_NONE)
+		if (cinfo[level].cacheable == CACHE_CACHEABLE_NONE)
 			break;
 
 		__asm __volatile ("dsb ish");
-		ln_dcache_inv_all(level, &aarch64_cache_info[level].dcache);
+		ln_dcache_inv_all(level, &cinfo[level].dcache);
 	}
 	__asm __volatile ("dsb ish");
 }
@@ -269,14 +387,17 @@ aarch64_dcache_inv_all(void)
 void
 aarch64_dcache_wb_all(void)
 {
+	struct aarch64_cache_info *cinfo;
 	int level;
 
+	cinfo = curcpu()->ci_cacheinfo;
+
 	for (level = 0; level < MAX_CACHE_LEVEL; level++) {
-		if (aarch64_cache_info[level].cacheable == CACHE_CACHEABLE_NONE)
+		if (cinfo[level].cacheable == CACHE_CACHEABLE_NONE)
 			break;
 
 		__asm __volatile ("dsb ish");
-		ln_dcache_wb_all(level, &aarch64_cache_info[level].dcache);
+		ln_dcache_wb_all(level, &cinfo[level].dcache);
 	}
 	__asm __volatile ("dsb ish");
 }

Index: src/sys/arch/aarch64/aarch64/genassym.cf
diff -u src/sys/arch/aarch64/aarch64/genassym.cf:1.6 src/sys/arch/aarch64/aarch64/genassym.cf:1.7
--- src/sys/arch/aarch64/aarch64/genassym.cf:1.6	Fri Aug  3 16:32:55 2018
+++ src/sys/arch/aarch64/aarch64/genassym.cf	Sun Aug 26 18:15:49 2018
@@ -1,4 +1,4 @@
-# $NetBSD: genassym.cf,v 1.6 2018/08/03 16:32:55 ryo Exp $
+# $NetBSD: genassym.cf,v 1.7 2018/08/26 18:15:49 ryo Exp $
 #-
 # Copyright (c) 2014 The NetBSD Foundation, Inc.
 # All rights reserved.
@@ -299,10 +299,13 @@ define	FPREG_Q31		offsetof(struct fpreg,
 define	FPREG_FPCR		offsetof(struct fpreg, fpcr)
 define	FPREG_FPSR		offsetof(struct fpreg, fpsr)
 
+define	CPU_INFO_SIZE		sizeof(struct cpu_info)
 define	CI_CURPRIORITY		offsetof(struct cpu_info, ci_schedstate.spc_curpriority)
 define	CI_CURLWP		offsetof(struct cpu_info, ci_curlwp)
 define	CI_CPL			offsetof(struct cpu_info, ci_cpl)
 define	CI_CPUID		offsetof(struct cpu_info, ci_cpuid)
+define	CI_MIDR			offsetof(struct cpu_info, ci_midr)
+define	CI_MPIDR		offsetof(struct cpu_info, ci_mpidr)
 define	CI_ASTPENDING		offsetof(struct cpu_info, ci_astpending)
 define	CI_WANT_RESCHED		offsetof(struct cpu_info, ci_want_resched)
 define	CI_INTR_DEPTH		offsetof(struct cpu_info, ci_intr_depth)

Index: src/sys/arch/aarch64/aarch64/locore.S
diff -u src/sys/arch/aarch64/aarch64/locore.S:1.19 src/sys/arch/aarch64/aarch64/locore.S:1.20
--- src/sys/arch/aarch64/aarch64/locore.S:1.19	Fri Aug 24 19:06:30 2018
+++ src/sys/arch/aarch64/aarch64/locore.S	Sun Aug 26 18:15:49 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: locore.S,v 1.19 2018/08/24 19:06:30 ryo Exp $	*/
+/*	$NetBSD: locore.S,v 1.20 2018/08/26 18:15:49 ryo Exp $	*/
 
 /*
  * Copyright (c) 2017 Ryo Shimizu <r...@nerv.org>
@@ -35,7 +35,7 @@
 #include <aarch64/hypervisor.h>
 #include "assym.h"
 
-RCSID("$NetBSD: locore.S,v 1.19 2018/08/24 19:06:30 ryo Exp $")
+RCSID("$NetBSD: locore.S,v 1.20 2018/08/26 18:15:49 ryo Exp $")
 
 /* #define DEBUG_LOCORE */
 /* #define DEBUG_MMU */
@@ -241,14 +241,14 @@ END(aarch64_start)
 
 #if defined(VERBOSE_LOCORE) || defined(DEBUG_LOCORE)
 /*
- * print "[CPU$x27] " (x27 as cpuid)
+ * print "[CPU$x27] " (x27 as cpuindex)
  * XXX: max 4 digit
  */
 printcpu:
 	stp	x0, lr, [sp, #-16]!
 	stp	x25, x26, [sp, #-16]!
 	PRINT("[CPU")
-	mov	x26, x27		/* n = cpuid */
+	mov	x26, x27		/* n = cpuindex */
 	mov	x25, xzr		/* zeropad = 0 */
 	mov	x1, #1000
 	udiv	x0, x26, x1		/* x0 = n / 1000 */
@@ -294,25 +294,21 @@ printcpu:
 
 ENTRY_NP(aarch64_mpstart)
 ENTRY_NP(cortex_mpstart)	/* compat arm */
-	/*
-	 * XXX:
-	 *  cpuid(index) is read from MPIDR_EL1.AFF0. AFF1,2,3 are ignored.
-	 *  cpuid should be passed from primary processor...
-	 */
-	mrs	x27, mpidr_el1
-	and	x27, x27, #MPIDR_AFF0	/* XXX: cpuid = mpidr_el1 & Aff0 */
+	ADDR	x0, arm_cpu_hatch_arg	/* from cpu0 */
+	ldr	w27, [x0]		/* x27 = cpuindex */
 	mov	x0, #1
-	lsl	x28, x0, x27		/* x28 = 1 << cpuid */
-	mov	x0, x28
+	lsl	x28, x0, x27		/* x28 = 1 << cpuindex */
 
-	/* x27 = cpuid, x28 = (1 << cpuid) */
+	/* x27 = cpuindex, x28 = (1 << cpuindex) */
+	cmp	x27, MAXCPUS
+	bge	toomanycpus
 
 	/* set stack pointer for boot */
 #define BOOT_STACKSIZE	256
 	mov	x1, #BOOT_STACKSIZE
 	mul	x1, x1, x27
 	ADDR	x0, bootstk_cpus
-	sub	sp, x0, x1	/* sp = bootstk_cpus - BOOT_STACKSIZE * cpuid */
+	sub	sp, x0, x1	/* sp= bootstk_cpus-(BOOT_STACKSIZE*cpuindex) */
 
 #ifdef DEBUG_LOCORE
 	PRINTCPU()
@@ -384,7 +380,7 @@ mp_vstart:
 	PRINTCPU()
 	PRINT("arm_cpu_hatched  = ")
 	ADDR	x0, _C_LABEL(arm_cpu_hatched)
-	ldr	x0, [x0]
+	ldr	w0, [x0]
 	bl	print_x0
 
 	PRINTCPU()
@@ -393,17 +389,22 @@ mp_vstart:
 	bl	print_x0
 #endif
 
-	ADDR	x0, _C_LABEL(cpus_midr)
-	mrs	x1, midr_el1
-	str	w1, [x0, x27, lsl #2]	/* cpu_midr[cpuid] = midr_el1 */
+	msr	tpidr_el0, xzr		/* tpidr_el0 (for TLS) = NULL */
 
-	ADDR	x0, _C_LABEL(cpus_mpidr)
-	mrs	x1, mpidr_el1
-	str	x1, [x0, x27, lsl #3]	/* cpu_mpidr[cpuid] = mpidr_el1 */
+	/* set curcpu(), and fill curcpu()->ci_{midr,mpidr} */
+	mov	x0, #CPU_INFO_SIZE
+	mul	x0, x27, x0
+	ADDR	x1, _C_LABEL(cpu_info_store)
+	add	x0, x0, x1		/* x0 = &cpu_info_store[cpuindex] */
+	msr	tpidr_el1, x0		/* tpidr_el1 = curcpu() = x0 */
 
+	mrs	x1, midr_el1
+	str	x1, [x0, #CI_MIDR]	/* curcpu()->ci_cpuid = midr_el1 */
+	mrs	x1, mpidr_el1
+	str	x1, [x0, #CI_MPIDR]	/* curcpu()->ci_mpidr = mpidr_el1 */
 
 	/*
-	 * atomic_or_32(&arm_cpu_hatched, 1 << cpuid)
+	 * atomic_or_32(&arm_cpu_hatched, (1 << cpuindex))
 	 * to tell my activity to primary processor.
 	 */
 	ADDR	x0, _C_LABEL(arm_cpu_hatched)
@@ -415,7 +416,7 @@ mp_vstart:
 	PRINTCPU()
 	PRINT("arm_cpu_hatched -> ")
 	ADDR	x0, _C_LABEL(arm_cpu_hatched)
-	ldr	x0, [x0]
+	ldr	w0, [x0]
 	bl	print_x0
 #endif
 
@@ -438,7 +439,7 @@ mp_vstart:
 #ifdef DEBUG_LOCORE
 	/* XXX: delay to prevent the mixing of console output */
 	mov	x0, #0x4000000
-	mul	x0, x0, x27	/* delay (cpuid * 0x4000000) */
+	mul	x0, x0, x27	/* delay (cpuindex * 0x4000000) */
 1:	subs	x0, x0, #1
 	bne	1b
 
@@ -452,12 +453,8 @@ mp_vstart:
 	bl	print_x0
 #endif
 
-	msr	tpidr_el0, xzr		/* tpidr_el0 (for TLS) = NULL */
-
 	/* fill my cpu_info */
-	ADDR	x0, _C_LABEL(cpu_info)
-	ldr	x0, [x0, x27, lsl #3]	/* x0 = cpu_info[cpuid] */
-	msr	tpidr_el1, x0		/* tpidr_el1 = my cpu_info */
+	mrs	x0, tpidr_el1		/* curcpu() */
 
 	ldr	x1, [x0, #CI_IDLELWP]	/* x1 = curcpu()->ci_data.cpu_idlelwp */
 	str	x1, [x0, #CI_CURLWP]	/* curlwp is idlelwp */
@@ -473,6 +470,13 @@ mp_vstart:
 	b	_C_LABEL(idle_loop)	/* never to return */
 END(aarch64_mpstart)
 
+toomanycpus:
+	PRINTCPU()
+	PRINT("too many cpus\r\n")
+1:	wfi
+	b	1b
+
+
 #else /* MULTIPROCESSOR */
 
 ENTRY_NP(aarch64_mpstart)

Index: src/sys/arch/aarch64/include/cpu.h
diff -u src/sys/arch/aarch64/include/cpu.h:1.6 src/sys/arch/aarch64/include/cpu.h:1.7
--- src/sys/arch/aarch64/include/cpu.h:1.6	Wed Aug  8 19:01:15 2018
+++ src/sys/arch/aarch64/include/cpu.h	Sun Aug 26 18:15:49 2018
@@ -1,4 +1,4 @@
-/* $NetBSD: cpu.h,v 1.6 2018/08/08 19:01:15 jmcneill Exp $ */
+/* $NetBSD: cpu.h,v 1.7 2018/08/26 18:15:49 ryo Exp $ */
 
 /*-
  * Copyright (c) 2014 The NetBSD Foundation, Inc.
@@ -38,6 +38,8 @@
 #include "opt_multiprocessor.h"
 #endif
 
+#include <sys/param.h>
+
 #if defined(_KERNEL) || defined(_KMEMUSER)
 #include <sys/evcnt.h>
 #include <aarch64/frame.h>
@@ -82,7 +84,13 @@ struct cpu_info {
 	/* interrupt controller */
 	u_int ci_gic_redist;	/* GICv3 redistributor index */
 	uint64_t ci_gic_sgir;	/* GICv3 SGIR target */
-};
+
+	uint64_t ci_midr;	/* MIDR_EL1 */
+	uint64_t ci_mpidr;	/* MPIDR_EL1 */
+
+	struct aarch64_cache_info *ci_cacheinfo;
+
+} __aligned(COHERENCY_UNIT);
 
 static inline struct cpu_info *
 curcpu(void)
@@ -103,8 +111,7 @@ void cpu_hatch(struct cpu_info *);
 
 extern struct cpu_info *cpu_info[];
 extern volatile u_int arm_cpu_hatched;	/* MULTIPROCESSOR */
-extern uint32_t cpus_midr[];		/* MULTIPROCESSOR */
-extern uint64_t cpus_mpidr[];		/* MULTIPROCESSOR */
+extern volatile u_int arm_cpu_hatch_arg;/* MULTIPROCESSOR */
 
 #define CPU_INFO_ITERATOR	cpuid_t
 #ifdef MULTIPROCESSOR

Index: src/sys/arch/aarch64/include/cpufunc.h
diff -u src/sys/arch/aarch64/include/cpufunc.h:1.2 src/sys/arch/aarch64/include/cpufunc.h:1.3
--- src/sys/arch/aarch64/include/cpufunc.h:1.2	Mon Jul 23 22:51:39 2018
+++ src/sys/arch/aarch64/include/cpufunc.h	Sun Aug 26 18:15:49 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: cpufunc.h,v 1.2 2018/07/23 22:51:39 ryo Exp $	*/
+/*	$NetBSD: cpufunc.h,v 1.3 2018/08/26 18:15:49 ryo Exp $	*/
 
 /*
  * Copyright (c) 2017 Ryo Shimizu <r...@nerv.org>
@@ -74,7 +74,8 @@ extern u_int aarch64_cache_vindexsize;	/
 extern u_int aarch64_cache_prefer_mask;
 extern u_int cputype;			/* compat arm */
 
-int aarch64_getcacheinfo(void);
+void aarch64_getcacheinfo(void);
+void aarch64_printcacheinfo(device_t);
 
 void aarch64_dcache_wbinv_all(void);
 void aarch64_dcache_inv_all(void);

Index: src/sys/arch/arm/broadcom/bcm283x_platform.c
diff -u src/sys/arch/arm/broadcom/bcm283x_platform.c:1.14 src/sys/arch/arm/broadcom/bcm283x_platform.c:1.15
--- src/sys/arch/arm/broadcom/bcm283x_platform.c:1.14	Sat Aug 25 20:55:15 2018
+++ src/sys/arch/arm/broadcom/bcm283x_platform.c	Sun Aug 26 18:15:49 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: bcm283x_platform.c,v 1.14 2018/08/25 20:55:15 rin Exp $	*/
+/*	$NetBSD: bcm283x_platform.c,v 1.15 2018/08/26 18:15:49 ryo Exp $	*/
 
 /*-
  * Copyright (c) 2017 Jared D. McNeill <jmcne...@invisible.ca>
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: bcm283x_platform.c,v 1.14 2018/08/25 20:55:15 rin Exp $");
+__KERNEL_RCSID(0, "$NetBSD: bcm283x_platform.c,v 1.15 2018/08/26 18:15:49 ryo Exp $");
 
 #include "opt_arm_debug.h"
 #include "opt_bcm283x.h"
@@ -738,12 +738,19 @@ bcm2836_bootstrap(void)
 #endif
 #endif /* MULTIPROCESSOR */
 
-#ifdef __aarch64__
 	/*
-	 * XXX: use psci_fdt_bootstrap()
+	 * XXX: TODO:
+	 *   should make cpu_fdt_bootstrap() that support spin-table and use it
+	 *   to share with arm/aarch64.
 	 */
+#ifdef __aarch64__
 	extern void aarch64_mpstart(void);
 	for (int i = 1; i < RPI_CPU_MAX; i++) {
+		/* argument for mpstart() */
+		arm_cpu_hatch_arg = i;
+		cpu_dcache_wb_range((vaddr_t)&arm_cpu_hatch_arg,
+		    sizeof(arm_cpu_hatch_arg));
+
 		/*
 		 * Reference:
 		 *   armstubs/armstub8.S
@@ -753,16 +760,22 @@ bcm2836_bootstrap(void)
 #define RPI3_ARMSTUB8_SPINADDR_BASE	0x000000d8
 		cpu_release_addr = (void *)
 		    AARCH64_PA_TO_KVA(RPI3_ARMSTUB8_SPINADDR_BASE + i * 8);
-		*cpu_release_addr = aarch64_kern_vtophys((vaddr_t)aarch64_mpstart);
+		*cpu_release_addr =
+		    aarch64_kern_vtophys((vaddr_t)aarch64_mpstart);
 
 		/* need flush cache. secondary processors are cache disabled */
-		cpu_dcache_wb_range((vaddr_t)cpu_release_addr, sizeof(cpu_release_addr));
+		cpu_dcache_wb_range((vaddr_t)cpu_release_addr,
+		    sizeof(cpu_release_addr));
+		/* Wake up AP in case firmware has placed it in WFE state */
 		__asm __volatile("sev" ::: "memory");
 
-#if defined(VERBOSE_INIT_ARM) && defined(EARLYCONS)
-		/* wait secondary processor's debug output */
-		gtmr_delay(100000);
-#endif
+		/* Wait for APs to start */
+		for (int loop = 0; loop < 16; loop++) {
+			membar_consumer();
+			if (arm_cpu_hatched & __BIT(i))
+				break;
+			gtmr_delay(10000);
+		}
 	}
 #endif /* __aarch64__ */
 
@@ -772,6 +785,7 @@ bcm2836_bootstrap(void)
 	 * It is need to initialize the secondary CPU,
 	 * and go into wfi loop (cortex_mpstart),
 	 * otherwise system would be freeze...
+	 * (because netbsd will use the spinning address)
 	 */
 	extern void cortex_mpstart(void);
 
@@ -782,29 +796,26 @@ bcm2836_bootstrap(void)
 		bus_space_write_4(iot, ioh,
 		    BCM2836_LOCAL_MAILBOX3_SETN(i),
 		    (uint32_t)cortex_mpstart);
+		/* Wake up AP in case firmware has placed it in WFE state */
+		__asm __volatile("sev" ::: "memory");
+
+		/* Wait for APs to start */
+		for (int loop = 0; loop < 16; loop++) {
+			membar_consumer();
+			if (arm_cpu_hatched & __BIT(i))
+				break;
+			gtmr_delay(10000);
+		}
 	}
 #endif
 
 #ifdef MULTIPROCESSOR
-	/* Wake up AP in case firmware has placed it in WFE state */
-	__asm __volatile("sev" ::: "memory");
-
-	for (int loop = 0; loop < 16; loop++) {
-		if (arm_cpu_hatched == __BITS(arm_cpu_max - 1, 1))
-			break;
-		gtmr_delay(10000);
-	}
-
 	for (size_t i = 1; i < arm_cpu_max; i++) {
 		if ((arm_cpu_hatched & (1 << i)) == 0) {
 			printf("%s: warning: cpu%zu failed to hatch\n",
 			    __func__, i);
 		}
 	}
-#if defined(VERBOSE_INIT_ARM) && defined(EARLYCONS)
-	/* for viewability of secondary processor's debug outputs */
-	printf("\n");
-#endif
 #endif
 }
 

Index: src/sys/arch/arm/fdt/cpu_fdt.c
diff -u src/sys/arch/arm/fdt/cpu_fdt.c:1.8 src/sys/arch/arm/fdt/cpu_fdt.c:1.9
--- src/sys/arch/arm/fdt/cpu_fdt.c:1.8	Mon Jul  2 16:36:49 2018
+++ src/sys/arch/arm/fdt/cpu_fdt.c	Sun Aug 26 18:15:49 2018
@@ -1,4 +1,4 @@
-/* $NetBSD: cpu_fdt.c,v 1.8 2018/07/02 16:36:49 jmcneill Exp $ */
+/* $NetBSD: cpu_fdt.c,v 1.9 2018/08/26 18:15:49 ryo Exp $ */
 
 /*-
  * Copyright (c) 2017 Jared McNeill <jmcne...@invisible.ca>
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: cpu_fdt.c,v 1.8 2018/07/02 16:36:49 jmcneill Exp $");
+__KERNEL_RCSID(0, "$NetBSD: cpu_fdt.c,v 1.9 2018/08/26 18:15:49 ryo Exp $");
 
 #include <sys/param.h>
 #include <sys/bus.h>
@@ -97,15 +97,17 @@ cpu_fdt_match(device_t parent, cfdata_t 
 	switch (type) {
 	case ARM_CPU_ARMV7:
 	case ARM_CPU_ARMV8:
-		/* XXX NetBSD requires all CPUs to be in the same cluster */
 		if (fdtbus_get_reg(phandle, 0, &mpidr, NULL) != 0)
 			return 0;
 
+#ifndef __aarch64__
+		/* XXX NetBSD/arm requires all CPUs to be in the same cluster */
 		const u_int bp_clid = cpu_clusterid();
 		const u_int clid = __SHIFTOUT(mpidr, MPIDR_AFF1);
 
 		if (bp_clid != clid)
 			return 0;
+#endif
 		break;
 	default:
 		break;
@@ -136,8 +138,10 @@ cpu_fdt_attach(device_t parent, device_t
 			aprint_error(": missing 'reg' property\n");
 			return;
 		}
-
-		cpuid = __SHIFTOUT(mpidr, MPIDR_AFF0);
+#ifndef __aarch64__
+		mpidr = __SHIFTOUT(mpidr, MPIDR_AFF0);
+#endif
+		cpuid = mpidr;
 		break;
 	default:
 		cpuid = 0;

Index: src/sys/arch/arm/fdt/psci_fdt.c
diff -u src/sys/arch/arm/fdt/psci_fdt.c:1.14 src/sys/arch/arm/fdt/psci_fdt.c:1.15
--- src/sys/arch/arm/fdt/psci_fdt.c:1.14	Fri Aug 24 21:56:13 2018
+++ src/sys/arch/arm/fdt/psci_fdt.c	Sun Aug 26 18:15:49 2018
@@ -1,4 +1,4 @@
-/* $NetBSD: psci_fdt.c,v 1.14 2018/08/24 21:56:13 ryo Exp $ */
+/* $NetBSD: psci_fdt.c,v 1.15 2018/08/26 18:15:49 ryo Exp $ */
 
 /*-
  * Copyright (c) 2017 Jared McNeill <jmcne...@invisible.ca>
@@ -29,7 +29,7 @@
 #include "opt_multiprocessor.h"
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: psci_fdt.c,v 1.14 2018/08/24 21:56:13 ryo Exp $");
+__KERNEL_RCSID(0, "$NetBSD: psci_fdt.c,v 1.15 2018/08/26 18:15:49 ryo Exp $");
 
 #include <sys/param.h>
 #include <sys/bus.h>
@@ -42,6 +42,7 @@ __KERNEL_RCSID(0, "$NetBSD: psci_fdt.c,v
 
 #include <arm/locore.h>
 #include <arm/armreg.h>
+#include <arm/cpufunc.h>
 
 #include <arm/arm/psci.h>
 #include <arm/fdt/psci_fdt.h>
@@ -172,8 +173,8 @@ void
 psci_fdt_bootstrap(void)
 {
 #ifdef MULTIPROCESSOR
-	extern void cortex_mpstart(void);
 	uint64_t mpidr, bp_mpidr;
+	u_int cpuindex;
 	int child;
 	const char *devtype;
 
@@ -199,7 +200,7 @@ psci_fdt_bootstrap(void)
 	bp_mpidr = cpu_mpidr_aff_read();
 
 	/* Boot APs */
-	uint32_t started = 0;
+	cpuindex = 1;
 	for (child = OF_child(cpus); child; child = OF_peer(child)) {
 		if (!fdtbus_status_okay(child))
 			continue;
@@ -208,21 +209,25 @@ psci_fdt_bootstrap(void)
 		if (mpidr == bp_mpidr)
 			continue; 	/* BP already started */
 
-		/* XXX NetBSD requires all CPUs to be in the same cluster */
-		if ((mpidr & ~MPIDR_AFF0) != (bp_mpidr & ~MPIDR_AFF0))
+#ifdef __aarch64__
+		/* argument for mpstart() */
+		arm_cpu_hatch_arg = cpuindex;
+		cpu_dcache_wb_range((vaddr_t)&arm_cpu_hatch_arg,
+		    sizeof(arm_cpu_hatch_arg));
+#endif
+
+		int ret = psci_cpu_on(cpuindex, psci_fdt_mpstart_pa(), 0);
+		if (ret != PSCI_SUCCESS)
 			continue;
 
-		const u_int cpuid = __SHIFTOUT(mpidr, MPIDR_AFF0);
-		int ret = psci_cpu_on(mpidr, psci_fdt_mpstart_pa(), 0);
-		if (ret == PSCI_SUCCESS)
-			started |= __BIT(cpuid);
-	}
+		/* Wait for APs to start */
+		for (u_int i = 0x4000000; i > 0; i--) {
+			membar_consumer();
+			if (arm_cpu_hatched & __BIT(cpuindex))
+				break;
+		}
 
-	/* Wait for APs to start */
-	for (u_int i = 0x10000000; i > 0; i--) {
-		membar_consumer();
-		if (arm_cpu_hatched == started)
-			break;
+		cpuindex++;
 	}
 #endif
 }

Reply via email to