On Mon, Nov 04, 2013 at 11:54:34AM +0100, Paolo Bonzini wrote: > Il 04/11/2013 11:07, Michael S. Tsirkin ha scritto: > > On Mon, Nov 04, 2013 at 11:50:05AM +0200, Marcel Apfelbaum wrote: > >> On Mon, 2013-11-04 at 08:06 +0200, Michael S. Tsirkin wrote: > >>> The page table logic in exec.c assumes > >>> that memory addresses are at most TARGET_PHYS_ADDR_SPACE_BITS. > >>> > >>> But pci addresses are full 64 bit so if we try to render them ignoring > >>> the extra bits, we get strange effects with sections overlapping each > >>> other. > >>> > >>> To fix, simply limit the system memory size to > >>> 1 << TARGET_PHYS_ADDR_SPACE_BITS, > >>> pci addresses will be rendered within that. > >>> > >>> Signed-off-by: Michael S. Tsirkin <m...@redhat.com> > >>> --- > >>> exec.c | 6 +++++- > >>> 1 file changed, 5 insertions(+), 1 deletion(-) > >>> > >>> diff --git a/exec.c b/exec.c > >>> index 030118e..c7a8df5 100644 > >>> --- a/exec.c > >>> +++ b/exec.c > >>> @@ -1801,7 +1801,12 @@ void address_space_destroy_dispatch(AddressSpace > >>> *as) > >>> static void memory_map_init(void) > >>> { > >>> system_memory = g_malloc(sizeof(*system_memory)); > >>> - memory_region_init(system_memory, NULL, "system", INT64_MAX); > >>> + > >>> + assert(TARGET_PHYS_ADDR_SPACE_BITS <= 64); > >>> + > >>> + memory_region_init(system_memory, NULL, "system", > >>> + TARGET_PHYS_ADDR_SPACE_BITS == 64 ? > >>> + UINT64_MAX : (0x1ULL << > >>> TARGET_PHYS_ADDR_SPACE_BITS)); > >> > >> Michael, thanks again for the help. > >> > >> I am concerned that we cannot use all the UINT64_MAX > >> address space. > > > > Well, exec isn't ready for this, it expects at most > > TARGET_PHYS_ADDR_SPACE_BITS. > > Fortunately there's no way for CPU to initiate io outside > > this area. > > > > So this is another place where device to device IO > > would be broken. > > However, firmware that places BARs where the CPU cannot access them > would be "interesting" to say the least. 
This applies both to device->device I/O and to non-aligned <4K BARs. > > This patch looks good; however, on top of it can you test > kvm-unit-tests with TARGET_PHYS_ADDR_SPACE_BITS=64 and see whether > there is a measurable slowdown (in the inl_from_qemu tests)? If not, > we can just get rid of TARGET_PHYS_ADDR_SPACE_BITS in exec.c.
I'd rather we fixed a bug first - we need to fix it on stable too - any cleanups can come on top. Also, I'm not sure what will this test tell us: inl reads io space, not memory, right? > > Note that L2_BITS is shared between translate-all.c and exec.c only for > historical reasons (the translate-all.c code used to be in exec.c). It > is probably a good idea to split them like this: Want to test this and post properly? > diff --git a/exec.c b/exec.c > index 2e31ffc..3faea0e 100644 > --- a/exec.c > +++ b/exec.c > @@ -88,7 +88,15 @@ struct PhysPageEntry { > uint16_t ptr : 15; > }; > > -typedef PhysPageEntry Node[L2_SIZE]; > +/* Size of the L2 (and L3, etc) page tables. */ > +#define ADDR_SPACE_BITS 64 > + > +#define P_L2_BITS 10 > +#define P_L2_SIZE (1 << P_L2_BITS) > + > +#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) > + 1) > + > +typedef PhysPageEntry Node[P_L2_SIZE]; > > struct AddressSpaceDispatch { > /* This is a multi-level map on the physical address space. 
> @@ -155,7 +163,7 @@ static uint16_t phys_map_node_alloc(void) > ret = next_map.nodes_nb++; > assert(ret != PHYS_MAP_NODE_NIL); > assert(ret != next_map.nodes_nb_alloc); > - for (i = 0; i < L2_SIZE; ++i) { > + for (i = 0; i < P_L2_SIZE; ++i) { > next_map.nodes[ret][i].is_leaf = 0; > next_map.nodes[ret][i].ptr = PHYS_MAP_NODE_NIL; > } > @@ -168,13 +176,13 @@ static void phys_page_set_level(PhysPageEntry *lp, > hwaddr *index, > { > PhysPageEntry *p; > int i; > - hwaddr step = (hwaddr)1 << (level * L2_BITS); > + hwaddr step = (hwaddr)1 << (level * P_L2_BITS); > > if (!lp->is_leaf && lp->ptr == PHYS_MAP_NODE_NIL) { > lp->ptr = phys_map_node_alloc(); > p = next_map.nodes[lp->ptr]; > if (level == 0) { > - for (i = 0; i < L2_SIZE; i++) { > + for (i = 0; i < P_L2_SIZE; i++) { > p[i].is_leaf = 1; > p[i].ptr = PHYS_SECTION_UNASSIGNED; > } > @@ -182,9 +190,9 @@ static void phys_page_set_level(PhysPageEntry *lp, hwaddr > *index, > } else { > p = next_map.nodes[lp->ptr]; > } > - lp = &p[(*index >> (level * L2_BITS)) & (L2_SIZE - 1)]; > + lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)]; > > - while (*nb && lp < &p[L2_SIZE]) { > + while (*nb && lp < &p[P_L2_SIZE]) { > if ((*index & (step - 1)) == 0 && *nb >= step) { > lp->is_leaf = true; > lp->ptr = leaf; > @@ -218,7 +226,7 @@ static MemoryRegionSection *phys_page_find(PhysPageEntry > lp, hwaddr index, > return §ions[PHYS_SECTION_UNASSIGNED]; > } > p = nodes[lp.ptr]; > - lp = p[(index >> (i * L2_BITS)) & (L2_SIZE - 1)]; > + lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)]; > } > return §ions[lp.ptr]; > } > diff --git a/translate-all.c b/translate-all.c > index aeda54d..1c63d78 100644 > --- a/translate-all.c > +++ b/translate-all.c > @@ -96,12 +96,16 @@ typedef struct PageDesc { > # define L1_MAP_ADDR_SPACE_BITS TARGET_VIRT_ADDR_SPACE_BITS > #endif > > +/* Size of the L2 (and L3, etc) page tables. 
*/ > +#define V_L2_BITS 10 > +#define V_L2_SIZE (1 << V_L2_BITS) > + > /* The bits remaining after N lower levels of page tables. */ > #define V_L1_BITS_REM \ > - ((L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS) % L2_BITS) > + ((L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS) % V_L2_BITS) > > #if V_L1_BITS_REM < 4 > -#define V_L1_BITS (V_L1_BITS_REM + L2_BITS) > +#define V_L1_BITS (V_L1_BITS_REM + V_L2_BITS) > #else > #define V_L1_BITS V_L1_BITS_REM > #endif > @@ -395,18 +399,18 @@ static PageDesc *page_find_alloc(tb_page_addr_t index, > int alloc) > lp = l1_map + ((index >> V_L1_SHIFT) & (V_L1_SIZE - 1)); > > /* Level 2..N-1. */ > - for (i = V_L1_SHIFT / L2_BITS - 1; i > 0; i--) { > + for (i = V_L1_SHIFT / V_L2_BITS - 1; i > 0; i--) { > void **p = *lp; > > if (p == NULL) { > if (!alloc) { > return NULL; > } > - ALLOC(p, sizeof(void *) * L2_SIZE); > + ALLOC(p, sizeof(void *) * V_L2_SIZE); > *lp = p; > } > > - lp = p + ((index >> (i * L2_BITS)) & (L2_SIZE - 1)); > + lp = p + ((index >> (i * V_L2_BITS)) & (V_L2_SIZE - 1)); > } > > pd = *lp; > @@ -414,13 +418,13 @@ static PageDesc *page_find_alloc(tb_page_addr_t index, > int alloc) > if (!alloc) { > return NULL; > } > - ALLOC(pd, sizeof(PageDesc) * L2_SIZE); > + ALLOC(pd, sizeof(PageDesc) * V_L2_SIZE); > *lp = pd; > } > > #undef ALLOC > > - return pd + (index & (L2_SIZE - 1)); > + return pd + (index & (V_L2_SIZE - 1)); > } > > static inline PageDesc *page_find(tb_page_addr_t index) > @@ -655,14 +659,14 @@ static void page_flush_tb_1(int level, void **lp) > if (level == 0) { > PageDesc *pd = *lp; > > - for (i = 0; i < L2_SIZE; ++i) { > + for (i = 0; i < V_L2_SIZE; ++i) { > pd[i].first_tb = NULL; > invalidate_page_bitmap(pd + i); > } > } else { > void **pp = *lp; > > - for (i = 0; i < L2_SIZE; ++i) { > + for (i = 0; i < V_L2_SIZE; ++i) { > page_flush_tb_1(level - 1, pp + i); > } > } > @@ -673,7 +677,7 @@ static void page_flush_tb(void) > int i; > > for (i = 0; i < V_L1_SIZE; i++) { > - page_flush_tb_1(V_L1_SHIFT / L2_BITS - 1, 
l1_map + i); > + page_flush_tb_1(V_L1_SHIFT / V_L2_BITS - 1, l1_map + i); > } > } > > @@ -1600,7 +1604,7 @@ static int walk_memory_regions_1(struct > walk_memory_regions_data *data, > if (level == 0) { > PageDesc *pd = *lp; > > - for (i = 0; i < L2_SIZE; ++i) { > + for (i = 0; i < V_L2_SIZE; ++i) { > int prot = pd[i].flags; > > pa = base | (i << TARGET_PAGE_BITS); > @@ -1614,9 +1618,9 @@ static int walk_memory_regions_1(struct > walk_memory_regions_data *data, > } else { > void **pp = *lp; > > - for (i = 0; i < L2_SIZE; ++i) { > + for (i = 0; i < V_L2_SIZE; ++i) { > pa = base | ((abi_ulong)i << > - (TARGET_PAGE_BITS + L2_BITS * level)); > + (TARGET_PAGE_BITS + V_L2_BITS * level)); > rc = walk_memory_regions_1(data, pa, level - 1, pp + i); > if (rc != 0) { > return rc; > @@ -1639,7 +1643,7 @@ int walk_memory_regions(void *priv, > walk_memory_regions_fn fn) > > for (i = 0; i < V_L1_SIZE; i++) { > int rc = walk_memory_regions_1(&data, (abi_ulong)i << V_L1_SHIFT, > - V_L1_SHIFT / L2_BITS - 1, l1_map + i); > + V_L1_SHIFT / V_L2_BITS - 1, l1_map + > i); > > if (rc != 0) { > return rc; > diff --git a/translate-all.h b/translate-all.h > index 5c38819..f7e5932 100644 > --- a/translate-all.h > +++ b/translate-all.h > @@ -19,13 +19,6 @@ > #ifndef TRANSLATE_ALL_H > #define TRANSLATE_ALL_H > > -/* Size of the L2 (and L3, etc) page tables. */ > -#define L2_BITS 10 > -#define L2_SIZE (1 << L2_BITS) > - > -#define P_L2_LEVELS \ > - (((TARGET_PHYS_ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / L2_BITS) + 1) > - > /* translate-all.c */ > void tb_invalidate_phys_page_fast(tb_page_addr_t start, int len); > void cpu_unlink_tb(CPUState *cpu); > > Paolo