Re: [Mesa-dev] [PATCH v2 04/16] intel: aubinator: drop the 1Tb GTT mapping
Patch is Reviewed-by: Rafael Antognolli On Tue, Jun 19, 2018 at 02:45:19PM +0100, Lionel Landwerlin wrote: > Now that we're softpinning the address of our BOs in anv & i965, the > addresses selected start at the top of the addressing space. This is a > problem for the current implementation of aubinator which uses only a > 40bit mmapped address space. > > This change keeps track of all the memory writes from the aub file and > fetch them on request by the batch decoder. As a result we can get rid > of the 1<<40 mmapped address space and only rely on the mmap aub file > \o/ > > Signed-off-by: Lionel Landwerlin > --- > src/intel/tools/aubinator.c | 130 > 1 file changed, 72 insertions(+), 58 deletions(-) > > diff --git a/src/intel/tools/aubinator.c b/src/intel/tools/aubinator.c > index 0438f96cd1b..f70038376be 100644 > --- a/src/intel/tools/aubinator.c > +++ b/src/intel/tools/aubinator.c > @@ -37,10 +37,12 @@ > #include > #include > > +#include "util/list.h" > #include "util/macros.h" > > #include "common/gen_decoder.h" > #include "common/gen_disasm.h" > +#include "common/gen_gem.h" > #include "intel_aub.h" > > /* Below is the only command missing from intel_aub.h in libdrm > @@ -68,8 +70,12 @@ char *input_file = NULL, *xml_path = NULL; > struct gen_device_info devinfo; > struct gen_batch_decode_ctx batch_ctx; > > -uint64_t gtt_size, gtt_end; > -void *gtt; > +struct bo_map { > + struct list_head link; > + struct gen_batch_decode_bo bo; > +}; > + > +static struct list_head maps; > > FILE *outfile; > > @@ -85,10 +91,32 @@ field(uint32_t value, int start, int end) > > struct brw_instruction; > > -static inline int > -valid_offset(uint32_t offset) > +static void > +add_gtt_bo_map(struct gen_batch_decode_bo bo) > { > - return offset < gtt_end; > + struct bo_map *m = calloc(1, sizeof(*m)); > + > + m->bo = bo; > + list_add(&m->link, &maps); > +} > + > +static void > +clear_bo_maps(void) > +{ > + list_for_each_entry_safe(struct bo_map, i, &maps, link) { > + list_del(&i->link); > + free(i); > + } > +} > + > +static struct gen_batch_decode_bo > +get_gen_batch_bo(void *user_data, uint64_t address) > +{ > + list_for_each_entry(struct bo_map, i, &maps, link) > + if (i->bo.addr <= address && i->bo.addr + i->bo.size > address) > + return i->bo; > + > + return (struct gen_batch_decode_bo) { .map = NULL }; > } > > #define GEN_ENGINE_RENDER 1 > @@ -100,26 +128,23 @@ handle_trace_block(uint32_t *p) > int operation = p[1] & AUB_TRACE_OPERATION_MASK; > int type = p[1] & AUB_TRACE_TYPE_MASK; > int address_space = p[1] & AUB_TRACE_ADDRESS_SPACE_MASK; > - uint64_t offset = p[3]; > - uint32_t size = p[4]; > int header_length = p[0] & 0x; > - uint32_t *data = p + header_length + 2; > int engine = GEN_ENGINE_RENDER; > - > - if (devinfo.gen >= 8) > - offset += (uint64_t) p[5] << 32; > + struct gen_batch_decode_bo bo = { > + .map = p + header_length + 2, > + /* Addresses written by aubdump here are in canonical form but the > batch > + * decoder always gives us addresses with the top 16bits zeroed, so do > + * the same here. > + */ > + .addr = gen_48b_address((devinfo.gen >= 8 ? ((uint64_t) p[5] << 32) : > 0) | > + ((uint64_t) p[3])), > + .size = p[4], > + }; > > switch (operation) { > case AUB_TRACE_OP_DATA_WRITE: > - if (address_space != AUB_TRACE_MEMTYPE_GTT) > - break; > - if (gtt_size < offset + size) { > - fprintf(stderr, "overflow gtt space: %s\n", strerror(errno)); > - exit(EXIT_FAILURE); > - } > - memcpy((char *) gtt + offset, data, size); > - if (gtt_end < offset + size) > - gtt_end = offset + size; > + if (address_space == AUB_TRACE_MEMTYPE_GTT) > + add_gtt_bo_map(bo); >break; > case AUB_TRACE_OP_COMMAND_WRITE: >switch (type) { > @@ -135,27 +160,13 @@ handle_trace_block(uint32_t *p) >} > >(void)engine; /* TODO */ > - gen_print_batch(&batch_ctx, data, size, 0); > + gen_print_batch(&batch_ctx, bo.map, bo.size, 0); > > - gtt_end = 0; > + clear_bo_maps(); >break; > } > } > > -static struct gen_batch_decode_bo > -get_gen_batch_bo(void *user_data, uint64_t address) > -{ > - if (address > gtt_end) > - return (struct gen_batch_decode_bo) { .map = NULL }; > - > - /* We really only have one giant address range */ > - return (struct gen_batch_decode_bo) { > - .addr = 0, > - .map = gtt, > - .size = gtt_size > - }; > -} > - > static void > aubinator_init(uint16_t aub_pci_id, const char *app_name) > { > @@ -305,34 +316,44 @@ handle_memtrace_reg_write(uint32_t *p) > } > > const uint32_t pphwsp_size = 4096; > - uint32_t *context = (uint32_t*)(gtt + (context_descriptor & 0xf000) + > pphwsp_size); > + uint32_t pphw
[Mesa-dev] [PATCH v2 04/16] intel: aubinator: drop the 1Tb GTT mapping
Now that we're softpinning the address of our BOs in anv & i965, the addresses selected start at the top of the addressing space. This is a problem for the current implementation of aubinator which uses only a 40bit mmapped address space. This change keeps track of all the memory writes from the aub file and fetch them on request by the batch decoder. As a result we can get rid of the 1<<40 mmapped address space and only rely on the mmap aub file \o/ Signed-off-by: Lionel Landwerlin --- src/intel/tools/aubinator.c | 130 1 file changed, 72 insertions(+), 58 deletions(-) diff --git a/src/intel/tools/aubinator.c b/src/intel/tools/aubinator.c index 0438f96cd1b..f70038376be 100644 --- a/src/intel/tools/aubinator.c +++ b/src/intel/tools/aubinator.c @@ -37,10 +37,12 @@ #include #include +#include "util/list.h" #include "util/macros.h" #include "common/gen_decoder.h" #include "common/gen_disasm.h" +#include "common/gen_gem.h" #include "intel_aub.h" /* Below is the only command missing from intel_aub.h in libdrm @@ -68,8 +70,12 @@ char *input_file = NULL, *xml_path = NULL; struct gen_device_info devinfo; struct gen_batch_decode_ctx batch_ctx; -uint64_t gtt_size, gtt_end; -void *gtt; +struct bo_map { + struct list_head link; + struct gen_batch_decode_bo bo; +}; + +static struct list_head maps; FILE *outfile; @@ -85,10 +91,32 @@ field(uint32_t value, int start, int end) struct brw_instruction; -static inline int -valid_offset(uint32_t offset) +static void +add_gtt_bo_map(struct gen_batch_decode_bo bo) { - return offset < gtt_end; + struct bo_map *m = calloc(1, sizeof(*m)); + + m->bo = bo; + list_add(&m->link, &maps); +} + +static void +clear_bo_maps(void) +{ + list_for_each_entry_safe(struct bo_map, i, &maps, link) { + list_del(&i->link); + free(i); + } +} + +static struct gen_batch_decode_bo +get_gen_batch_bo(void *user_data, uint64_t address) +{ + list_for_each_entry(struct bo_map, i, &maps, link) + if (i->bo.addr <= address && i->bo.addr + i->bo.size > address) + return i->bo; + + return (struct gen_batch_decode_bo) { .map = NULL }; } #define GEN_ENGINE_RENDER 1 @@ -100,26 +128,23 @@ handle_trace_block(uint32_t *p) int operation = p[1] & AUB_TRACE_OPERATION_MASK; int type = p[1] & AUB_TRACE_TYPE_MASK; int address_space = p[1] & AUB_TRACE_ADDRESS_SPACE_MASK; - uint64_t offset = p[3]; - uint32_t size = p[4]; int header_length = p[0] & 0x; - uint32_t *data = p + header_length + 2; int engine = GEN_ENGINE_RENDER; - - if (devinfo.gen >= 8) - offset += (uint64_t) p[5] << 32; + struct gen_batch_decode_bo bo = { + .map = p + header_length + 2, + /* Addresses written by aubdump here are in canonical form but the batch + * decoder always gives us addresses with the top 16bits zeroed, so do + * the same here. + */ + .addr = gen_48b_address((devinfo.gen >= 8 ? ((uint64_t) p[5] << 32) : 0) | + ((uint64_t) p[3])), + .size = p[4], + }; switch (operation) { case AUB_TRACE_OP_DATA_WRITE: - if (address_space != AUB_TRACE_MEMTYPE_GTT) - break; - if (gtt_size < offset + size) { - fprintf(stderr, "overflow gtt space: %s\n", strerror(errno)); - exit(EXIT_FAILURE); - } - memcpy((char *) gtt + offset, data, size); - if (gtt_end < offset + size) - gtt_end = offset + size; + if (address_space == AUB_TRACE_MEMTYPE_GTT) + add_gtt_bo_map(bo); break; case AUB_TRACE_OP_COMMAND_WRITE: switch (type) { @@ -135,27 +160,13 @@ handle_trace_block(uint32_t *p) } (void)engine; /* TODO */ - gen_print_batch(&batch_ctx, data, size, 0); + gen_print_batch(&batch_ctx, bo.map, bo.size, 0); - gtt_end = 0; + clear_bo_maps(); break; } } -static struct gen_batch_decode_bo -get_gen_batch_bo(void *user_data, uint64_t address) -{ - if (address > gtt_end) - return (struct gen_batch_decode_bo) { .map = NULL }; - - /* We really only have one giant address range */ - return (struct gen_batch_decode_bo) { - .addr = 0, - .map = gtt, - .size = gtt_size - }; -} - static void aubinator_init(uint16_t aub_pci_id, const char *app_name) { @@ -305,34 +316,44 @@ handle_memtrace_reg_write(uint32_t *p) } const uint32_t pphwsp_size = 4096; - uint32_t *context = (uint32_t*)(gtt + (context_descriptor & 0xf000) + pphwsp_size); + uint32_t pphwsp_addr = context_descriptor & 0xf000; + struct gen_batch_decode_bo pphwsp_bo = get_gen_batch_bo(NULL, pphwsp_addr); + uint32_t *context = (uint32_t *)((uint8_t *)pphwsp_bo.map + +(pphwsp_bo.addr - pphwsp_addr) + +pphwsp_size); + uint32_t ring_buffer_head = context[5]; uint32_t ring_buffer_tail = context[7]; uint32_t ring_buffer_start = co