Hi!

I did some testing of how fast my system can move data to VRAM/GTT and
I got very interesting results:

(II) RADEON(0): BENCH: copy 3129344 bytes to vram took 78595us,
resulting in 39Mbps
(II) RADEON(0): BENCH: copy 3129344 bytes to gtt took 11411us,
resulting in 274Mbps
(II) RADEON(0): BENCH: copy 3129344 bytes to gtt took 8431us,
resulting in 371Mbps
(II) RADEON(0): BENCH: copy 3129344 bytes to vram took 75773us,
resulting in 41Mbps
(II) RADEON(0): BENCH: copy 3129344 gtt to vram took 3143us, resulting
in 995Mbps


So direct writes to VRAM run at only about 40 megabytes per second.
That is insanely slow. I hope we won't hit that kind of limit anywhere
in the code.

I did check that VRAM is marked write-combining (WC) in /proc/mtrr, but
it is still surprisingly slow.

But the most insane result is that the CPU can write to GTT at no more
than 371 MB/s (printed as "Mbps" in the log above), while the GPU can
copy GTT to VRAM at 995 MB/s. What makes this even stranger is that I
was nearly sure my memory can't operate that fast, yet when the code
that checks the VRAM contents runs, everything is correctly in VRAM!
What did the GPU/AGP do to cheat that much? Is there some error in my
test case?

System is:
Athlon mobility XP at 2.1Ghz
333MHz ddr memory
AGP 8x bus to mobility radeon 9200.

Of course the most important part of this message is the attachments.
The diff file contains the benchmark code. The other attachment is the
shell script that I used to run the test. It sets my CPU to performance
mode to make sure that CPU frequency changes won't affect the results.

Pauli
diff --git a/src/radeon_kms.c b/src/radeon_kms.c
index e9e5b5d..65bab82 100644
--- a/src/radeon_kms.c
+++ b/src/radeon_kms.c
@@ -594,6 +594,170 @@ void RADEONFreeScreen_KMS(int scrnIndex, int flags)
     RADEONFreeRec(pScrn);
 }
 
+#include <sys/time.h>
+#include "radeon_macros.h"
+
+
+#define BEGIN_ACCEL(n)		BEGIN_RING(2*(n)) /* open a ring section sized at 2 units per register write (presumably register + value) */
+#define OUT_ACCEL_REG(reg, val) OUT_RING_REG(reg, val) /* emit one register write */
+#define FINISH_ACCEL()          ADVANCE_RING() /* close the section opened by BEGIN_ACCEL */
+
+#define OUT_RELOC(x, read, write) OUT_RING_RELOC(x, read, write) /* emit a relocation for bo x with the given read/write domains */
+/* Declares RING_LOCALS and invokes RADEONCP_REFRESH; relies on pScrn and info being in the caller's scope. */
+#define ACCEL_PREAMBLE()						\
+	RING_LOCALS; \
+	RADEONCP_REFRESH(pScrn, info)
+
+typedef struct radeon_bo bo; /* short alias used by the benchmark functions below */
+/* Copy a 1024x768 ARGB8888 rectangle from src (GTT) to dst (VRAM) with a 2D engine blit, then wait for it. */
+static void benchmark_accel(ScrnInfoPtr pScrn, bo *dst, bo *src)
+{
+	RADEONInfoPtr info = RADEONPTR(pScrn);
+	int r;
+	ACCEL_PREAMBLE();
+	radeon_cs_space_reset_bos(info->cs);
+	radeon_cs_space_add_persistent_bo(info->cs, src, RADEON_GEM_DOMAIN_GTT, 0);
+	radeon_cs_space_add_persistent_bo(info->cs, dst, RADEON_GEM_DOMAIN_VRAM, 0);
+	r = radeon_cs_space_check(info->cs);
+	/* NOTE(review): a failed space check is only logged; the blit below is still submitted. */
+	if (r)
+		xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
+				"BENCH: space check failed! BAD!\n");
+	/* NOTE(review): BEGIN_ACCEL_RELOC is not defined in this patch; presumably provided elsewhere - confirm. */
+	BEGIN_ACCEL_RELOC(6, 2); /* 6 register writes and 2 relocations follow */
+	OUT_ACCEL_REG(RADEON_DP_GUI_MASTER_CNTL,
+			RADEON_GMC_DST_PITCH_OFFSET_CNTL |
+			RADEON_GMC_SRC_PITCH_OFFSET_CNTL |
+			RADEON_GMC_BRUSH_NONE |
+			(ATI_DATATYPE_ARGB8888 << 8) |
+			RADEON_GMC_SRC_DATATYPE_COLOR |
+			RADEON_ROP3_S |
+			RADEON_DP_SRC_SOURCE_MEMORY |
+			RADEON_GMC_CLR_CMP_CNTL_DIS |
+			RADEON_GMC_WR_MSK_DIS);
+	OUT_ACCEL_REG(RADEON_SRC_PITCH_OFFSET, (1024 >> 4) << 22); /* only the field at bit 22 is set; offset presumably patched via the reloc below */
+	OUT_RELOC(src, RADEON_GEM_DOMAIN_GTT, 0);
+
+	OUT_ACCEL_REG(RADEON_DST_PITCH_OFFSET, (1024 >> 4) << 22); /* same encoding as the source, for the destination */
+	OUT_RELOC(dst, 0, RADEON_GEM_DOMAIN_VRAM);
+
+	OUT_ACCEL_REG(RADEON_SRC_Y_X, (0 << 16) | 0);
+	OUT_ACCEL_REG(RADEON_DST_Y_X, (0 << 16) | 0);
+	OUT_ACCEL_REG(RADEON_DST_HEIGHT_WIDTH, (768 << 16) | 1024); /* height 768, width 1024 */
+	FINISH_ACCEL();
+	BEGIN_ACCEL(2);
+	OUT_ACCEL_REG(RADEON_DSTCACHE_CTLSTAT, RADEON_RB2D_DC_FLUSH_ALL);
+	OUT_ACCEL_REG(RADEON_WAIT_UNTIL,
+			RADEON_WAIT_2D_IDLECLEAN | RADEON_WAIT_DMA_GUI_IDLE);
+
+	FINISH_ACCEL();
+
+	FLUSH_RING();
+
+	/* Wait for the copy to finish: mapping dst is expected to block until the GPU is done with it (TODO confirm) */
+	radeon_bo_map(dst, 0);
+	radeon_bo_unmap(dst);
+
+}
+/* Time CPU memmove() into VRAM and GTT mappings, then a GPU GTT->VRAM blit, logging each result via xf86DrvMsg. */
+static void benchmark(ScrnInfoPtr pScrn)
+{
+	RADEONInfoPtr info = RADEONPTR(pScrn);
+	const long long size = 1024*764*4; /* NOTE(review): the blit copies 1024x768 pixels (1024*768*4 bytes); 764 looks like a typo and leaves the buffers 16 KiB short - confirm */
+	int i;
+	typedef struct timeval t;
+	t start;
+	t end;
+	long long total;
+	bo *vram = radeon_bo_open(info->bufmgr, 0, size, 0,
+			RADEON_GEM_DOMAIN_VRAM, 0);
+	bo *gtt = radeon_bo_open(info->bufmgr, 0, size, 0,
+			RADEON_GEM_DOMAIN_GTT, 0);
+	radeon_bo_map(vram, 0); /* NOTE(review): results of radeon_bo_open() and radeon_bo_map() are not checked */
+	radeon_bo_map(gtt, 0);
+	uint32_t *data = malloc(size); /* NOTE(review): not checked for NULL and never freed in this function */
+	/* Generate test data */
+	for (i = 0; i < size/sizeof(data[0]); ++i)
+		data[i] = rand();
+	/* Pass 1: CPU write into the VRAM mapping. */
+	gettimeofday(&start, 0);
+	memmove(vram->ptr, data, size);
+	gettimeofday(&end,0);
+
+	total = (end.tv_sec - start.tv_sec)*1000*1000
+		+ (end.tv_usec - start.tv_usec);
+	/* size/total is bytes per microsecond, i.e. MB/s (the message says "Mbps"); total == 0 would divide by zero. */
+	xf86DrvMsg(pScrn->scrnIndex, X_INFO,
+			"BENCH: copy %lld bytes to vram took %lldus, resulting in %lldMbps\n",
+			size, total, (size)/total);
+	/* Pass 2: CPU write into the GTT mapping. */
+	gettimeofday(&start, 0);
+	memmove(gtt->ptr, data, size);
+	gettimeofday(&end, 0);
+
+	total = (end.tv_sec - start.tv_sec)*1000*1000
+		+ (end.tv_usec - start.tv_usec);
+
+	xf86DrvMsg(pScrn->scrnIndex, X_INFO,
+			"BENCH: copy %lld bytes to gtt took %lldus, resulting in %lldMbps\n",
+			size, total, (size)/total);
+	/* Pass 3: CPU write into the GTT mapping again. */
+	gettimeofday(&start, 0);
+	memmove(gtt->ptr, data, size);
+	gettimeofday(&end, 0);
+
+	total = (end.tv_sec - start.tv_sec)*1000*1000
+		+ (end.tv_usec - start.tv_usec);
+
+	xf86DrvMsg(pScrn->scrnIndex, X_INFO,
+			"BENCH: copy %lld bytes to gtt took %lldus, resulting in %lldMbps\n",
+			size, total, (size)/total);
+
+	/* Pass 4: CPU write into the VRAM mapping again. */
+	gettimeofday(&start, 0);
+	memmove(vram->ptr, data, size);
+	gettimeofday(&end,0);
+
+	total = (end.tv_sec - start.tv_sec)*1000*1000
+		+ (end.tv_usec - start.tv_usec);
+
+	xf86DrvMsg(pScrn->scrnIndex, X_INFO,
+			"BENCH: copy %lld bytes to vram took %lldus, resulting in %lldMbps\n",
+			size, total, (size)/total);
+	/* Pass 5: load fresh random data into gtt so the GPU copy can be verified against data afterwards. */
+	/* Generate test data */
+	for (i = 0; i < size/sizeof(data[0]); ++i)
+		data[i] = rand();
+	memmove(gtt->ptr, data, size);
+	radeon_bo_unmap(vram);
+	radeon_bo_unmap(gtt);
+	/* The timed region covers everything benchmark_accel() does, including its flush and map/unmap wait. */
+	gettimeofday(&start, 0);
+	benchmark_accel(pScrn, vram, gtt);
+	gettimeofday(&end, 0);
+	radeon_bo_map(vram, 0);
+	/* Verify the blit result from the last word down; only size/4 words are compared. */
+	for (i = size/4 - 1; i >= 0; i--) {
+		int32_t *ptr = vram->ptr;
+		if ( data[i] != ptr[i] ) {
+		xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
+				"BENCH: Copy error\n");
+		break;
+		}
+	}
+
+	total = (end.tv_sec - start.tv_sec)*1000*1000
+		+ (end.tv_usec - start.tv_usec);
+
+	xf86DrvMsg(pScrn->scrnIndex, X_INFO,
+			"BENCH: copy %lld gtt to vram took %lldus, resulting in %lldMbps\n",
+			size, total, (size)/total);
+
+	radeon_bo_unmap(vram);
+	radeon_bo_unref(vram);
+	radeon_bo_unref(gtt);
+}
+
 Bool RADEONScreenInit_KMS(int scrnIndex, ScreenPtr pScreen,
 			  int argc, char **argv)
 {
@@ -818,6 +982,8 @@ Bool RADEONScreenInit_KMS(int scrnIndex, ScreenPtr pScreen,
     info->accel_state->XInited3D = FALSE;
     info->accel_state->engineMode = EXA_ENGINEMODE_UNKNOWN;
 
+    benchmark(pScrn);
+
     return TRUE;
 }
 

Attachment: run_test.sh
Description: Bourne shell script

_______________________________________________
xorg-driver-ati mailing list
[email protected]
http://lists.x.org/mailman/listinfo/xorg-driver-ati

Reply via email to