The following patch implements a fairly light set of timing statements aimed at understanding external sort performance. There is no attempt to alter the algorithms.
Each major point in the algorithms is marked as shown in this example: postgres=# set debug_sort=true; SET postgres=# explain analyze select * from test2 order by col1,col2; NOTICE: tuplesort begin work_mem= 1024 NOTICE: +0 secs heap sort nkeys= 2 NOTICE: +0 secs switching to external sort NOTICE: +1129 secs starting build of next run NOTICE: +2229 secs run building complete nruns= 2 NOTICE: +2229 secs merging runs with 6 tapes ... NOTICE: +3036 secs starting final merge I'll add other information, as requested. The "6 tapes" is currently hardcoded, though is included in expectation of implementing variable numbers of tapes. I'm not sure if I got the header file correct for full portability of gettimeofday(). Please correct me, if this is the case. Please post sort performance data back via this post. Best Regards, Simon Riggs
Index: src/backend/utils/misc/guc.c =================================================================== RCS file: /projects/cvsroot/pgsql/src/backend/utils/misc/guc.c,v retrieving revision 1.289 diff -c -c -r1.289 guc.c *** src/backend/utils/misc/guc.c 24 Sep 2005 22:54:39 -0000 1.289 --- src/backend/utils/misc/guc.c 2 Oct 2005 18:00:45 -0000 *************** *** 92,97 **** --- 92,98 ---- extern int CommitSiblings; extern char *default_tablespace; extern bool fullPageWrites; + extern bool debug_sort; static const char *assign_log_destination(const char *value, bool doit, GucSource source); *************** *** 764,770 **** false, NULL, NULL }, #endif ! { {"log_hostname", PGC_SIGHUP, LOGGING_WHAT, gettext_noop("Logs the host name in the connection logs."), --- 765,779 ---- false, NULL, NULL }, #endif ! { ! {"debug_sort", PGC_USERSET, DEVELOPER_OPTIONS, ! gettext_noop("no description available"), ! NULL, ! GUC_NOT_IN_SAMPLE ! }, ! &debug_sort, ! false, NULL, NULL ! }, { {"log_hostname", PGC_SIGHUP, LOGGING_WHAT, gettext_noop("Logs the host name in the connection logs."), Index: src/backend/utils/sort/tuplesort.c =================================================================== RCS file: /projects/cvsroot/pgsql/src/backend/utils/sort/tuplesort.c,v retrieving revision 1.50 diff -c -c -r1.50 tuplesort.c *** src/backend/utils/sort/tuplesort.c 23 Sep 2005 15:36:57 -0000 1.50 --- src/backend/utils/sort/tuplesort.c 2 Oct 2005 18:00:46 -0000 *************** *** 394,399 **** --- 394,402 ---- */ static Tuplesortstate *qsort_tuplesortstate; + /* debug variables */ + bool debug_sort = false; + struct timeval start_t, current_t; /* * tuplesort_begin_xxx *************** *** 422,427 **** --- 425,437 ---- state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate)); + if (debug_sort) + { + ereport(NOTICE, + (errmsg("tuplesort begin work_mem= %u", workMem))); + gettimeofday(&start_t, NULL); + } + state->status = TSS_INITIAL; state->randomAccess = randomAccess; state->availMem = workMem * 1024L; *************** *** 454,459 **** --- 464,473 ---- Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); int i; + if (debug_sort) + ereport(NOTICE, + (errmsg("+0 secs heap sort nkeys= %u", nkeys))); + AssertArg(nkeys > 0); state->comparetup = comparetup_heap; *************** *** 499,504 **** --- 513,522 ---- { Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); + if (debug_sort) + ereport(NOTICE, + (errmsg("+0 secs index sort"))); + state->comparetup = comparetup_index; state->copytup = copytup_index; state->writetup = writetup_index; *************** *** 666,671 **** --- 684,697 ---- /* * Nope; time to switch to tape-based operation. */ + if (debug_sort) + { + gettimeofday(¤t_t, NULL); + ereport(NOTICE, + (errmsg("+%ld secs switching to external sort", + (long) (current_t.tv_sec - start_t.tv_sec), + state->currentRun))); + } inittapes(state); /* *************** *** 1042,1047 **** --- 1068,1082 ---- int j; int a; + if (debug_sort) + { + gettimeofday(¤t_t, NULL); + ereport(NOTICE, + (errmsg("+%ld secs starting build of next run", + (long) (current_t.tv_sec - start_t.tv_sec), + state->currentRun))); + } + /* Step D3: advance j (destTape) */ if (state->tp_dummy[state->destTape] < state->tp_dummy[state->destTape + 1]) { *************** *** 1082,1087 **** --- 1117,1131 ---- Assert(state->status == TSS_BUILDRUNS); Assert(state->memtupcount == 0); + if (debug_sort) + { + gettimeofday(¤t_t, NULL); + ereport(NOTICE, + (errmsg("+%ld secs run building complete nruns= %u", + (long) (current_t.tv_sec - start_t.tv_sec), + state->currentRun))); + } + /* * If we produced only one initial run (quite likely if the total data * volume is between 1X and 2X workMem), we can just use that tape as *************** *** 1093,1100 **** --- 1137,1158 ---- /* must freeze and rewind the finished output tape */ LogicalTapeFreeze(state->tapeset, state->result_tape); state->status = TSS_SORTEDONTAPE; + if (debug_sort) + { + gettimeofday(¤t_t, NULL); + ereport(NOTICE, + (errmsg("+%ld secs no final merge required", + (long) (current_t.tv_sec - start_t.tv_sec)))); + } return; } + if (debug_sort) + { + gettimeofday(¤t_t, NULL); + ereport(NOTICE, + (errmsg("+%ld secs merging runs with 6 tapes", + (long) (current_t.tv_sec - start_t.tv_sec)))); + } /* End of step D2: rewind all output tapes to prepare for merging */ for (tapenum = 0; tapenum < TAPERANGE; tapenum++) *************** *** 1126,1131 **** --- 1184,1196 ---- /* Initialize for the final merge pass */ beginmerge(state); state->status = TSS_FINALMERGE; + if (debug_sort) + { + gettimeofday(¤t_t, NULL); + ereport(NOTICE, + (errmsg("+%ld secs starting final merge", + (long) (current_t.tv_sec - start_t.tv_sec)))); + } return; } if (allDummy) *************** *** 1177,1182 **** --- 1242,1254 ---- state->result_tape = state->tp_tapenum[TAPERANGE]; LogicalTapeFreeze(state->tapeset, state->result_tape); state->status = TSS_SORTEDONTAPE; + if (debug_sort) + { + gettimeofday(¤t_t, NULL); + ereport(NOTICE, + (errmsg("+%ld secs starting final merge", + (long) (current_t.tv_sec - start_t.tv_sec)))); + } } /* *************** *** 1243,1248 **** --- 1315,1327 ---- */ markrunend(state, destTape); state->tp_runs[TAPERANGE]++; + if (debug_sort) + { + gettimeofday(¤t_t, NULL); + ereport(NOTICE, + (errmsg("+%ld secs intermediate merge", + (long) (current_t.tv_sec - start_t.tv_sec)))); + } } /*
---------------------------(end of broadcast)--------------------------- TIP 1: if posting/reading through Usenet, please send an appropriate subscribe-nomail command to [EMAIL PROTECTED] so that your message can get through to the mailing list cleanly