[HACKERS] pg_upgrade with parallel tablespace copying

Bruce Momjian Mon, 07 Jan 2013 19:51:54 -0800

Pg_upgrade by default (without --link) copies heap/index files from the
old to new cluster.  This patch implements parallel heap/index file
copying in pg_upgrade using the --jobs option.  It uses the same
infrastructure used for pg_upgrade parallel dump/restore.  Here are the
performance results:


                   --- seconds ---
           GB        git    patched
            2       62.09    63.75
            4       95.93   107.22
            8      194.96   195.29
           16      494.38   348.93
           32      983.28   644.23
           64     2227.73  1244.08
          128     4735.83  2547.09

Because of the kernel cache, you only see a big win when the amount of
data copied exceeds the size of the kernel cache.  For testing, I used a
16-core machine with 24GB of RAM and two magnetic disks, with one
tablespace on each disk.  Using more tablespaces would yield larger
improvements.  My test script is attached.

I consider this patch ready for application.  This is the last
pg_upgrade performance improvement idea I am considering.

-- 
  Bruce Momjian  <[email protected]>        http://momjian.us
  EnterpriseDB                             http://enterprisedb.com

  + It's impossible for everything to be true. +

diff --git a/contrib/pg_upgrade/check.c b/contrib/pg_upgrade/check.c
new file mode 100644
index 59f8fd0..1780788
*** a/contrib/pg_upgrade/check.c
--- b/contrib/pg_upgrade/check.c
*************** create_script_for_old_cluster_deletion(c
*** 606,612 ****
  	fprintf(script, RMDIR_CMD " %s\n", fix_path_separator(old_cluster.pgdata));
  
  	/* delete old cluster's alternate tablespaces */
! 	for (tblnum = 0; tblnum < os_info.num_tablespaces; tblnum++)
  	{
  		/*
  		 * Do the old cluster's per-database directories share a directory
--- 606,612 ----
  	fprintf(script, RMDIR_CMD " %s\n", fix_path_separator(old_cluster.pgdata));
  
  	/* delete old cluster's alternate tablespaces */
! 	for (tblnum = 0; tblnum < os_info.num_old_tablespaces; tblnum++)
  	{
  		/*
  		 * Do the old cluster's per-database directories share a directory
*************** create_script_for_old_cluster_deletion(c
*** 621,634 ****
  			/* remove PG_VERSION? */
  			if (GET_MAJOR_VERSION(old_cluster.major_version) <= 804)
  				fprintf(script, RM_CMD " %s%s%cPG_VERSION\n",
! 						fix_path_separator(os_info.tablespaces[tblnum]), 
  						fix_path_separator(old_cluster.tablespace_suffix),
  						PATH_SEPARATOR);
  
  			for (dbnum = 0; dbnum < old_cluster.dbarr.ndbs; dbnum++)
  			{
  				fprintf(script, RMDIR_CMD " %s%s%c%d\n",
! 						fix_path_separator(os_info.tablespaces[tblnum]),
  						fix_path_separator(old_cluster.tablespace_suffix),
  						PATH_SEPARATOR, old_cluster.dbarr.dbs[dbnum].db_oid);
  			}
--- 621,634 ----
  			/* remove PG_VERSION? */
  			if (GET_MAJOR_VERSION(old_cluster.major_version) <= 804)
  				fprintf(script, RM_CMD " %s%s%cPG_VERSION\n",
! 						fix_path_separator(os_info.old_tablespaces[tblnum]), 
  						fix_path_separator(old_cluster.tablespace_suffix),
  						PATH_SEPARATOR);
  
  			for (dbnum = 0; dbnum < old_cluster.dbarr.ndbs; dbnum++)
  			{
  				fprintf(script, RMDIR_CMD " %s%s%c%d\n",
! 						fix_path_separator(os_info.old_tablespaces[tblnum]),
  						fix_path_separator(old_cluster.tablespace_suffix),
  						PATH_SEPARATOR, old_cluster.dbarr.dbs[dbnum].db_oid);
  			}
*************** create_script_for_old_cluster_deletion(c
*** 640,646 ****
  			 * or a version-specific subdirectory.
  			 */
  			fprintf(script, RMDIR_CMD " %s%s\n",
! 					fix_path_separator(os_info.tablespaces[tblnum]), 
  					fix_path_separator(old_cluster.tablespace_suffix));
  	}
  
--- 640,646 ----
  			 * or a version-specific subdirectory.
  			 */
  			fprintf(script, RMDIR_CMD " %s%s\n",
! 					fix_path_separator(os_info.old_tablespaces[tblnum]), 
  					fix_path_separator(old_cluster.tablespace_suffix));
  	}
  
diff --git a/contrib/pg_upgrade/info.c b/contrib/pg_upgrade/info.c
new file mode 100644
index 0c11ff8..7fd4584
*** a/contrib/pg_upgrade/info.c
--- b/contrib/pg_upgrade/info.c
*************** create_rel_filename_map(const char *old_
*** 106,125 ****
  		 * relation belongs to the default tablespace, hence relfiles should
  		 * exist in the data directories.
  		 */
! 		snprintf(map->old_dir, sizeof(map->old_dir), "%s/base/%u", old_data,
! 				 old_db->db_oid);
! 		snprintf(map->new_dir, sizeof(map->new_dir), "%s/base/%u", new_data,
! 				 new_db->db_oid);
  	}
  	else
  	{
  		/* relation belongs to a tablespace, so use the tablespace location */
! 		snprintf(map->old_dir, sizeof(map->old_dir), "%s%s/%u", old_rel->tablespace,
! 				 old_cluster.tablespace_suffix, old_db->db_oid);
! 		snprintf(map->new_dir, sizeof(map->new_dir), "%s%s/%u", new_rel->tablespace,
! 				 new_cluster.tablespace_suffix, new_db->db_oid);
  	}
  
  	/*
  	 * old_relfilenode might differ from pg_class.oid (and hence
  	 * new_relfilenode) because of CLUSTER, REINDEX, or VACUUM FULL.
--- 106,130 ----
  		 * relation belongs to the default tablespace, hence relfiles should
  		 * exist in the data directories.
  		 */
! 		strlcpy(map->old_tablespace, old_data, sizeof(map->old_tablespace));
! 		strlcpy(map->new_tablespace, new_data, sizeof(map->new_tablespace));
! 		strlcpy(map->old_tablespace_suffix, "/base", sizeof(map->old_tablespace_suffix));
! 		strlcpy(map->new_tablespace_suffix, "/base", sizeof(map->new_tablespace_suffix));
  	}
  	else
  	{
  		/* relation belongs to a tablespace, so use the tablespace location */
! 		strlcpy(map->old_tablespace, old_rel->tablespace, sizeof(map->old_tablespace));
! 		strlcpy(map->new_tablespace, new_rel->tablespace, sizeof(map->new_tablespace));
! 		strlcpy(map->old_tablespace_suffix, old_cluster.tablespace_suffix,
! 				sizeof(map->old_tablespace_suffix));
! 		strlcpy(map->new_tablespace_suffix, new_cluster.tablespace_suffix,
! 				sizeof(map->new_tablespace_suffix));
  	}
  
+ 	map->old_db_oid = old_db->db_oid;
+ 	map->new_db_oid = new_db->db_oid;
+ 
  	/*
  	 * old_relfilenode might differ from pg_class.oid (and hence
  	 * new_relfilenode) because of CLUSTER, REINDEX, or VACUUM FULL.
diff --git a/contrib/pg_upgrade/parallel.c b/contrib/pg_upgrade/parallel.c
new file mode 100644
index 8ea36bc..d157511
*** a/contrib/pg_upgrade/parallel.c
--- b/contrib/pg_upgrade/parallel.c
*************** typedef struct {
*** 34,44 ****
  	char log_file[MAXPGPATH];
  	char opt_log_file[MAXPGPATH];
  	char cmd[MAX_STRING];
! } thread_arg;
  
! thread_arg **thread_args;
  
! DWORD win32_exec_prog(thread_arg *args);
  
  #endif
  
--- 34,57 ----
  	char log_file[MAXPGPATH];
  	char opt_log_file[MAXPGPATH];
  	char cmd[MAX_STRING];
! } exec_thread_arg;
  
! typedef struct {	/* arguments handed to a win32_transfer_all_new_dbs() worker thread */
! 	DbInfoArr *old_db_arr;	/* pointer supplied by the caller; not copied */
! 	DbInfoArr *new_db_arr;	/* pointer supplied by the caller; not copied */
! 	char old_pgdata[MAXPGPATH];	/* copy of the caller's old_pgdata string */
! 	char new_pgdata[MAXPGPATH];	/* copy of the caller's new_pgdata string */
! 	char old_tablespace[MAXPGPATH];	/* copy of the tablespace path this worker handles */
! } transfer_thread_arg;
  
! exec_thread_arg **exec_thread_args;
! transfer_thread_arg **transfer_thread_args;
! 
! /* track current thread_args struct so reap_child() can be used for all cases */
! void **cur_thread_args;
! 
! DWORD win32_exec_prog(exec_thread_arg *args);
! DWORD win32_transfer_all_new_dbs(transfer_thread_arg *args);
  
  #endif
  
*************** parallel_exec_prog(const char *log_file,
*** 58,64 ****
  	pid_t		child;
  #else
  	HANDLE		child;
! 	thread_arg	*new_arg;
  #endif
  
  	va_start(args, fmt);
--- 71,77 ----
  	pid_t		child;
  #else
  	HANDLE		child;
! 	exec_thread_arg	*new_arg;
  #endif
  
  	va_start(args, fmt);
*************** parallel_exec_prog(const char *log_file,
*** 71,77 ****
  	else
  	{
  		/* parallel */
! 
  		/* harvest any dead children */
  		while (reap_child(false) == true)
  			;
--- 84,92 ----
  	else
  	{
  		/* parallel */
! #ifdef WIN32
! 		cur_thread_args = (void **)exec_thread_args;
! #endif	
  		/* harvest any dead children */
  		while (reap_child(false) == true)
  			;
*************** parallel_exec_prog(const char *log_file,
*** 100,106 ****
  			int i;
  
  			thread_handles = pg_malloc(user_opts.jobs * sizeof(HANDLE));
! 			thread_args = pg_malloc(user_opts.jobs * sizeof(thread_arg *));
  
  			/*
  			 *	For safety and performance, we keep the args allocated during
--- 115,121 ----
  			int i;
  
  			thread_handles = pg_malloc(user_opts.jobs * sizeof(HANDLE));
! 			exec_thread_args = pg_malloc(user_opts.jobs * sizeof(exec_thread_arg *));
  
  			/*
  			 *	For safety and performance, we keep the args allocated during
*************** parallel_exec_prog(const char *log_file,
*** 108,118 ****
  			 *	in a thread different from the one that allocated it.
  			 */
  			for (i = 0; i < user_opts.jobs; i++)
! 				thread_args[i] = pg_malloc(sizeof(thread_arg));
  		}
  
  		/* use first empty array element */
! 		new_arg = thread_args[parallel_jobs-1];
  
  		/* Can only pass one pointer into the function, so use a struct */
  		strcpy(new_arg->log_file, log_file);
--- 123,133 ----
  			 *	in a thread different from the one that allocated it.
  			 */
  			for (i = 0; i < user_opts.jobs; i++)
! 				exec_thread_args[i] = pg_malloc(sizeof(exec_thread_arg));
  		}
  
  		/* use first empty array element */
! 		new_arg = exec_thread_args[parallel_jobs-1];
  
  		/* Can only pass one pointer into the function, so use a struct */
  		strcpy(new_arg->log_file, log_file);
*************** parallel_exec_prog(const char *log_file,
*** 134,140 ****
  
  #ifdef WIN32
  DWORD
! win32_exec_prog(thread_arg *args)
  {
  	int ret;
  
--- 149,155 ----
  
  #ifdef WIN32
  DWORD
! win32_exec_prog(exec_thread_arg *args)
  {
  	int ret;
  
*************** win32_exec_prog(thread_arg *args)
*** 147,152 ****
--- 162,273 ----
  
  
  /*
+  *	parallel_transfer_all_new_dbs
+  *
+  *	Same API as transfer_all_new_dbs(); when --jobs > 1 the transfer for
+  *	old_tablespace runs in a forked child, or in a worker thread on WIN32.
+  */
+ void parallel_transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr,
+ 								   char *old_pgdata, char *new_pgdata,
+ 								   char *old_tablespace)
+ {
+ #ifndef WIN32
+ 	pid_t		child;
+ #else
+ 	HANDLE		child;
+ 	transfer_thread_arg	*new_arg;
+ #endif
+ 
+ 	if (user_opts.jobs <= 1)
+ 		/* serial mode: transfer directly in this process, honoring old_tablespace */
+ 		transfer_all_new_dbs(old_db_arr, new_db_arr, old_pgdata, new_pgdata, old_tablespace);
+ 	else
+ 	{
+ 		/* parallel */
+ #ifdef WIN32
+ 		/* parallel_exec_prog() may have created thread_handles; allocate each array once */
+ 		if (thread_handles == NULL)
+ 			thread_handles = pg_malloc(user_opts.jobs * sizeof(HANDLE));
+ 		if (transfer_thread_args == NULL)
+ 		{
+ 			int i;
+ 
+ 			transfer_thread_args = pg_malloc(user_opts.jobs * sizeof(transfer_thread_arg *));
+ 			/*
+ 			 *	For safety and performance, we keep the args allocated during
+ 			 *	the entire life of the process, and we don't free the args
+ 			 *	in a thread different from the one that allocated it.
+ 			 */
+ 			for (i = 0; i < user_opts.jobs; i++)
+ 				transfer_thread_args[i] = pg_malloc(sizeof(transfer_thread_arg));
+ 		}
+ 		cur_thread_args = (void **)transfer_thread_args;
+ #endif
+ 		/* harvest any dead children */
+ 		while (reap_child(false) == true)
+ 			;
+ 
+ 		/* must we wait for a dead child? */
+ 		if (parallel_jobs >= user_opts.jobs)
+ 			reap_child(true);
+ 
+ 		/* set this before we start the job */
+ 		parallel_jobs++;
+ 
+ 		/* Ensure stdio state is quiesced before forking */
+ 		fflush(NULL);
+ 
+ #ifndef WIN32
+ 		child = fork();
+ 		if (child == 0)
+ 		{
+ 			transfer_all_new_dbs(old_db_arr, new_db_arr, old_pgdata, new_pgdata,
+ 								 old_tablespace);
+ 			/* if we take another exit path, it will be non-zero */
+ 			/* use _exit to skip atexit() functions */
+ 			_exit(0);
+ 		}
+ 		else if (child < 0)
+ 			/* fork failed */
+ 			pg_log(PG_FATAL, "could not create worker process: %s\n", strerror(errno));
+ #else
+ 		/* use first empty array element */
+ 		new_arg = transfer_thread_args[parallel_jobs-1];
+ 
+ 		/* Can only pass one pointer into the function, so use a struct */
+ 		new_arg->old_db_arr = old_db_arr;
+ 		new_arg->new_db_arr = new_db_arr;
+ 		strcpy(new_arg->old_pgdata, old_pgdata);
+ 		strcpy(new_arg->new_pgdata, new_pgdata);
+ 		strcpy(new_arg->old_tablespace, old_tablespace);
+ 
+ 		child = (HANDLE) _beginthreadex(NULL, 0, (void *) win32_transfer_all_new_dbs,
+ 						new_arg, 0, NULL);
+ 		if (child == 0)
+ 			pg_log(PG_FATAL, "could not create worker thread: %s\n", strerror(errno));
+ 
+ 		thread_handles[parallel_jobs-1] = child;
+ #endif
+ 	}
+ 
+ 	return;
+ }
+ 
+ 
+ #ifdef WIN32
+ DWORD
+ win32_transfer_all_new_dbs(transfer_thread_arg *args)
+ {
+ 	/* thread body: unpack the single struct argument and run the transfer */
+ 	transfer_all_new_dbs(args->old_db_arr, args->new_db_arr, args->old_pgdata,
+ 						 args->new_pgdata, args->old_tablespace);
+ 	/* terminates thread */
+ 	return 0;
+ }
+ #endif
+ 
+ 
+ /*
   *	collect status from a completed worker child
   */
  bool
*************** reap_child(bool wait_for_child)
*** 195,201 ****
  	/*	Move last slot into dead child's position */
  	if (thread_num != parallel_jobs - 1)
  	{
! 		thread_arg *tmp_args;
  	
  		thread_handles[thread_num] = thread_handles[parallel_jobs - 1];
  
--- 316,322 ----
  	/*	Move last slot into dead child's position */
  	if (thread_num != parallel_jobs - 1)
  	{
! 		void *tmp_args;
  	
  		thread_handles[thread_num] = thread_handles[parallel_jobs - 1];
  
*************** reap_child(bool wait_for_child)
*** 205,213 ****
  		 *	reused by the next created thread.  Instead, the new thread
  		 *	will use the arg struct of the thread that just died.
  		 */
! 		tmp_args = thread_args[thread_num];
! 		thread_args[thread_num] = thread_args[parallel_jobs - 1];
! 		thread_args[parallel_jobs - 1] = tmp_args;
  	}
  #endif
  
--- 326,334 ----
  		 *	reused by the next created thread.  Instead, the new thread
  		 *	will use the arg struct of the thread that just died.
  		 */
! 		tmp_args = cur_thread_args[thread_num];
! 		cur_thread_args[thread_num] = cur_thread_args[parallel_jobs - 1];
! 		cur_thread_args[parallel_jobs - 1] = tmp_args;
  	}
  #endif
  
diff --git a/contrib/pg_upgrade/pg_upgrade.c b/contrib/pg_upgrade/pg_upgrade.c
new file mode 100644
index 70c749d..85997e5
*** a/contrib/pg_upgrade/pg_upgrade.c
--- b/contrib/pg_upgrade/pg_upgrade.c
*************** main(int argc, char **argv)
*** 133,139 ****
  	if (user_opts.transfer_mode == TRANSFER_MODE_LINK)
  		disable_old_cluster();
  
! 	transfer_all_new_dbs(&old_cluster.dbarr, &new_cluster.dbarr,
  						 old_cluster.pgdata, new_cluster.pgdata);
  
  	/*
--- 133,139 ----
  	if (user_opts.transfer_mode == TRANSFER_MODE_LINK)
  		disable_old_cluster();
  
! 	transfer_all_new_tablespaces(&old_cluster.dbarr, &new_cluster.dbarr,
  						 old_cluster.pgdata, new_cluster.pgdata);
  
  	/*
diff --git a/contrib/pg_upgrade/pg_upgrade.h b/contrib/pg_upgrade/pg_upgrade.h
new file mode 100644
index c1a2f53..d5c3fa9
*** a/contrib/pg_upgrade/pg_upgrade.h
--- b/contrib/pg_upgrade/pg_upgrade.h
*************** typedef struct
*** 134,141 ****
   */
  typedef struct
  {
! 	char		old_dir[MAXPGPATH];
! 	char		new_dir[MAXPGPATH];
  
  	/*
  	 * old/new relfilenodes might differ for pg_largeobject(_metadata) indexes
--- 134,145 ----
   */
  typedef struct
  {
! 	char		old_tablespace[MAXPGPATH];
! 	char		new_tablespace[MAXPGPATH];
! 	char		old_tablespace_suffix[MAXPGPATH];
! 	char		new_tablespace_suffix[MAXPGPATH];
! 	Oid			old_db_oid;
! 	Oid			new_db_oid;
  
  	/*
  	 * old/new relfilenodes might differ for pg_largeobject(_metadata) indexes
*************** typedef struct
*** 276,283 ****
  	const char *progname;		/* complete pathname for this program */
  	char	   *exec_path;		/* full path to my executable */
  	char	   *user;			/* username for clusters */
! 	char	  **tablespaces;	/* tablespaces */
! 	int			num_tablespaces;
  	char	  **libraries;		/* loadable libraries */
  	int			num_libraries;
  	ClusterInfo *running_cluster;
--- 280,287 ----
  	const char *progname;		/* complete pathname for this program */
  	char	   *exec_path;		/* full path to my executable */
  	char	   *user;			/* username for clusters */
! 	char	  **old_tablespaces;	/* tablespaces */
! 	int			num_old_tablespaces;
  	char	  **libraries;		/* loadable libraries */
  	int			num_libraries;
  	ClusterInfo *running_cluster;
*************** void		get_sock_dir(ClusterInfo *cluster,
*** 398,406 ****
  /* relfilenode.c */
  
  void		get_pg_database_relfilenode(ClusterInfo *cluster);
! void		transfer_all_new_dbs(DbInfoArr *olddb_arr,
! 				   DbInfoArr *newdb_arr, char *old_pgdata, char *new_pgdata);
! 
  
  /* tablespace.c */
  
--- 402,412 ----
  /* relfilenode.c */
  
  void		get_pg_database_relfilenode(ClusterInfo *cluster);
! void		transfer_all_new_tablespaces(DbInfoArr *old_db_arr,
! 				   DbInfoArr *new_db_arr, char *old_pgdata, char *new_pgdata);
! void		transfer_all_new_dbs(DbInfoArr *old_db_arr,
! 				   DbInfoArr *new_db_arr, char *old_pgdata, char *new_pgdata,
! 				   char *old_tablespace);
  
  /* tablespace.c */
  
*************** void old_8_3_invalidate_bpchar_pattern_o
*** 464,472 ****
  char	   *old_8_3_create_sequence_script(ClusterInfo *cluster);
  
  /* parallel.c */
! void parallel_exec_prog(const char *log_file, const char *opt_log_file,
  		  const char *fmt,...)
  __attribute__((format(PG_PRINTF_ATTRIBUTE, 3, 4)));
! 
! bool reap_child(bool wait_for_child);
  
--- 470,480 ----
  char	   *old_8_3_create_sequence_script(ClusterInfo *cluster);
  
  /* parallel.c */
! void		parallel_exec_prog(const char *log_file, const char *opt_log_file,
  		  const char *fmt,...)
  __attribute__((format(PG_PRINTF_ATTRIBUTE, 3, 4)));
! void		parallel_transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr,
! 										  char *old_pgdata, char *new_pgdata,
! 										  char *old_tablespace);
! bool		reap_child(bool wait_for_child);
  
diff --git a/contrib/pg_upgrade/relfilenode.c b/contrib/pg_upgrade/relfilenode.c
new file mode 100644
index 9d0d5a0..cfdd39c
*** a/contrib/pg_upgrade/relfilenode.c
--- b/contrib/pg_upgrade/relfilenode.c
***************
*** 16,42 ****
  
  
  static void transfer_single_new_db(pageCnvCtx *pageConverter,
! 					   FileNameMap *maps, int size);
  static void transfer_relfile(pageCnvCtx *pageConverter, FileNameMap *map,
  							 const char *suffix);
  
  
  /*
   * transfer_all_new_dbs()
   *
   * Responsible for upgrading all database. invokes routines to generate mappings and then
   * physically link the databases.
   */
  void
! transfer_all_new_dbs(DbInfoArr *old_db_arr,
! 				   DbInfoArr *new_db_arr, char *old_pgdata, char *new_pgdata)
  {
  	int			old_dbnum,
  				new_dbnum;
  
- 	pg_log(PG_REPORT, "%s user relation files\n",
- 	  user_opts.transfer_mode == TRANSFER_MODE_LINK ? "Linking" : "Copying");
- 
  	/* Scan the old cluster databases and transfer their files */
  	for (old_dbnum = new_dbnum = 0;
  		 old_dbnum < old_db_arr->ndbs;
--- 16,81 ----
  
  
  static void transfer_single_new_db(pageCnvCtx *pageConverter,
! 					   FileNameMap *maps, int size, char *old_tablespace);
  static void transfer_relfile(pageCnvCtx *pageConverter, FileNameMap *map,
  							 const char *suffix);
  
  
  /*
+  * transfer_all_new_tablespaces()
+  *
+  * Transfers (copies or links) all user relation files, handing each old
+  * tablespace to its own worker when jobs > 1, then reports completion.
+  */
+ void
+ transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr,
+ 					char *old_pgdata, char *new_pgdata)
+ {
+ 	pg_log(PG_REPORT, "%s user relation files\n",
+ 	  user_opts.transfer_mode == TRANSFER_MODE_LINK ? "Linking" : "Copying");
+ 
+ 	/*
+ 	 *	Transferring files by tablespace is tricky because a single database
+ 	 *	can use multiple tablespaces.  For non-parallel mode, we just pass a
+ 	 *	NULL tablespace path, which matches all tablespaces.  In parallel mode,
+ 	 *	we pass the default tablespace and all user-created tablespaces
+ 	 *	and let those operations happen in parallel.
+ 	 */
+ 	if (user_opts.jobs <= 1)
+ 		parallel_transfer_all_new_dbs(old_db_arr, new_db_arr, old_pgdata,
+ 									  new_pgdata, NULL);
+ 	else
+ 	{
+ 		int tblnum;
+ 		/* default tablespace (old_pgdata) is handled apart from the user-created ones */
+ 		parallel_transfer_all_new_dbs(old_db_arr, new_db_arr, old_pgdata,
+ 									  new_pgdata, old_pgdata);
+ 		for (tblnum = 0; tblnum < os_info.num_old_tablespaces; tblnum++)
+ 			parallel_transfer_all_new_dbs(old_db_arr, new_db_arr, old_pgdata,
+ 								  new_pgdata, os_info.old_tablespaces[tblnum]);
+ 		/* reap all children */
+ 		while (reap_child(true) == true)
+ 			;
+ 	}
+ 
+ 	end_progress_output();
+ 	check_ok();
+ }
+ 
+ 
+ /*
   * transfer_all_new_dbs()
   *
   * Responsible for upgrading all database. invokes routines to generate mappings and then
   * physically link the databases.
   */
  void
! transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr,
! 					char *old_pgdata, char *new_pgdata, char *old_tablespace)
  {
  	int			old_dbnum,
  				new_dbnum;
  
  	/* Scan the old cluster databases and transfer their files */
  	for (old_dbnum = new_dbnum = 0;
  		 old_dbnum < old_db_arr->ndbs;
*************** transfer_all_new_dbs(DbInfoArr *old_db_a
*** 75,89 ****
  #ifdef PAGE_CONVERSION
  			pageConverter = setupPageConverter();
  #endif
! 			transfer_single_new_db(pageConverter, mappings, n_maps);
  
  			pg_free(mappings);
  		}
  	}
  
- 	end_progress_output();
- 	check_ok();
- 
  	return;
  }
  
--- 114,126 ----
  #ifdef PAGE_CONVERSION
  			pageConverter = setupPageConverter();
  #endif
! 			transfer_single_new_db(pageConverter, mappings, n_maps,
! 								   old_tablespace);
  
  			pg_free(mappings);
  		}
  	}
  
  	return;
  }
  
*************** get_pg_database_relfilenode(ClusterInfo
*** 125,131 ****
   */
  static void
  transfer_single_new_db(pageCnvCtx *pageConverter,
! 					   FileNameMap *maps, int size)
  {
  	int			mapnum;
  	bool		vm_crashsafe_match = true;
--- 162,168 ----
   */
  static void
  transfer_single_new_db(pageCnvCtx *pageConverter,
! 					   FileNameMap *maps, int size, char *old_tablespace)
  {
  	int			mapnum;
  	bool		vm_crashsafe_match = true;
*************** transfer_single_new_db(pageCnvCtx *pageC
*** 140,157 ****
  
  	for (mapnum = 0; mapnum < size; mapnum++)
  	{
! 		/* transfer primary file */
! 		transfer_relfile(pageConverter, &maps[mapnum], "");
! 
! 		/* fsm/vm files added in PG 8.4 */
! 		if (GET_MAJOR_VERSION(old_cluster.major_version) >= 804)
  		{
! 			/*
! 			 * Copy/link any fsm and vm files, if they exist
! 			 */
! 			transfer_relfile(pageConverter, &maps[mapnum], "_fsm");
! 			if (vm_crashsafe_match)
! 				transfer_relfile(pageConverter, &maps[mapnum], "_vm");
  		}
  	}
  }
--- 177,198 ----
  
  	for (mapnum = 0; mapnum < size; mapnum++)
  	{
! 		if (old_tablespace == NULL ||
! 			strcmp(maps[mapnum].old_tablespace, old_tablespace) == 0)
  		{
! 			/* transfer primary file */
! 			transfer_relfile(pageConverter, &maps[mapnum], "");
! 	
! 			/* fsm/vm files added in PG 8.4 */
! 			if (GET_MAJOR_VERSION(old_cluster.major_version) >= 804)
! 			{
! 				/*
! 				 * Copy/link any fsm and vm files, if they exist
! 				 */
! 				transfer_relfile(pageConverter, &maps[mapnum], "_fsm");
! 				if (vm_crashsafe_match)
! 					transfer_relfile(pageConverter, &maps[mapnum], "_vm");
! 			}
  		}
  	}
  }
*************** transfer_relfile(pageCnvCtx *pageConvert
*** 187,196 ****
  		else
  			snprintf(extent_suffix, sizeof(extent_suffix), ".%d", segno);
  
! 		snprintf(old_file, sizeof(old_file), "%s/%u%s%s", map->old_dir,
! 				 map->old_relfilenode, type_suffix, extent_suffix);
! 		snprintf(new_file, sizeof(new_file), "%s/%u%s%s", map->new_dir,
! 				 map->new_relfilenode, type_suffix, extent_suffix);
  	
  		/* Is it an extent, fsm, or vm file? */
  		if (type_suffix[0] != '\0' || segno != 0)
--- 228,239 ----
  		else
  			snprintf(extent_suffix, sizeof(extent_suffix), ".%d", segno);
  
! 		snprintf(old_file, sizeof(old_file), "%s%s/%u/%u%s%s", map->old_tablespace,
! 				 map->old_tablespace_suffix, map->old_db_oid, map->old_relfilenode,
! 				 type_suffix, extent_suffix);
! 		snprintf(new_file, sizeof(new_file), "%s%s/%u/%u%s%s", map->new_tablespace,
! 				 map->new_tablespace_suffix, map->new_db_oid, map->new_relfilenode,
! 				 type_suffix, extent_suffix);
  	
  		/* Is it an extent, fsm, or vm file? */
  		if (type_suffix[0] != '\0' || segno != 0)
*************** transfer_relfile(pageCnvCtx *pageConvert
*** 239,241 ****
--- 282,285 ----
  
  	return;
  }
+ 
diff --git a/contrib/pg_upgrade/tablespace.c b/contrib/pg_upgrade/tablespace.c
new file mode 100644
index a93c517..321738d
*** a/contrib/pg_upgrade/tablespace.c
--- b/contrib/pg_upgrade/tablespace.c
*************** init_tablespaces(void)
*** 23,29 ****
  	set_tablespace_directory_suffix(&old_cluster);
  	set_tablespace_directory_suffix(&new_cluster);
  
! 	if (os_info.num_tablespaces > 0 &&
  	strcmp(old_cluster.tablespace_suffix, new_cluster.tablespace_suffix) == 0)
  		pg_log(PG_FATAL,
  			   "Cannot upgrade to/from the same system catalog version when\n"
--- 23,29 ----
  	set_tablespace_directory_suffix(&old_cluster);
  	set_tablespace_directory_suffix(&new_cluster);
  
! 	if (os_info.num_old_tablespaces > 0 &&
  	strcmp(old_cluster.tablespace_suffix, new_cluster.tablespace_suffix) == 0)
  		pg_log(PG_FATAL,
  			   "Cannot upgrade to/from the same system catalog version when\n"
*************** get_tablespace_paths(void)
*** 57,72 ****
  
  	res = executeQueryOrDie(conn, "%s", query);
  
! 	if ((os_info.num_tablespaces = PQntuples(res)) != 0)
! 		os_info.tablespaces = (char **) pg_malloc(
! 								   os_info.num_tablespaces * sizeof(char *));
  	else
! 		os_info.tablespaces = NULL;
  
  	i_spclocation = PQfnumber(res, "spclocation");
  
! 	for (tblnum = 0; tblnum < os_info.num_tablespaces; tblnum++)
! 		os_info.tablespaces[tblnum] = pg_strdup(
  									 PQgetvalue(res, tblnum, i_spclocation));
  
  	PQclear(res);
--- 57,72 ----
  
  	res = executeQueryOrDie(conn, "%s", query);
  
! 	if ((os_info.num_old_tablespaces = PQntuples(res)) != 0)
! 		os_info.old_tablespaces = (char **) pg_malloc(
! 								   os_info.num_old_tablespaces * sizeof(char *));
  	else
! 		os_info.old_tablespaces = NULL;
  
  	i_spclocation = PQfnumber(res, "spclocation");
  
! 	for (tblnum = 0; tblnum < os_info.num_old_tablespaces; tblnum++)
! 		os_info.old_tablespaces[tblnum] = pg_strdup(
  									 PQgetvalue(res, tblnum, i_spclocation));
  
  	PQclear(res);
diff --git a/doc/src/sgml/pgupgrade.sgml b/doc/src/sgml/pgupgrade.sgml
new file mode 100644
index 53781e4..3e5d548
*** a/doc/src/sgml/pgupgrade.sgml
--- b/doc/src/sgml/pgupgrade.sgml
*************** NET STOP pgsql-8.3  (<productname>Postgr
*** 342,351 ****
  
      <para>
       The <option>--jobs</> option allows multiple CPU cores to be used
!      to dump and reload database schemas in parallel;  a good place to
!      start is the number of CPU cores on the server.  This option can
!      dramatically reduce the time to upgrade a multi-database server
!      running on a multiprocessor machine.
      </para>
  
      <para>
--- 342,353 ----
  
      <para>
       The <option>--jobs</> option allows multiple CPU cores to be used
!      for file copy operations and to dump and reload database schemas in
!      parallel;  a good place to start is the number of CPU cores on the
!      server, or the number of tablespaces if not using the
!      <option>--link</> option.  This option can dramatically reduce the
!      time to upgrade a multi-database server running on a multiprocessor
!      machine.
      </para>
  
      <para>

:
# Benchmark driver: for each data size, builds an old cluster with tables
# spread over two tablespaces, then times tools/upgrade with -j 1 and -j 2.
# Elapsed seconds are appended to /rtmp/out.
# Relies on local helpers (traprm, pgsw, pipe, sql, pgstart, pgstop, bell,
# tools/*) that are not part of this script.

. traprm

export QUIET=$((QUIET + 1))

# start with an empty results file
> /rtmp/out

OLDREL=9.2
NEWREL=9.3
BRANCH=tablespace
SSD="f"

export PGOPTIONS="-c synchronous_commit=off"

# CYCLES is the total number of tables created; half go into each tablespace
for CYCLES in 2 4 8 16 32 64 128
do
        echo "$CYCLES" >> /rtmp/out
        for JOBLIMIT in 1 2
        do
                # NOTE(review): REL is never set in this script -- presumably it
                # comes from the environment or from traprm; confirm.
                cd /pgsql/$REL
                pgsw $BRANCH
                cd -
                tools/setup $OLDREL $NEWREL
                sleep 2
                [ "$SSD" = "f" ] && tools/mv_to_archive

                # need for +16k
                for CONFIG in /u/pgsql.old/data/postgresql.conf /u/pgsql/data/postgresql.conf
                do
                        pipe sed 's/#max_locks_per_transaction = 64/max_locks_per_transaction = 64000/' "$CONFIG"
                        pipe sed 's/shared_buffers = 128MB/shared_buffers = 1GB/' "$CONFIG"
                done

                pgstart -w /u/pgsql.old/data

                # recreate one empty tablespace directory on each disk
                for DIR in /archive/tmp/t1 /backup0/tmp/t2
                do      rm -rf "$DIR"
                        mkdir "$DIR"
                        chown postgres "$DIR"
                        chmod 0700 "$DIR"
                done

                echo "CREATE TABLESPACE t1 LOCATION '/archive/tmp/t1';" | sql --echo-all test
                echo "CREATE TABLESPACE t2 LOCATION '/backup0/tmp/t2';" | sql --echo-all test

                # fill each tablespace with CYCLES/2 tables holding one
                # 999000000-character value each, stored EXTERNAL
                for SPNO in $(jot 2)
                do
                        for TBLNO in $(jot $(($CYCLES / 2)) )
                        do
                                echo "CREATE TABLE test${SPNO}_${TBLNO} (x TEXT) TABLESPACE t$SPNO;"
                                echo "ALTER TABLE test${SPNO}_${TBLNO} ALTER COLUMN x SET STORAGE EXTERNAL;"
                                echo "INSERT INTO test${SPNO}_${TBLNO} SELECT repeat('x', 999000000);"
                        done |
                        PGOPTIONS="-c synchronous_commit=off" sql --single-transaction --echo-all test
                done
                pgstop /u/pgsql.old/data
                sleep 2
                # clear cache
                echo 3 > /proc/sys/vm/drop_caches
                # allow system to repopulate
                sleep 15
                /usr/bin/time --output=/rtmp/out --append --format '%e' tools/upgrade -j $JOBLIMIT || exit
                sleep 2
        done
done

bell

-- 
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

[HACKERS] pg_upgrade with parallel tablespace copying

Reply via email to