The current version of postgres supports only 1GB chunks. This limit is defined in the pg_config_manual.h header file. However, this setting allows chunks of at most 2GB. The main problem is that the md storage manager and buffile use the "long" data type (32 bits) for offsets instead of "off_t", which is defined in <sys/types.h>.

off_t is 32 bits wide on a 32-bit OS, and 64 bits wide on a 64-bit OS or when the application is compiled with large file support.

The attached patch allows chunks bigger than 4GB to be set up on an OS with large file support.

I tested it on a 7GB table and it works.


Please take a look at it and let me know your comments, or whether I have missed something.


TODO/questions:

1) Clean up/update the comments about the limitation.

2) Is there any documentation that needs to be updated?

3) I would like to add a check that compares sizeof(off_t) with the chunk size setting, to protect postgres against a misconfigured chunk size. Is mdinit() a good place for this check?

4) I'm going to get a bigger machine to test with a really big table.


        with regards Zdenek


Index: src/backend/storage/file/buffile.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/storage/file/buffile.c,v
retrieving revision 1.25
diff -c -r1.25 buffile.c
*** src/backend/storage/file/buffile.c	5 Jan 2007 22:19:37 -0000	1.25
--- src/backend/storage/file/buffile.c	6 Apr 2007 12:08:47 -0000
***************
*** 42,48 ****
   * Note we adhere to this limit whether or not LET_OS_MANAGE_FILESIZE
   * is defined, although md.c ignores it when that symbol is defined.
   */
! #define MAX_PHYSICAL_FILESIZE  (RELSEG_SIZE * BLCKSZ)
  
  /*
   * This data structure represents a buffered file that consists of one or
--- 42,48 ----
   * Note we adhere to this limit whether or not LET_OS_MANAGE_FILESIZE
   * is defined, although md.c ignores it when that symbol is defined.
   */
! #define MAX_PHYSICAL_FILESIZE  ((off_t)RELSEG_SIZE * BLCKSZ)
  
  /*
   * This data structure represents a buffered file that consists of one or
***************
*** 54,60 ****
  	int			numFiles;		/* number of physical files in set */
  	/* all files except the last have length exactly MAX_PHYSICAL_FILESIZE */
  	File	   *files;			/* palloc'd array with numFiles entries */
! 	long	   *offsets;		/* palloc'd array with numFiles entries */
  
  	/*
  	 * offsets[i] is the current seek position of files[i].  We use this to
--- 54,60 ----
  	int			numFiles;		/* number of physical files in set */
  	/* all files except the last have length exactly MAX_PHYSICAL_FILESIZE */
  	File	   *files;			/* palloc'd array with numFiles entries */
! 	off_t	   *offsets;		/* palloc'd array with numFiles entries */
  
  	/*
  	 * offsets[i] is the current seek position of files[i].  We use this to
***************
*** 70,76 ****
  	 * Position as seen by user of BufFile is (curFile, curOffset + pos).
  	 */
  	int			curFile;		/* file index (0..n) part of current pos */
! 	int			curOffset;		/* offset part of current pos */
  	int			pos;			/* next read/write position in buffer */
  	int			nbytes;			/* total # of valid bytes in buffer */
  	char		buffer[BLCKSZ];
--- 70,76 ----
  	 * Position as seen by user of BufFile is (curFile, curOffset + pos).
  	 */
  	int			curFile;		/* file index (0..n) part of current pos */
! 	off_t		curOffset;		/* offset part of current pos */
  	int			pos;			/* next read/write position in buffer */
  	int			nbytes;			/* total # of valid bytes in buffer */
  	char		buffer[BLCKSZ];
***************
*** 95,101 ****
  	file->numFiles = 1;
  	file->files = (File *) palloc(sizeof(File));
  	file->files[0] = firstfile;
! 	file->offsets = (long *) palloc(sizeof(long));
  	file->offsets[0] = 0L;
  	file->isTemp = false;
  	file->dirty = false;
--- 95,101 ----
  	file->numFiles = 1;
  	file->files = (File *) palloc(sizeof(File));
  	file->files[0] = firstfile;
! 	file->offsets = (off_t *) palloc(sizeof(off_t));
  	file->offsets[0] = 0L;
  	file->isTemp = false;
  	file->dirty = false;
***************
*** 121,128 ****
  
  	file->files = (File *) repalloc(file->files,
  									(file->numFiles + 1) * sizeof(File));
! 	file->offsets = (long *) repalloc(file->offsets,
! 									  (file->numFiles + 1) * sizeof(long));
  	file->files[file->numFiles] = pfile;
  	file->offsets[file->numFiles] = 0L;
  	file->numFiles++;
--- 121,128 ----
  
  	file->files = (File *) repalloc(file->files,
  									(file->numFiles + 1) * sizeof(File));
! 	file->offsets = (off_t *) repalloc(file->offsets,
! 									  (file->numFiles + 1) * sizeof(off_t));
  	file->files[file->numFiles] = pfile;
  	file->offsets[file->numFiles] = 0L;
  	file->numFiles++;
***************
*** 273,281 ****
  		bytestowrite = file->nbytes - wpos;
  		if (file->isTemp)
  		{
! 			long		availbytes = MAX_PHYSICAL_FILESIZE - file->curOffset;
  
! 			if ((long) bytestowrite > availbytes)
  				bytestowrite = (int) availbytes;
  		}
  
--- 273,281 ----
  		bytestowrite = file->nbytes - wpos;
  		if (file->isTemp)
  		{
! 			off_t		availbytes = MAX_PHYSICAL_FILESIZE - file->curOffset;
  
! 			if ((off_t) bytestowrite > availbytes)
  				bytestowrite = (int) availbytes;
  		}
  
***************
*** 445,454 ****
   * impossible seek is attempted.
   */
  int
! BufFileSeek(BufFile *file, int fileno, long offset, int whence)
  {
  	int			newFile;
! 	long		newOffset;
  
  	switch (whence)
  	{
--- 445,454 ----
   * impossible seek is attempted.
   */
  int
! BufFileSeek(BufFile *file, int fileno, off_t offset, int whence)
  {
  	int			newFile;
! 	off_t		newOffset;
  
  	switch (whence)
  	{
***************
*** 531,537 ****
  }
  
  void
! BufFileTell(BufFile *file, int *fileno, long *offset)
  {
  	*fileno = file->curFile;
  	*offset = file->curOffset + file->pos;
--- 530,536 ----
  }
  
  void
! BufFileTell(BufFile *file, int *fileno, off_t *offset)
  {
  	*fileno = file->curFile;
  	*offset = file->curOffset + file->pos;
***************
*** 544,559 ****
   * the file.  Note that users of this interface will fail if their files
   * exceed BLCKSZ * LONG_MAX bytes, but that is quite a lot; we don't work
   * with tables bigger than that, either...
   *
   * Result is 0 if OK, EOF if not.  Logical position is not moved if an
   * impossible seek is attempted.
   */
  int
! BufFileSeekBlock(BufFile *file, long blknum)
  {
  	return BufFileSeek(file,
  					   (int) (blknum / RELSEG_SIZE),
! 					   (blknum % RELSEG_SIZE) * BLCKSZ,
  					   SEEK_SET);
  }
  
--- 543,558 ----
   * the file.  Note that users of this interface will fail if their files
   * exceed BLCKSZ * LONG_MAX bytes, but that is quite a lot; we don't work
   * with tables bigger than that, either...
   *
   * Result is 0 if OK, EOF if not.  Logical position is not moved if an
   * impossible seek is attempted.
   */
  int
! BufFileSeekBlock(BufFile *file, BlockNumber blknum)
  {
  	return BufFileSeek(file,
  					   (int) (blknum / RELSEG_SIZE),
! 					   ((off_t)blknum % RELSEG_SIZE) * BLCKSZ,
  					   SEEK_SET);
  }
  
Index: src/backend/storage/file/fd.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/storage/file/fd.c,v
retrieving revision 1.137
diff -c -r1.137 fd.c
*** src/backend/storage/file/fd.c	6 Mar 2007 02:06:14 -0000	1.137
--- src/backend/storage/file/fd.c	6 Apr 2007 12:08:47 -0000
***************
*** 128,134 ****
  	File		nextFree;		/* link to next free VFD, if in freelist */
  	File		lruMoreRecently;	/* doubly linked recency-of-use list */
  	File		lruLessRecently;
! 	long		seekPos;		/* current logical file position */
  	char	   *fileName;		/* name of file, or NULL for unused VFD */
  	/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
  	int			fileFlags;		/* open(2) flags for (re)opening the file */
--- 128,134 ----
  	File		nextFree;		/* link to next free VFD, if in freelist */
  	File		lruMoreRecently;	/* doubly linked recency-of-use list */
  	File		lruLessRecently;
! 	off_t		seekPos;		/* current logical file position */
  	char	   *fileName;		/* name of file, or NULL for unused VFD */
  	/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
  	int			fileFlags;		/* open(2) flags for (re)opening the file */
***************
*** 1136,1143 ****
  	return pg_fsync(VfdCache[file].fd);
  }
  
! long
! FileSeek(File file, long offset, int whence)
  {
  	int			returnCode;
  
--- 1136,1143 ----
  	return pg_fsync(VfdCache[file].fd);
  }
  
! off_t
! FileSeek(File file, off_t offset, int whence)
  {
  	int			returnCode;
  
***************
*** 1203,1209 ****
   * XXX not actually used but here for completeness
   */
  #ifdef NOT_USED
! long
  FileTell(File file)
  {
  	Assert(FileIsValid(file));
--- 1203,1209 ----
   * XXX not actually used but here for completeness
   */
  #ifdef NOT_USED
! off_t
  FileTell(File file)
  {
  	Assert(FileIsValid(file));
***************
*** 1214,1220 ****
  #endif
  
  int
! FileTruncate(File file, long offset)
  {
  	int			returnCode;
  
--- 1214,1220 ----
  #endif
  
  int
! FileTruncate(File file, off_t offset)
  {
  	int			returnCode;
  
***************
*** 1227,1233 ****
  	if (returnCode < 0)
  		return returnCode;
  
! 	returnCode = ftruncate(VfdCache[file].fd, (size_t) offset);
  	return returnCode;
  }
  
--- 1227,1233 ----
  	if (returnCode < 0)
  		return returnCode;
  
! 	returnCode = ftruncate(VfdCache[file].fd, (off_t) offset);
  	return returnCode;
  }
  
Index: src/backend/storage/smgr/md.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/storage/smgr/md.c,v
retrieving revision 1.127
diff -c -r1.127 md.c
*** src/backend/storage/smgr/md.c	17 Jan 2007 16:25:01 -0000	1.127
--- src/backend/storage/smgr/md.c	6 Apr 2007 12:08:48 -0000
***************
*** 325,331 ****
  void
  mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
  {
! 	long		seekpos;
  	int			nbytes;
  	MdfdVec    *v;
  
--- 325,331 ----
  void
  mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
  {
! 	off_t		seekpos;
  	int			nbytes;
  	MdfdVec    *v;
  
***************
*** 351,360 ****
  	v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_CREATE);
  
  #ifndef LET_OS_MANAGE_FILESIZE
! 	seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
! 	Assert(seekpos < BLCKSZ * RELSEG_SIZE);
  #else
! 	seekpos = (long) (BLCKSZ * (blocknum));
  #endif
  
  	/*
--- 351,360 ----
  	v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_CREATE);
  
  #ifndef LET_OS_MANAGE_FILESIZE
! 	seekpos =  (off_t)BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
! 	Assert(seekpos < (off_t)BLCKSZ * RELSEG_SIZE);
  #else
! 	seekpos =  (off_t)BLCKSZ * blocknum;
  #endif
  
  	/*
***************
*** 507,523 ****
  void
  mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
  {
! 	long		seekpos;
  	int			nbytes;
  	MdfdVec    *v;
  
  	v = _mdfd_getseg(reln, blocknum, false, EXTENSION_FAIL);
  
  #ifndef LET_OS_MANAGE_FILESIZE
! 	seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
! 	Assert(seekpos < BLCKSZ * RELSEG_SIZE);
  #else
! 	seekpos = (long) (BLCKSZ * (blocknum));
  #endif
  
  	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
--- 507,523 ----
  void
  mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
  {
! 	off_t		seekpos;
  	int			nbytes;
  	MdfdVec    *v;
  
  	v = _mdfd_getseg(reln, blocknum, false, EXTENSION_FAIL);
  
  #ifndef LET_OS_MANAGE_FILESIZE
! 	seekpos = (off_t)BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
! 	Assert(seekpos < (off_t)BLCKSZ * RELSEG_SIZE);
  #else
! 	seekpos = (off_t)BLCKSZ * (blocknum);
  #endif
  
  	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
***************
*** 571,577 ****
  void
  mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
  {
! 	long		seekpos;
  	int			nbytes;
  	MdfdVec    *v;
  
--- 571,577 ----
  void
  mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
  {
! 	off_t		seekpos;
  	int			nbytes;
  	MdfdVec    *v;
  
***************
*** 583,592 ****
  	v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_FAIL);
  
  #ifndef LET_OS_MANAGE_FILESIZE
! 	seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
! 	Assert(seekpos < BLCKSZ * RELSEG_SIZE);
  #else
! 	seekpos = (long) (BLCKSZ * (blocknum));
  #endif
  
  	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
--- 583,592 ----
  	v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_FAIL);
  
  #ifndef LET_OS_MANAGE_FILESIZE
! 	seekpos = (off_t)BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
! 	Assert(seekpos < (off_t)BLCKSZ * RELSEG_SIZE);
  #else
! 	seekpos = (off_t)BLCKSZ * (blocknum);
  #endif
  
  	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
***************
*** 1297,1303 ****
  static BlockNumber
  _mdnblocks(SMgrRelation reln, MdfdVec *seg)
  {
! 	long		len;
  
  	len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
  	if (len < 0)
--- 1297,1303 ----
  static BlockNumber
  _mdnblocks(SMgrRelation reln, MdfdVec *seg)
  {
! 	off_t		len;
  
  	len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
  	if (len < 0)
Index: src/backend/utils/sort/tuplestore.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/sort/tuplestore.c,v
retrieving revision 1.30
diff -c -r1.30 tuplestore.c
*** src/backend/utils/sort/tuplestore.c	5 Jan 2007 22:19:47 -0000	1.30
--- src/backend/utils/sort/tuplestore.c	6 Apr 2007 12:08:49 -0000
***************
*** 130,143 ****
  	bool		eof_reached;	/* read reached EOF (always valid) */
  	int			current;		/* next array index (valid if INMEM) */
  	int			readpos_file;	/* file# (valid if WRITEFILE and not eof) */
! 	long		readpos_offset; /* offset (valid if WRITEFILE and not eof) */
  	int			writepos_file;	/* file# (valid if READFILE) */
! 	long		writepos_offset;	/* offset (valid if READFILE) */
  
  	/* markpos_xxx holds marked position for mark and restore */
  	int			markpos_current;	/* saved "current" */
  	int			markpos_file;	/* saved "readpos_file" */
! 	long		markpos_offset; /* saved "readpos_offset" */
  };
  
  #define COPYTUP(state,tup)	((*(state)->copytup) (state, tup))
--- 130,143 ----
  	bool		eof_reached;	/* read reached EOF (always valid) */
  	int			current;		/* next array index (valid if INMEM) */
  	int			readpos_file;	/* file# (valid if WRITEFILE and not eof) */
! 	off_t		readpos_offset; /* offset (valid if WRITEFILE and not eof) */
  	int			writepos_file;	/* file# (valid if READFILE) */
! 	off_t		writepos_offset;	/* offset (valid if READFILE) */
  
  	/* markpos_xxx holds marked position for mark and restore */
  	int			markpos_current;	/* saved "current" */
  	int			markpos_file;	/* saved "readpos_file" */
! 	off_t		markpos_offset; /* saved "readpos_offset" */
  };
  
  #define COPYTUP(state,tup)	((*(state)->copytup) (state, tup))
Index: src/include/storage/buffile.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/storage/buffile.h,v
retrieving revision 1.20
diff -c -r1.20 buffile.h
*** src/include/storage/buffile.h	5 Jan 2007 22:19:57 -0000	1.20
--- src/include/storage/buffile.h	6 Apr 2007 12:08:49 -0000
***************
*** 26,31 ****
--- 26,34 ----
  #ifndef BUFFILE_H
  #define BUFFILE_H
  
+ #include <sys/types.h>
+ #include "block.h"
+ 
  /* BufFile is an opaque type whose details are not known outside buffile.c. */
  
  typedef struct BufFile BufFile;
***************
*** 38,45 ****
  extern void BufFileClose(BufFile *file);
  extern size_t BufFileRead(BufFile *file, void *ptr, size_t size);
  extern size_t BufFileWrite(BufFile *file, void *ptr, size_t size);
! extern int	BufFileSeek(BufFile *file, int fileno, long offset, int whence);
! extern void BufFileTell(BufFile *file, int *fileno, long *offset);
! extern int	BufFileSeekBlock(BufFile *file, long blknum);
  
  #endif   /* BUFFILE_H */
--- 41,48 ----
  extern void BufFileClose(BufFile *file);
  extern size_t BufFileRead(BufFile *file, void *ptr, size_t size);
  extern size_t BufFileWrite(BufFile *file, void *ptr, size_t size);
! extern int	BufFileSeek(BufFile *file, int fileno, off_t offset, int whence);
! extern void BufFileTell(BufFile *file, int *fileno, off_t *offset);
! extern int	BufFileSeekBlock(BufFile *file, BlockNumber blknum);
  
  #endif   /* BUFFILE_H */
Index: src/include/storage/fd.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/storage/fd.h,v
retrieving revision 1.57
diff -c -r1.57 fd.h
*** src/include/storage/fd.h	5 Jan 2007 22:19:57 -0000	1.57
--- src/include/storage/fd.h	6 Apr 2007 12:08:50 -0000
***************
*** 67,74 ****
  extern int	FileRead(File file, char *buffer, int amount);
  extern int	FileWrite(File file, char *buffer, int amount);
  extern int	FileSync(File file);
! extern long FileSeek(File file, long offset, int whence);
! extern int	FileTruncate(File file, long offset);
  
  /* Operations that allow use of regular stdio --- USE WITH CAUTION */
  extern FILE *AllocateFile(const char *name, const char *mode);
--- 67,74 ----
  extern int	FileRead(File file, char *buffer, int amount);
  extern int	FileWrite(File file, char *buffer, int amount);
  extern int	FileSync(File file);
! extern off_t FileSeek(File file, off_t offset, int whence);
! extern int	FileTruncate(File file, off_t offset);
  
  /* Operations that allow use of regular stdio --- USE WITH CAUTION */
  extern FILE *AllocateFile(const char *name, const char *mode);
---------------------------(end of broadcast)---------------------------
TIP 6: explain analyze is your friend

Reply via email to