On Fri, Sep 9, 2016 at 12:33 PM, Pavan Deolasee
<pavan.deola...@gmail.com> wrote:
>
>
> On Thu, Sep 8, 2016 at 11:40 PM, Masahiko Sawada <sawada.m...@gmail.com>
> wrote:
>>
>>
>>
>> Making the vacuum possible to choose between two data representations
>> sounds good.
>> I implemented the patch that changes dead tuple representation to bitmap
>> before.
>> I will measure the performance of bitmap representation again and post
>> them.
>
>
> Sounds great! I haven't seen your patch, but what I would suggest is to
> compute page density (D) = relpages/(dead+live tuples) and experiment with
> bitmap of sizes of D to 2D bits per page. May I also suggest that instead of
> putting in efforts in implementing the overflow area,  just count how many
> dead TIDs would fall under overflow area for a given choice of bitmap size.
>

Isn't that formula "page density (D) = (dead+live tuples)/relpages"?
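
To put rough numbers on that (my own illustration, not measured): a
pgbench_accounts table at scale factor 1000 has about 100 million
tuples in roughly 1.6 million pages, so D = 100M / 1.6M, or about 60
tuples per page. A bitmap of D to 2D bits would then cost roughly 8 to
16 bytes per page, versus 6 bytes per dead tuple in the current
ItemPointerData array.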

> It might be a good idea to experiment with different vacuum scale factor,
> varying between 2% to 20% (may be 2, 5, 10, 20). You can probably run a
> longish pgbench test on a large table and then save the data directory for
> repeated experiments, although I'm not sure if pgbench will be a good choice
> because HOT will prevent accumulation of dead pointers, in which case you
> may try adding another index on abalance column.

Thank you, I will experiment with this.

>
> It'll be worth measuring memory consumption of both representations as well
> as performance implications on index vacuum. I don't expect to see any major
> difference in either heap scans.
>

Yeah, it should improve index vacuum speed, since lazy_tid_reaped()
can probe the hash map and bitmap in constant time instead of
bsearch()'ing the TID array, and it should also reduce the number of
index vacuum passes.

The attached PoC patch changes the representation of dead tuple
locations to a hash map of per-block tuple bitmaps.
Each hash map entry consists of a block number and the TID bitmap of
the corresponding block, with the block number serving as the hash key.
The current implementation is not smart yet: each hash map entry
allocates its tuple bitmap at a fixed size (LAZY_ALLOC_TUPLES, which is
291 tuples when the block size is 8kB), so each entry can store up to
LAZY_ALLOC_TUPLES tuples.
When a block holds only a few dozen tuples, most of those bits are
wasted.
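
As a back-of-the-envelope illustration of that waste (a standalone C
sketch, not part of the patch; the constants mirror the patch's macros,
and the 60-tuple page is just an assumed example):

#include <stdio.h>
#include <stdint.h>

#define LAZY_ALLOC_TUPLES       291     /* MaxHeapTuplesPerPage for 8kB blocks */
#define BITS_PER_BITMAP_CHUNK   32
#define BITMAP_CHUNKS_PER_PAGE  ((LAZY_ALLOC_TUPLES / BITS_PER_BITMAP_CHUNK) + 1)

int
main(void)
{
        /* Every entry carries a full-size bitmap: 10 chunks * 4 bytes = 40 bytes */
        int     total_bits = BITMAP_CHUNKS_PER_PAGE * BITS_PER_BITMAP_CHUNK;
        int     tuples_on_page = 60;    /* assumed pgbench_accounts-like page */

        printf("bitmap bytes per entry: %zu\n",
               BITMAP_CHUNKS_PER_PAGE * sizeof(uint32_t));
        printf("bits used: %d, bits wasted: %d\n",
               tuples_on_page, total_bits - tuples_on_page);
        return 0;
}

On the other hand, the break-even against the 6-byte ItemPointerData
array comes quickly: a 44-byte entry (4-byte block number plus 40-byte
bitmap) is already cheaper than the array once a page has more than
about seven dead tuples.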

After improving this patch as you suggested, I will measure the performance benefit.

Regards,

--
Masahiko Sawada
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center
From d1968d625ca1bb07711681a2d824737c53d27c8c Mon Sep 17 00:00:00 2001
From: Masahiko Sawada <sawada.m...@gmail.com>
Date: Thu, 8 Sep 2016 13:59:23 -0400
Subject: [PATCH] Collect dead tuple location as a bitmap.

---
 src/backend/commands/vacuumlazy.c | 232 +++++++++++++++++++++++++-------------
 1 file changed, 153 insertions(+), 79 deletions(-)

diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c
index b5fb325..ce7bd7e 100644
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -98,6 +98,28 @@
  */
 #define SKIP_PAGES_THRESHOLD   ((BlockNumber) 32)
 
+#define BITS_PER_BITMAP_CHUNK 32
+#define BITMAP_CHUNKS_PER_PAGE \
+       ((int) ((LAZY_ALLOC_TUPLES / BITS_PER_BITMAP_CHUNK) + 1))
+#define BitoffsetToOffsetNumber(chunk, offset) \
+       (((chunk) * BITS_PER_BITMAP_CHUNK) + (offset) + 1)
+#define OffsetNumberToChunk(offset) \
+       ((offset) / BITS_PER_BITMAP_CHUNK)
+#define OffsetNumberToBitoffset(offset) \
+       ((offset) % BITS_PER_BITMAP_CHUNK)
+
+typedef struct PageEntry
+{
+       BlockNumber     blockno;
+       uint32  bm[BITMAP_CHUNKS_PER_PAGE]; /* tuple bitmap of blockno */
+} PageEntry;
+
+typedef struct DeadTupleMap
+{
+       HTAB    *pagetable;
+       int             npages; /* # of blocks in the hash map */
+} DeadTupleMap;
+
 typedef struct LVRelStats
 {
        /* hasindex = true means two-pass strategy; false means one-pass */
@@ -120,6 +142,9 @@ typedef struct LVRelStats
        int                     num_dead_tuples;        /* current # of entries */
        int                     max_dead_tuples;        /* # slots allocated in array */
        ItemPointer dead_tuples;        /* array of ItemPointerData */
+
+       DeadTupleMap *dead_tuple_map; /* hash map of dead ItemPointerData */
+
        int                     num_index_scans;
        TransactionId latestRemovedXid;
        bool            lock_waiter_detected;
@@ -148,20 +173,19 @@ static void lazy_vacuum_index(Relation indrel,
 static void lazy_cleanup_index(Relation indrel,
                                   IndexBulkDeleteResult *stats,
                                   LVRelStats *vacrelstats);
-static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
-                                int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer);
+static void lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
+                                PageEntry *entry, LVRelStats *vacrelstats, Buffer *vmbuffer);
 static bool should_attempt_truncation(LVRelStats *vacrelstats);
 static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats);
 static BlockNumber count_nondeletable_pages(Relation onerel,
                                                 LVRelStats *vacrelstats);
 static void lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks);
-static void lazy_record_dead_tuple(LVRelStats *vacrelstats,
-                                          ItemPointer itemptr);
+static void lazy_record_dead_tuple(LVRelStats *vacrelstats, ItemPointer itemptr);
 static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
-static int     vac_cmp_itemptr(const void *left, const void *right);
 static bool heap_page_is_all_visible(Relation rel, Buffer buf,
                                         TransactionId *visibility_cutoff_xid, bool *all_frozen);
 
+static void init_dead_tuple_map(DeadTupleMap *dead_tuple_map);
 
 /*
  *     lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation
@@ -237,6 +261,7 @@ lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params,
                aggressive = true;
 
        vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));
+       vacrelstats->dead_tuple_map = (DeadTupleMap *) palloc0(sizeof(DeadTupleMap));
 
        vacrelstats->old_rel_pages = onerel->rd_rel->relpages;
        vacrelstats->old_rel_tuples = onerel->rd_rel->reltuples;
@@ -427,6 +452,26 @@ vacuum_log_cleanup_info(Relation rel, LVRelStats *vacrelstats)
                (void) log_heap_cleanup_info(rel->rd_node, vacrelstats->latestRemovedXid);
 }
 
+static void
+init_dead_tuple_map(DeadTupleMap *dtmap)
+{
+       HASHCTL hash_ctl;
+
+       /* If the hash table already exists, destroy it first */
+       if (dtmap->pagetable)
+               hash_destroy(dtmap->pagetable);
+
+       MemSet(&hash_ctl, 0, sizeof(hash_ctl));
+       hash_ctl.keysize = sizeof(BlockNumber);
+       hash_ctl.entrysize = sizeof(PageEntry);
+       hash_ctl.hcxt = CurrentMemoryContext;
+       dtmap->pagetable = hash_create("DeadTupleBitmap",
+                                                                               128,
+                                                                               &hash_ctl,
+                                                                               HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+       dtmap->npages = 0;
+}
+
 /*
  *     lazy_scan_heap() -- scan an open heap relation
  *
@@ -493,6 +538,8 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
        vacrelstats->latestRemovedXid = InvalidTransactionId;
 
        lazy_space_alloc(vacrelstats, nblocks);
+       init_dead_tuple_map(vacrelstats->dead_tuple_map);
+
        frozen = palloc(sizeof(xl_heap_freeze_tuple) * MaxHeapTuplesPerPage);
 
        /* Report that we're scanning the heap, advertising total # of blocks */
@@ -731,6 +778,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
                         */
                        vacrelstats->num_dead_tuples = 0;
                        vacrelstats->num_index_scans++;
+                       init_dead_tuple_map(vacrelstats->dead_tuple_map);
 
                        /* Report that we are once again scanning the heap */
                        pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
@@ -1125,8 +1173,18 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
                if (nindexes == 0 &&
                        vacrelstats->num_dead_tuples > 0)
                {
+                       PageEntry *entry = NULL;
+                       bool found;
+
+                       /* Search the hash table for the current block; it must be found */
+                       entry = (PageEntry *) hash_search(vacrelstats->dead_tuple_map->pagetable,
+                                                                                         (void *) &blkno,
+                                                                                         HASH_FIND, &found);
+
+                       Assert(found);
+
                        /* Remove tuples from heap */
-                       lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats, &vmbuffer);
+                       lazy_vacuum_page(onerel, blkno, buf, entry, vacrelstats, &vmbuffer);
                        has_dead_tuples = false;
 
                        /*
@@ -1134,6 +1192,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
                         * not to reset latestRemovedXid since we want that value to be
                         * valid.
                         */
+                       init_dead_tuple_map(vacrelstats->dead_tuple_map);
                        vacrelstats->num_dead_tuples = 0;
                        vacuumed_pages++;
                }
@@ -1367,11 +1426,15 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
        PGRUsage        ru0;
        Buffer          vmbuffer = InvalidBuffer;
 
+       DeadTupleMap    *dtmap = vacrelstats->dead_tuple_map;
+       HASH_SEQ_STATUS status;
+       PageEntry       *entry;
+
        pg_rusage_init(&ru0);
        npages = 0;
 
-       tupindex = 0;
-       while (tupindex < vacrelstats->num_dead_tuples)
+       hash_seq_init(&status, dtmap->pagetable);
+       while ((entry = (PageEntry *) hash_seq_search(&status)) != NULL)
        {
                BlockNumber tblk;
                Buffer          buf;
@@ -1380,7 +1443,7 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
 
                vacuum_delay_point();
 
-               tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
+               tblk = entry->blockno;
                buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL,
                                                                 vac_strategy);
                if (!ConditionalLockBufferForCleanup(buf))
@@ -1389,8 +1452,8 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
                        ++tupindex;
                        continue;
                }
-               tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats,
-                                                                       &vmbuffer);
+               lazy_vacuum_page(onerel, tblk, buf, entry, vacrelstats,
+                                                &vmbuffer);
 
                /* Now that we've compacted the page, record its available space */
                page = BufferGetPage(buf);
@@ -1425,33 +1488,43 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
  * tuple for this page.  We assume the rest follow sequentially.
  * The return value is the first tupindex after the tuples of this page.
  */
-static int
+static void
 lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
-                                int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer)
+                                PageEntry *entry, LVRelStats *vacrelstats, Buffer *vmbuffer)
 {
        Page            page = BufferGetPage(buffer);
        OffsetNumber unused[MaxOffsetNumber];
        int                     uncnt = 0;
        TransactionId visibility_cutoff_xid;
        bool            all_frozen;
+       int                     chunk, off;
 
        pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
 
        START_CRIT_SECTION();
 
-       for (; tupindex < vacrelstats->num_dead_tuples; tupindex++)
+       for (chunk = 0; chunk < BITMAP_CHUNKS_PER_PAGE; chunk++)
        {
-               BlockNumber tblk;
                OffsetNumber toff;
                ItemId          itemid;
 
-               tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
-               if (tblk != blkno)
-                       break;                          /* past end of tuples for this block */
-               toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]);
-               itemid = PageGetItemId(page, toff);
-               ItemIdSetUnused(itemid);
-               unused[uncnt++] = toff;
+               if (!(entry->bm[chunk]))
+                       continue;
+
+               for (off = 0; off < BITS_PER_BITMAP_CHUNK; off++)
+               {
+                       if (entry->bm[chunk] < ((uint32) 1 << off))
+                               break;
+
+                       /* Found an offset number whose bit is set */
+                       if (entry->bm[chunk] & ((uint32) 1 << off))
+                       {
+                               toff = BitoffsetToOffsetNumber(chunk, off);
+                               itemid = PageGetItemId(page, toff);
+                               ItemIdSetUnused(itemid);
+                               unused[uncnt++] = toff;
+                       }
+               }
        }
 
        PageRepairFragmentation(page);
@@ -1512,8 +1585,6 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
                        visibilitymap_set(onerel, blkno, buffer, InvalidXLogRecPtr,
                                                          *vmbuffer, visibility_cutoff_xid, flags);
        }
-
-       return tupindex;
 }
 
 /*
@@ -1947,12 +2018,24 @@ lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks)
        int                     vac_work_mem = IsAutoVacuumWorkerProcess() &&
        autovacuum_work_mem != -1 ?
        autovacuum_work_mem : maintenance_work_mem;
+       HASHCTL hash_ctl;
+
+       MemSet(&hash_ctl, 0, sizeof(hash_ctl));
+       hash_ctl.keysize = sizeof(BlockNumber);
+       hash_ctl.entrysize = sizeof(PageEntry);
+       hash_ctl.hcxt = CurrentMemoryContext;
+       vacrelstats->dead_tuple_map->pagetable = hash_create("DeadTupleBitmap",
+                                                                                                128,
+                                                                                                &hash_ctl,
+                                                                                                HASH_ELEM | HASH_BLOBS |
+                                                                                                HASH_CONTEXT);
+       vacrelstats->dead_tuple_map->npages = 0;
 
        if (vacrelstats->hasindex)
        {
-               maxtuples = (vac_work_mem * 1024L) / sizeof(ItemPointerData);
+               maxtuples = ((vac_work_mem * 1024L) / sizeof(PageEntry)) * LAZY_ALLOC_TUPLES;
                maxtuples = Min(maxtuples, INT_MAX);
-               maxtuples = Min(maxtuples, MaxAllocSize / sizeof(ItemPointerData));
+               maxtuples = Min(maxtuples, (MaxAllocSize / sizeof(PageEntry)) * LAZY_ALLOC_TUPLES);
 
                /* curious coding here to ensure the multiplication can't overflow */
                if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks)
@@ -1968,29 +2051,41 @@ lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks)
 
        vacrelstats->num_dead_tuples = 0;
        vacrelstats->max_dead_tuples = (int) maxtuples;
-       vacrelstats->dead_tuples = (ItemPointer)
-               palloc(maxtuples * sizeof(ItemPointerData));
 }
 
 /*
  * lazy_record_dead_tuple - remember one deletable tuple
  */
 static void
-lazy_record_dead_tuple(LVRelStats *vacrelstats,
-                                          ItemPointer itemptr)
+lazy_record_dead_tuple(LVRelStats *vacrelstats, ItemPointer itemptr)
 {
-       /*
-        * The array shouldn't overflow under normal behavior, but perhaps it
-        * could if we are given a really small maintenance_work_mem. In that
-        * case, just forget the last few tuples (we'll get 'em next time).
-        */
-       if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples)
+       DeadTupleMap *dtmap = vacrelstats->dead_tuple_map;
+       PageEntry *page = NULL;
+       BlockNumber             blkno = ItemPointerGetBlockNumber(itemptr);
+       OffsetNumber    offset = ItemPointerGetOffsetNumber(itemptr);
+       bool    found;
+       int             chunk, bitoffset;
+
+       page = (PageEntry *) hash_search(dtmap->pagetable,
+                                                                        (void *) &blkno,
+                                                                        HASH_ENTER, &found);
+
+       /* Initialize the entry if it was not present before */
+       if (!found)
        {
-               vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = *itemptr;
-               vacrelstats->num_dead_tuples++;
-               pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES,
-                                                                        vacrelstats->num_dead_tuples);
+               MemSet(page, 0, sizeof(PageEntry));
+               page->blockno = blkno;
+               dtmap->npages++;
        }
+
+       chunk = OffsetNumberToChunk(offset - 1);
+       bitoffset = OffsetNumberToBitoffset(offset - 1);
+
+       page->bm[chunk] |= ((uint32) 1 << bitoffset);
+
+       vacrelstats->num_dead_tuples++;
+       pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES,
+                                                                vacrelstats->num_dead_tuples);
 }
 
 /*
@@ -2004,45 +2099,24 @@ static bool
 lazy_tid_reaped(ItemPointer itemptr, void *state)
 {
        LVRelStats *vacrelstats = (LVRelStats *) state;
-       ItemPointer res;
-
-       res = (ItemPointer) bsearch((void *) itemptr,
-                                                               (void *) vacrelstats->dead_tuples,
-                                                               vacrelstats->num_dead_tuples,
-                                                               sizeof(ItemPointerData),
-                                                               vac_cmp_itemptr);
-
-       return (res != NULL);
-}
-
-/*
- * Comparator routines for use with qsort() and bsearch().
- */
-static int
-vac_cmp_itemptr(const void *left, const void *right)
-{
-       BlockNumber lblk,
-                               rblk;
-       OffsetNumber loff,
-                               roff;
-
-       lblk = ItemPointerGetBlockNumber((ItemPointer) left);
-       rblk = ItemPointerGetBlockNumber((ItemPointer) right);
-
-       if (lblk < rblk)
-               return -1;
-       if (lblk > rblk)
-               return 1;
-
-       loff = ItemPointerGetOffsetNumber((ItemPointer) left);
-       roff = ItemPointerGetOffsetNumber((ItemPointer) right);
-
-       if (loff < roff)
-               return -1;
-       if (loff > roff)
-               return 1;
-
-       return 0;
+       PageEntry       *entry;
+       BlockNumber     blkno = ItemPointerGetBlockNumber(itemptr);
+       OffsetNumber offset = ItemPointerGetOffsetNumber(itemptr);
+       bool    found;
+       bool    ret;
+       int     chunk, bitoffset;
+
+       entry = (PageEntry *) hash_search(vacrelstats->dead_tuple_map->pagetable,
+                                                                         (void *) &blkno,
+                                                                         HASH_FIND, &found);
+       if (!found)
+               return false;   /* quick exit */
+
+       chunk = OffsetNumberToChunk(offset - 1);
+       bitoffset = OffsetNumberToBitoffset(offset - 1);
+
+       ret = (entry->bm[chunk] & ((uint32) 1 << bitoffset)) != 0;
+       return ret;
 }
 
 /*
-- 
2.8.1
