Hi,

On 2023-03-06 11:30:13 -0500, Melanie Plageman wrote:
> > As pgstat_bktype_io_stats_valid() is called only in Assert(), I think it
> > would be a good idea to also check that if counts are not zero, then times
> > are not zero.
> 
> Yes, I think adding some validation around the relationship between
> counts and timing should help prevent developers from forgetting to call
> pgstat_count_io_op() when calling pgstat_count_io_time() (as relevant).
> 
> However, I think that we cannot check that if IO counts are non-zero
> that IO times are non-zero, because the user may not have
> track_io_timing enabled. We can check that if IO times are not zero, IO
> counts are not zero. I've done this in the attached v3.

And even if track_io_timing is enabled, the timer granularity might be so
coarse that we *still* get zeroes.
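
To make the invariant explicit: the only direction that can be asserted is
"nonzero time implies nonzero count". A rough sketch (untested, the function
name is made up, and it assumes the counts/times arrays from the struct quoted
further down):

static bool
io_times_imply_counts(const PgStat_BktypeIO *io)
{
	for (int io_obj = 0; io_obj < IOOBJECT_NUM_TYPES; io_obj++)
		for (int io_context = 0; io_context < IOCONTEXT_NUM_TYPES; io_context++)
			for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
			{
				/* time accumulated without a matching count would be a bug */
				if (!INSTR_TIME_IS_ZERO(io->times[io_obj][io_context][io_op]) &&
					io->counts[io_obj][io_context][io_op] == 0)
					return false;
			}

	return true;
}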

I wonder if we should get rid of pgStatBlockReadTime, pgStatBlockWriteTime.


> @@ -1000,11 +1000,27 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
>  
>       if (isExtend)
>       {
> +             instr_time      io_start,
> +                                     io_time;
> +
>               /* new buffers are zero-filled */
>               MemSet((char *) bufBlock, 0, BLCKSZ);
> +
> +             if (track_io_timing)
> +                     INSTR_TIME_SET_CURRENT(io_start);
> +             else
> +                     INSTR_TIME_SET_ZERO(io_start);
> +

I wonder if there's an argument for tracking this in the existing IO stats as
well. But I guess we've lived with this for a long time...


> @@ -2981,16 +2998,16 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
>        * When a strategy is not in use, the write can only be a "regular" write
>        * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
>        */
> -     pgstat_count_io_op(IOOBJECT_RELATION, io_context, IOOP_WRITE);
> -
>       if (track_io_timing)
>       {
>               INSTR_TIME_SET_CURRENT(io_time);
>               INSTR_TIME_SUBTRACT(io_time, io_start);
>               pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
>               INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
> +             pgstat_count_io_time(IOOBJECT_RELATION, io_context, IOOP_WRITE, io_time);
>       }

I think this needs a bit of cleanup - updating pgstat_count_buffer_write_time(),
pgBufferUsage.blk_write_time and pgstat_count_io_time() separately for every
write is a bit excessive. We might not be able to remove all of the duplication
at this point, but at least it should be a bit more centralized.
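
E.g. something like this (just a sketch, the helper name is invented and not
part of the patch), so FlushBuffer() and friends make a single call per write:

static void
count_relation_write(IOContext io_context, instr_time io_start)
{
	if (track_io_timing)
	{
		instr_time	io_time;

		INSTR_TIME_SET_CURRENT(io_time);
		INSTR_TIME_SUBTRACT(io_time, io_start);

		/* all three write-time accumulators, updated in one place */
		pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
		INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
		pgstat_count_io_time(IOOBJECT_RELATION, io_context, IOOP_WRITE, io_time);
	}

	pgstat_count_io_op(IOOBJECT_RELATION, io_context, IOOP_WRITE);
}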



> +     pgstat_count_io_op(IOOBJECT_RELATION, io_context, IOOP_WRITE);
>       pgBufferUsage.shared_blks_written++;
>  
>       /*
> @@ -3594,6 +3611,9 @@ FlushRelationBuffers(Relation rel)
>  
>       if (RelationUsesLocalBuffers(rel))
>       {
> +             instr_time      io_start,
> +                                     io_time;
> +
>               for (i = 0; i < NLocBuffer; i++)
>               {
>                       uint32          buf_state;
> @@ -3616,6 +3636,11 @@ FlushRelationBuffers(Relation rel)
>  
>                               PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
>  
> +                             if (track_io_timing)
> +                                     INSTR_TIME_SET_CURRENT(io_start);
> +                             else
> +                                     INSTR_TIME_SET_ZERO(io_start);
> +
>                               smgrwrite(RelationGetSmgr(rel),
>                                                 BufTagGetForkNum(&bufHdr->tag),
>                                                 bufHdr->tag.blockNum,

I don't think you need the INSTR_TIME_SET_ZERO() in the body of the loop; to
silence the compiler warning you can do it once, one level up.
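
I.e. roughly (sketch only, the elided parts are the existing loop body):

	if (RelationUsesLocalBuffers(rel))
	{
		instr_time	io_start,
					io_time;

		/* zero once, before the loop, purely to keep the compiler quiet */
		INSTR_TIME_SET_ZERO(io_start);

		for (i = 0; i < NLocBuffer; i++)
		{
			/* ... existing checks and PageSetChecksumInplace() ... */

			if (track_io_timing)
				INSTR_TIME_SET_CURRENT(io_start);

			/* ... smgrwrite() and the pgstat_count_io_time() block ... */
		}
	}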



> @@ -228,6 +230,11 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
>  
>               PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
>  
> +             if (track_io_timing)
> +                     INSTR_TIME_SET_CURRENT(io_start);
> +             else
> +                     INSTR_TIME_SET_ZERO(io_start);
> +
>               /* And write... */
>               smgrwrite(oreln,
>                                 BufTagGetForkNum(&bufHdr->tag),
> @@ -239,6 +246,13 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
>               buf_state &= ~BM_DIRTY;
>               pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
>  
> +             if (track_io_timing)
> +             {
> +                     INSTR_TIME_SET_CURRENT(io_time);
> +                     INSTR_TIME_SUBTRACT(io_time, io_start);
> +                     pgstat_count_io_time(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE, io_time);
> +             }
> +
>               pgstat_count_io_op(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE);
>               pgBufferUsage.local_blks_written++;
>       }

Perhaps we can instead introduce a FlushLocalBuffer()? Then we don't need this
in multiple write paths.
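
Roughly like this (the name is from the suggestion above, the body is just
your patch's code folded together, untested; the caller would still be
responsible for clearing BM_DIRTY etc.):

static void
FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln)
{
	instr_time	io_start,
				io_time;
	Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);

	PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);

	if (track_io_timing)
		INSTR_TIME_SET_CURRENT(io_start);
	else
		INSTR_TIME_SET_ZERO(io_start);

	smgrwrite(reln,
			  BufTagGetForkNum(&bufHdr->tag),
			  bufHdr->tag.blockNum,
			  localpage,
			  false);

	if (track_io_timing)
	{
		INSTR_TIME_SET_CURRENT(io_time);
		INSTR_TIME_SUBTRACT(io_time, io_start);
		pgstat_count_io_time(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL,
							 IOOP_WRITE, io_time);
	}

	pgstat_count_io_op(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE);
	pgBufferUsage.local_blks_written++;
}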


> diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
> index 352958e1fe..052875d86a 100644
> --- a/src/backend/storage/smgr/md.c
> +++ b/src/backend/storage/smgr/md.c
> @@ -1030,6 +1030,30 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
>  
>       if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
>       {
> +             instr_time      io_start,
> +                                     io_time;
> +
> +             if (track_io_timing)
> +                     INSTR_TIME_SET_CURRENT(io_start);
> +             else
> +                     INSTR_TIME_SET_ZERO(io_start);
> +
> +             ereport(DEBUG1,
> +                             (errmsg_internal("could not forward fsync request because request queue is full")));
> +
> +             if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
> +                     ereport(data_sync_elevel(ERROR),
> +                                     (errcode_for_file_access(),
> +                                      errmsg("could not fsync file \"%s\": %m",
> +                                                     FilePathName(seg->mdfd_vfd))));
> +
> +             if (track_io_timing)
> +             {
> +                     INSTR_TIME_SET_CURRENT(io_time);
> +                     INSTR_TIME_SUBTRACT(io_time, io_start);
> +                     pgstat_count_io_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL, IOOP_FSYNC, io_time);
> +             }
> +
>               /*
>                * We have no way of knowing if the current IOContext is
>                * IOCONTEXT_NORMAL or IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] at this
> @@ -1042,15 +1066,6 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
>                * backend fsyncs.
>                */
>               pgstat_count_io_op(IOOBJECT_RELATION, IOCONTEXT_NORMAL, IOOP_FSYNC);
> -
> -             ereport(DEBUG1,
> -                             (errmsg_internal("could not forward fsync request because request queue is full")));
> -
> -             if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
> -                     ereport(data_sync_elevel(ERROR),
> -                                     (errcode_for_file_access(),
> -                                      errmsg("could not fsync file \"%s\": %m",
> -                                                     FilePathName(seg->mdfd_vfd))));
>       }
>  }
>  
> @@ -1399,6 +1414,8 @@ int
>  mdsyncfiletag(const FileTag *ftag, char *path)
>  {
>       SMgrRelation reln = smgropen(ftag->rlocator, InvalidBackendId);
> +     instr_time      io_start,
> +                             io_time;
>       File            file;
>       bool            need_to_close;
>       int                     result,
> @@ -1425,10 +1442,22 @@ mdsyncfiletag(const FileTag *ftag, char *path)
>               need_to_close = true;
>       }
>  
> +     if (track_io_timing)
> +             INSTR_TIME_SET_CURRENT(io_start);
> +     else
> +             INSTR_TIME_SET_ZERO(io_start);
> +
>       /* Sync the file. */
>       result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
>       save_errno = errno;
>  
> +     if (track_io_timing)
> +     {
> +             INSTR_TIME_SET_CURRENT(io_time);
> +             INSTR_TIME_SUBTRACT(io_time, io_start);
> +             pgstat_count_io_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL, IOOP_FSYNC, io_time);
> +     }
> +
>       if (need_to_close)
>               FileClose(file);

Perhaps we could have mdsyncfd(), used by both mdsyncfiletag() and
register_dirty_segment()?
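
Sketch of what I have in mind (the name is from above, the body is mine;
error handling and errno stay with the callers, and register_dirty_segment()
would keep its ereport(DEBUG1, ...) around the call):

static int
mdsyncfd(File file)
{
	instr_time	io_start,
				io_time;
	int			result;

	if (track_io_timing)
		INSTR_TIME_SET_CURRENT(io_start);
	else
		INSTR_TIME_SET_ZERO(io_start);

	result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);

	if (track_io_timing)
	{
		INSTR_TIME_SET_CURRENT(io_time);
		INSTR_TIME_SUBTRACT(io_time, io_start);
		pgstat_count_io_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
							 IOOP_FSYNC, io_time);
	}

	pgstat_count_io_op(IOOBJECT_RELATION, IOCONTEXT_NORMAL, IOOP_FSYNC);

	return result;
}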



> @@ -1359,20 +1378,31 @@ pg_stat_get_io(PG_FUNCTION_ARGS)
>  
>                               for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
>                               {
> -                                     int                     col_idx = pgstat_get_io_op_index(io_op);
> +                                     int                     i = pgstat_get_io_op_index(io_op);
>  
>                                       /*
>                                        * Some combinations of BackendType and IOOp, of IOContext
>                                        * and IOOp, and of IOObject and IOOp are not tracked. Set
>                                        * these cells in the view NULL.
>                                        */
> -                                     nulls[col_idx] = !pgstat_tracks_io_op(bktype, io_obj, io_context, io_op);
> +                                     if (pgstat_tracks_io_op(bktype, io_obj, io_context, io_op))
> +                                             values[i] = Int64GetDatum(bktype_stats->counts[io_obj][io_context][io_op]);
> +                                     else
> +                                             nulls[i] = true;
> +                                     else
> +                                             nulls[i] = true;
> +                             }

These lines were already too long, and it's getting worse with this change.
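
One way to shorten them (just a sketch): alias the innermost array once per
io_obj/io_context pair, which also keeps the Int64GetDatum() call on one line:

				PgStat_Counter *counts = bktype_stats->counts[io_obj][io_context];

				for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
				{
					int			i = pgstat_get_io_op_index(io_op);

					if (pgstat_tracks_io_op(bktype, io_obj, io_context, io_op))
						values[i] = Int64GetDatum(counts[io_op]);
					else
						nulls[i] = true;
				}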


>  typedef struct PgStat_BktypeIO
>  {
> -     PgStat_Counter data[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
> +     PgStat_Counter counts[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
> +     instr_time      times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
>  } PgStat_BktypeIO;

Ah, you're going to hate me. We can't store instr_time on disk. There's
another patch that gets substantial performance gains by making the unit in
which instr_time tracks time depend on the CPU frequency... It also just
doesn't have enough range to keep track of system-wide time on a larger
system. A single backend won't run for 293 years, but with a few thousand
backends that's a whole different story.

I think we need to accumulate in instr_time, but convert to floating point
when flushing stats.
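
Very roughly (all names here are invented, the real thing would follow the
existing pending/shared split in pgstat):

/* backend-local pending entry: accumulate cheaply, in instr_time */
typedef struct PgStat_PendingIO
{
	PgStat_Counter counts[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
	instr_time	pending_times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
} PgStat_PendingIO;

/* shared / on-disk entry: fixed-unit, wide-range values only */
typedef struct PgStat_SharedIO
{
	PgStat_Counter counts[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
	double		times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];	/* seconds */
} PgStat_SharedIO;

static void
flush_pending_io(const PgStat_PendingIO *pending, PgStat_SharedIO *shared)
{
	for (int io_obj = 0; io_obj < IOOBJECT_NUM_TYPES; io_obj++)
		for (int io_context = 0; io_context < IOCONTEXT_NUM_TYPES; io_context++)
			for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
			{
				shared->counts[io_obj][io_context][io_op] +=
					pending->counts[io_obj][io_context][io_op];

				/* instr_time never leaves the backend; convert only here */
				shared->times[io_obj][io_context][io_op] +=
					INSTR_TIME_GET_DOUBLE(pending->pending_times[io_obj][io_context][io_op]);
			}
}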

Greetings,

Andres Freund
