Author: jplevyak
Date: Fri Apr 29 22:50:37 2011
New Revision: 1097979
URL: http://svn.apache.org/viewvc?rev=1097979&view=rev
Log:
TS-752: cache scan issues: fix: wbardwel, errors in integration: jplevyak.
Modified:
trafficserver/traffic/trunk/iocore/cache/CacheVol.cc
trafficserver/traffic/trunk/iocore/cache/P_CacheInternal.h
trafficserver/traffic/trunk/iocore/cache/P_CacheVol.h
Modified: trafficserver/traffic/trunk/iocore/cache/CacheVol.cc
URL:
http://svn.apache.org/viewvc/trafficserver/traffic/trunk/iocore/cache/CacheVol.cc?rev=1097979&r1=1097978&r2=1097979&view=diff
==============================================================================
--- trafficserver/traffic/trunk/iocore/cache/CacheVol.cc (original)
+++ trafficserver/traffic/trunk/iocore/cache/CacheVol.cc Fri Apr 29 22:50:37
2011
@@ -90,6 +90,66 @@ Ldone:
return free_CacheVC(this);
}
+/* Find the next block in this partition that the map marks as holding data.
+ *
+ * d       - Vol being scanned.
+ * vol_map - precalculated map from make_vol_map(): one byte for each
+ *           SCAN_BUF_SIZE bytes of the partition, non-zero where a directory
+ *           entry points. May be NULL (make_vol_map() returns NULL when its
+ *           allocation fails); in that case no block is skipped.
+ * offset  - absolute offset to start looking at (the data at this location
+ *           has not been read yet).
+ *
+ * Returns the absolute offset of the next location to read, or the end of
+ * the partition if there are no more locations. */
+static off_t next_in_map(Vol *d, char *vol_map, off_t offset)
+{
+  off_t start_offset = vol_offset_to_offset(d, 0);
+  off_t vol_len = vol_relative_length(d, start_offset);
+  off_t new_off = offset - start_offset;
+
+  // An offset before location 0 would index the map with a negative value.
+  if (new_off < 0)
+    new_off = 0;
+
+  // Without a map nothing is known to be empty, so nothing may be skipped.
+  if (vol_map) {
+    while (new_off < vol_len && !vol_map[new_off / SCAN_BUF_SIZE])
+      new_off += SCAN_BUF_SIZE;
+  }
+
+  if (new_off >= vol_len)
+    return vol_len + start_offset;
+  return new_off + start_offset;
+}
+
+// Function in CacheDir.cc that we need for make_vol_map().
+int
+dir_bucket_loop_fix(Dir *start_dir, int s, Vol *d);
+
+// TODO: If we used a bit vector, we could make a smaller map structure.
+// TODO: If we saved a high water mark we could have a smaller buf, and avoid searching it
+// when we are asked about the highest interesting offset.
+/* Make a map of which blocks in the partition are used.
+ *
+ * The map holds one byte for each SCAN_BUF_SIZE bytes of the partition,
+ * measured from the offset of location 0. A byte is set to 1 when a directory
+ * entry with a non-zero dir_offset() points into that chunk; all other bytes
+ * are 0.
+ *
+ * d - Vol to make a map of.
+ *
+ * Returns the xmalloc'ed map, or NULL if the allocation failed. */
+static char *make_vol_map(Vol *d)
+{
+  off_t start_offset = vol_offset_to_offset(d, 0);
+  off_t vol_len = vol_relative_length(d, start_offset);
+  // Round up so a trailing partial chunk still gets a byte.
+  size_t map_len = (vol_len + (SCAN_BUF_SIZE - 1)) / SCAN_BUF_SIZE;
+  char *vol_map = (char *)xmalloc(map_len);
+  if (!vol_map)
+    return NULL;
+  memset(vol_map, 0, map_len);
+
+  // Scan directories.
+  // Copied from dir_entries_used() and modified to fill in the map instead.
+  for (int s = 0; s < d->segments; s++) {
+    Dir *seg = dir_segment(s, d);
+    for (int b = 0; b < d->buckets; b++) {
+      Dir *e = dir_bucket(b, seg);
+      // A non-zero result stops the walk of this segment's remaining buckets.
+      if (dir_bucket_loop_fix(e, s, d)) {
+        break;
+      }
+      for (; e; e = next_dir(e, seg)) {
+        if (!dir_offset(e))
+          continue;
+        off_t offset = vol_offset(d, e) - start_offset;
+        // Only offsets in [0, vol_len) have a byte in the map. With
+        // offset == vol_len and vol_len a multiple of SCAN_BUF_SIZE the index
+        // would be map_len, one past the end of the allocation.
+        if (offset >= 0 && offset < vol_len)
+          vol_map[offset / SCAN_BUF_SIZE] = 1;
+      }
+    }
+  }
+  return vol_map;
+}
+
int
CacheVC::scanObject(int event, Event * e)
{
@@ -97,8 +157,6 @@ CacheVC::scanObject(int event, Event * e
NOWARN_UNUSED(event);
Debug("cache_scan_truss", "inside %p:scanObject", this);
- if (_action.cancelled)
- return free_CacheVC(this);
Doc *doc = NULL;
void *result = NULL;
@@ -107,6 +165,8 @@ CacheVC::scanObject(int event, Event * e
char hname[500];
bool hostinfo_copied = false;
#endif
+ off_t next_object_len = 0;
+ bool might_need_overlap_read = false;
cancel_trigger();
set_io_not_in_progress();
@@ -115,17 +175,22 @@ CacheVC::scanObject(int event, Event * e
CACHE_TRY_LOCK(lock, vol->mutex, mutex->thread_holding);
if (!lock) {
+ Debug("cache_scan_truss", "delay %p:scanObject", this);
mutex->thread_holding->schedule_in_local(this,
HRTIME_MSECONDS(cache_config_mutex_retry_delay));
return EVENT_CONT;
}
if (!fragment) { // initialize for first read
fragment = 1;
- io.aiocb.aio_offset = vol_offset_to_offset(vol, 0);
+ scan_vol_map = make_vol_map(vol);
+ io.aiocb.aio_offset = next_in_map(vol, scan_vol_map,
vol_offset_to_offset(vol, 0));
+ if (io.aiocb.aio_offset >= (off_t)(vol->skip + vol->len))
+ goto Ldone;
io.aiocb.aio_nbytes = SCAN_BUF_SIZE;
io.aiocb.aio_buf = buf->data();
io.action = this;
io.thread = AIO_CALLBACK_THREAD_ANY;
+ Debug("cache_scan_truss", "read %p:scanObject", this);
goto Lread;
}
@@ -135,13 +200,26 @@ CacheVC::scanObject(int event, Event * e
}
doc = (Doc *) (buf->data() + offset);
- while ((char *) doc < buf->data() + io.aiocb.aio_nbytes) {
+ // If there is data in the buffer before the start that is from a partial object read previously,
+ // fix things as if we read it this time.
+ if (scan_fix_buffer_offset) {
+ io.aio_result += scan_fix_buffer_offset;
+ io.aiocb.aio_nbytes += scan_fix_buffer_offset;
+ io.aiocb.aio_offset -= scan_fix_buffer_offset;
+ io.aiocb.aio_buf = (char *)io.aiocb.aio_buf - scan_fix_buffer_offset;
+ scan_fix_buffer_offset = 0;
+ }
+ while ((off_t)((char *) doc - buf->data()) + next_object_len <
(off_t)io.aiocb.aio_nbytes) {
+ might_need_overlap_read = false;
+ doc = (Doc *) ((char *) doc + next_object_len);
+ next_object_len = vol->round_to_approx_size(doc->len);
#ifdef HTTP_CACHE
int i;
bool changed;
if (doc->magic != DOC_MAGIC) {
- doc = (Doc *)((char *) doc + CACHE_BLOCK_SIZE);
+ next_object_len = CACHE_BLOCK_SIZE;
+ Debug("cache_scan_truss", "blockskip %p:scanObject", this);
continue;
}
@@ -157,8 +235,10 @@ CacheVC::scanObject(int event, Event * e
continue;
break;
}
- if (doc->data() - buf->data() > (int) io.aiocb.aio_nbytes)
+ if (doc->data() - buf->data() > (int) io.aiocb.aio_nbytes) {
+ might_need_overlap_read = true;
goto Lskip;
+ }
{
char *tmp = doc->hdr();
int len = doc->hlen;
@@ -178,7 +258,7 @@ CacheVC::scanObject(int event, Event * e
hostinfo_copied = 0;
for (i = 0; i < vector.count(); i++) {
if (!vector.get(i)->valid())
- continue;
+ goto Lskip;
if (!hostinfo_copied) {
memccpy(hname,
vector.get(i)->request_get()->url_get()->host_get(&hlen), 0, 500);
hname[hlen] = 0;
@@ -243,16 +323,35 @@ CacheVC::scanObject(int event, Event * e
return scanOpenWrite(EVENT_NONE, 0);
}
}
- doc = (Doc *) ((char *) doc + vol->round_to_approx_size(doc->len));
continue;
- Lskip:
+ Lskip:;
#endif
- doc = (Doc *) ((char *) doc + vol->round_to_approx_size(doc->len));
}
#ifdef HTTP_CACHE
vector.clear();
#endif
- io.aiocb.aio_offset += (char *) doc - buf->data();
+ // If we had an object that went past the end of the buffer, and it is small enough to fix,
+ // fix it.
+ if (might_need_overlap_read &&
+ ((off_t)((char *) doc - buf->data()) + next_object_len >
(off_t)io.aiocb.aio_nbytes) &&
+ next_object_len > 0) {
+ off_t partial_object_len = io.aiocb.aio_nbytes - ((char *)doc -
buf->data());
+ // Copy partial object to beginning of the buffer.
+ memmove(buf->data(), (char *)doc, partial_object_len);
+ io.aiocb.aio_offset += io.aiocb.aio_nbytes;
+ io.aiocb.aio_nbytes = SCAN_BUF_SIZE - partial_object_len;
+ io.aiocb.aio_buf = buf->data() + partial_object_len;
+ scan_fix_buffer_offset = partial_object_len;
+ } else { // Normal case, where we ended on a object boundary.
+ io.aiocb.aio_offset += ((char *)doc - buf->data()) + next_object_len;
+ Debug("cache_scan_truss", "next %p:scanObject %lld", this,
io.aiocb.aio_offset);
+ io.aiocb.aio_offset = next_in_map(vol, scan_vol_map, io.aiocb.aio_offset);
+ Debug("cache_scan_truss", "next_in_map %p:scanObject %lld", this,
io.aiocb.aio_offset);
+ io.aiocb.aio_nbytes = SCAN_BUF_SIZE;
+ io.aiocb.aio_buf = buf->data();
+ scan_fix_buffer_offset = 0;
+ }
+
if (io.aiocb.aio_offset >= vol->skip + vol->len) {
SET_HANDLER(&CacheVC::scanVol);
eventProcessor.schedule_in(this, HRTIME_MSECONDS(scan_msec_delay));
@@ -263,13 +362,14 @@ Lread:
io.aiocb.aio_fildes = vol->fd;
if ((off_t)(io.aiocb.aio_offset + io.aiocb.aio_nbytes) > (off_t)(vol->skip +
vol->len))
io.aiocb.aio_nbytes = vol->skip + vol->len - io.aiocb.aio_offset;
- else
- io.aiocb.aio_nbytes = SCAN_BUF_SIZE;
offset = 0;
ink_assert(ink_aio_read(&io) >= 0);
+ Debug("cache_scan_truss", "read %p:scanObject %lld %lld", this,
+ (off_t)io.aiocb.aio_offset, (off_t)io.aiocb.aio_nbytes);
return EVENT_CONT;
Ldone:
+ Debug("cache_scan_truss", "done %p:scanObject", this);
_action.continuation->handleEvent(CACHE_EVENT_SCAN_DONE, result);
#ifdef HTTP_CACHE
Lcancel:
@@ -315,8 +415,10 @@ CacheVC::scanOpenWrite(int event, Event
int ret = 0;
{
CACHE_TRY_LOCK(lock, vol->mutex, mutex->thread_holding);
- if (!lock)
+ if (!lock) {
+ Debug("cache_scan", "vol->mutex %p:scanOpenWrite", this);
VC_SCHED_LOCK_RETRY();
+ }
Debug("cache_scan", "trying for writer lock");
if (vol->open_write(this, false, 1)) {
Modified: trafficserver/traffic/trunk/iocore/cache/P_CacheInternal.h
URL:
http://svn.apache.org/viewvc/trafficserver/traffic/trunk/iocore/cache/P_CacheInternal.h?rev=1097979&r1=1097978&r2=1097979&view=diff
==============================================================================
--- trafficserver/traffic/trunk/iocore/cache/P_CacheInternal.h (original)
+++ trafficserver/traffic/trunk/iocore/cache/P_CacheInternal.h Fri Apr 29
22:50:37 2011
@@ -481,6 +481,12 @@ struct CacheVC: public CacheVConnection
#endif
} f;
};
+ // BTF optimization used to skip reading stuff in cache partition that doesn't contain any
+ // dir entries.
+ char *scan_vol_map;
+ // BTF fix to handle objects that overlapped over two different reads;
+ // this is how much we need to back up the buffer to get the start of the overlapping object.
+ off_t scan_fix_buffer_offset;
//end region C
};
@@ -588,6 +594,8 @@ free_CacheVC(CacheVC *cont)
cont->alternate_index = CACHE_ALT_INDEX_DEFAULT;
if (cont->frag && cont->frag != cont->integral_frags)
xfree(cont->frag);
+ if (cont->scan_vol_map)
+ xfree(cont->scan_vol_map);
memset((char *) &cont->vio, 0, cont->size_to_init);
#ifdef CACHE_STAT_PAGES
ink_assert(!cont->stat_link.next && !cont->stat_link.prev);
Modified: trafficserver/traffic/trunk/iocore/cache/P_CacheVol.h
URL:
http://svn.apache.org/viewvc/trafficserver/traffic/trunk/iocore/cache/P_CacheVol.h?rev=1097979&r1=1097978&r2=1097979&view=diff
==============================================================================
--- trafficserver/traffic/trunk/iocore/cache/P_CacheVol.h (original)
+++ trafficserver/traffic/trunk/iocore/cache/P_CacheVol.h Fri Apr 29 22:50:37
2011
@@ -411,6 +411,13 @@ vol_in_phase_agg_buf_valid(Vol *d, Dir *
return (vol_offset(d, e) >= d->header->write_pos && vol_offset(d, e) <
(d->header->write_pos + d->agg_buf_pos));
}
+// Number of bytes from start_offset (the offset of location 0) up to the end
+// of the partition, where the end is v->skip + v->len.
+TS_INLINE off_t
+vol_relative_length(Vol *v, off_t start_offset)
+{
+  return v->skip + v->len - start_offset;
+}
+
TS_INLINE uint32_t
Doc::prefix_len()
{