Repository: lucy Updated Branches: refs/heads/master 488d6f4cf -> 291a32693
Avoid rescanning the index folder for latest snapshot Eliminate up to four unnecessary scans in BackgroundMerger: - BGMerger_init - S_merge_updated_deletions (twice) - BGMerger_Prepare_Commit Eliminate two unnecessary scans in Indexer: - Indexer_init - Indexer_Prepare_Commit Project: http://git-wip-us.apache.org/repos/asf/lucy/repo Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/291a3269 Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/291a3269 Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/291a3269 Branch: refs/heads/master Commit: 291a3269312a29792f8294fdf1b1f68dbfd94a79 Parents: 488d6f4 Author: Nick Wellnhofer <wellnho...@aevum.de> Authored: Wed Feb 22 18:55:05 2017 +0100 Committer: Nick Wellnhofer <wellnho...@aevum.de> Committed: Sat Mar 4 18:28:32 2017 +0100 ---------------------------------------------------------------------- core/Lucy/Index/BackgroundMerger.c | 43 +++++++++++++++------------------ core/Lucy/Index/IndexManager.c | 27 --------------------- core/Lucy/Index/IndexManager.cfh | 7 ------ core/Lucy/Index/Indexer.c | 25 ++++++++++--------- core/Lucy/Index/Indexer.cfh | 1 + core/Lucy/Util/IndexFileNames.c | 8 ++++++ core/Lucy/Util/IndexFileNames.cfh | 6 +++++ go/build.go | 1 - go/lucy/index.go | 12 --------- go/lucy/index_test.go | 5 ---- 10 files changed, 48 insertions(+), 87 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucy/blob/291a3269/core/Lucy/Index/BackgroundMerger.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Index/BackgroundMerger.c b/core/Lucy/Index/BackgroundMerger.c index c729bda..409571b 100644 --- a/core/Lucy/Index/BackgroundMerger.c +++ b/core/Lucy/Index/BackgroundMerger.c @@ -115,7 +115,8 @@ BGMerger_init(BackgroundMerger *self, Obj *index, IndexManager *manager) { // Open a PolyReader, passing in the IndexManager so we get a read lock on // the Snapshot's files -- so that Indexers 
don't zap our files while // we're operating in the background. - ivars->polyreader = PolyReader_open((Obj*)folder, NULL, ivars->manager); + ivars->polyreader + = PolyReader_open((Obj*)folder, ivars->snapshot, ivars->manager); // Clone the PolyReader's schema. Obj *dump = (Obj*)Schema_Dump(PolyReader_Get_Schema(ivars->polyreader)); @@ -249,12 +250,12 @@ S_maybe_merge(BackgroundMerger *self) { } static bool -S_merge_updated_deletions(BackgroundMerger *self) { +S_merge_updated_deletions(BackgroundMerger *self, Snapshot *latest_snapshot) { BackgroundMergerIVARS *const ivars = BGMerger_IVARS(self); Hash *updated_deletions = NULL; PolyReader *new_polyreader - = PolyReader_open((Obj*)ivars->folder, NULL, NULL); + = PolyReader_open((Obj*)ivars->folder, latest_snapshot, NULL); Vector *new_seg_readers = PolyReader_Get_Seg_Readers(new_polyreader); Vector *old_seg_readers @@ -305,8 +306,6 @@ S_merge_updated_deletions(BackgroundMerger *self) { = PolyReader_open((Obj*)ivars->folder, ivars->snapshot, NULL); Vector *merge_seg_readers = PolyReader_Get_Seg_Readers(merge_polyreader); - Snapshot *latest_snapshot - = Snapshot_Read_File(Snapshot_new(), ivars->folder, NULL); int64_t new_seg_num = IxManager_Highest_Seg_Num(ivars->manager, latest_snapshot) + 1; Segment *new_segment = Seg_new(new_seg_num); @@ -360,7 +359,6 @@ S_merge_updated_deletions(BackgroundMerger *self) { SegWriter_Finish(seg_writer); DECREF(seg_writer); DECREF(new_segment); - DECREF(latest_snapshot); DECREF(merge_polyreader); DECREF(updated_deletions); } @@ -411,33 +409,27 @@ BGMerger_Prepare_Commit_IMP(BackgroundMerger *self) { RETHROW(INCREF(Err_get_error())); } - // Write temporary snapshot file. 
- DECREF(ivars->snapfile); - String *snapfile = IxManager_Make_Snapshot_Filename(ivars->manager); - ivars->snapfile = Str_Cat_Trusted_Utf8(snapfile, ".temp", 5); - DECREF(snapfile); - Folder_Delete(folder, ivars->snapfile); - Snapshot_Write_File(snapshot, folder, ivars->snapfile); - // Determine whether the index has been updated while this background // merge process was running. String *start_snapfile = Snapshot_Get_Path(PolyReader_Get_Snapshot(ivars->polyreader)); - Snapshot *latest_snapshot - = Snapshot_Read_File(Snapshot_new(), ivars->folder, NULL); - String *latest_snapfile = Snapshot_Get_Path(latest_snapshot); + String *latest_snapfile = IxFileNames_latest_snapshot(ivars->folder); bool index_updated = !Str_Equals(start_snapfile, (Obj*)latest_snapfile); if (index_updated) { + Snapshot *latest_snapshot + = Snapshot_Read_File(Snapshot_new(), ivars->folder, + latest_snapfile); + /* See if new deletions have been applied since this * background merge process started against any of the * segments we just merged away. If that's true, we need to * write another segment which applies the deletions against * the new composite segment. */ - S_merge_updated_deletions(self); + S_merge_updated_deletions(self, latest_snapshot); // Add the fresh content to our snapshot. (It's important to // run this AFTER S_merge_updated_deletions, because otherwise @@ -452,16 +444,21 @@ BGMerger_Prepare_Commit_IMP(BackgroundMerger *self) { } } } - DECREF(files); - // Since the snapshot content has changed, we need to rewrite it. - Folder_Delete(folder, ivars->snapfile); - Snapshot_Write_File(snapshot, folder, ivars->snapfile); + DECREF(files); + DECREF(latest_snapshot); } - DECREF(latest_snapshot); + // Write temporary snapshot file. 
+ DECREF(ivars->snapfile); + uint64_t gen = IxFileNames_extract_gen(latest_snapfile); + ivars->snapfile = IxFileNames_make_temp_snapshot(gen + 1); + Folder_Delete(folder, ivars->snapfile); + Snapshot_Write_File(snapshot, folder, ivars->snapfile); ivars->needs_commit = true; + + DECREF(latest_snapfile); } // Close reader, so that we can delete its files if appropriate. http://git-wip-us.apache.org/repos/asf/lucy/blob/291a3269/core/Lucy/Index/IndexManager.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Index/IndexManager.c b/core/Lucy/Index/IndexManager.c index 15367fa..24eed1d 100644 --- a/core/Lucy/Index/IndexManager.c +++ b/core/Lucy/Index/IndexManager.c @@ -29,7 +29,6 @@ #include "Lucy/Store/LockFactory.h" #include "Lucy/Util/IndexFileNames.h" #include "Lucy/Util/Json.h" -#include "Lucy/Util/StringHelper.h" #include <stdlib.h> @@ -133,32 +132,6 @@ IxManager_Highest_Seg_Num_IMP(IndexManager *self, Snapshot *snapshot) { return (int64_t)highest_seg_num; } -String* -IxManager_Make_Snapshot_Filename_IMP(IndexManager *self) { - IndexManagerIVARS *const ivars = IxManager_IVARS(self); - Folder *folder = (Folder*)CERTIFY(ivars->folder, FOLDER); - DirHandle *dh = Folder_Open_Dir(folder, NULL); - uint64_t max_gen = 0; - - if (!dh) { RETHROW(INCREF(Err_get_error())); } - while (DH_Next(dh)) { - String *entry = DH_Get_Entry(dh); - if (Str_Starts_With_Utf8(entry, "snapshot_", 9) - && Str_Ends_With_Utf8(entry, ".json", 5) - ) { - uint64_t gen = IxFileNames_extract_gen(entry); - if (gen > max_gen) { max_gen = gen; } - } - DECREF(entry); - } - DECREF(dh); - - uint64_t new_gen = max_gen + 1; - char base36[StrHelp_MAX_BASE36_BYTES]; - StrHelp_to_base36(new_gen, &base36); - return Str_newf("snapshot_%s.json", &base36); -} - static int S_compare_doc_count(const void *va, const void *vb) { SegReader *a = *(SegReader**)va; http://git-wip-us.apache.org/repos/asf/lucy/blob/291a3269/core/Lucy/Index/IndexManager.cfh 
---------------------------------------------------------------------- diff --git a/core/Lucy/Index/IndexManager.cfh b/core/Lucy/Index/IndexManager.cfh index be6389a..f00cdc5 100644 --- a/core/Lucy/Index/IndexManager.cfh +++ b/core/Lucy/Index/IndexManager.cfh @@ -145,13 +145,6 @@ public class Lucy::Index::IndexManager nickname IxManager int64_t Highest_Seg_Num(IndexManager *self, Snapshot *snapshot); - /** Return the name of a new snapshot file, which shall contain a base-36 - * "generation" embedded inside it greater than the generation of any - * snapshot file currently in the index folder. - */ - incremented String* - Make_Snapshot_Filename(IndexManager *self); - /** Setter for write lock timeout. Default: 1000 milliseconds. */ public void http://git-wip-us.apache.org/repos/asf/lucy/blob/291a3269/core/Lucy/Index/Indexer.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Index/Indexer.c b/core/Lucy/Index/Indexer.c index 16b915b..05ee6d3 100644 --- a/core/Lucy/Index/Indexer.c +++ b/core/Lucy/Index/Indexer.c @@ -109,9 +109,9 @@ Indexer_init(Indexer *self, Schema *schema, Obj *index, } // Find the latest snapshot or create a new one. - String *latest_snapfile = IxFileNames_latest_snapshot(folder); - if (latest_snapfile) { - Snapshot_Read_File(latest_snapshot, folder, latest_snapfile); + ivars->latest_snapfile = IxFileNames_latest_snapshot(folder); + if (ivars->latest_snapfile) { + Snapshot_Read_File(latest_snapshot, folder, ivars->latest_snapfile); } // Look for an existing Schema if one wasn't supplied. 
@@ -119,7 +119,7 @@ Indexer_init(Indexer *self, Schema *schema, Obj *index, ivars->schema = (Schema*)INCREF(schema); } else { - if (!latest_snapfile) { + if (!ivars->latest_snapfile) { S_release_write_lock(self); THROW(ERR, "No Schema supplied, and can't find one in the index"); } @@ -147,13 +147,13 @@ Indexer_init(Indexer *self, Schema *schema, Obj *index, ivars->truncate = true; } else { - // TODO: clone most recent snapshot rather than read it twice. ivars->snapshot = (Snapshot*)INCREF(latest_snapshot); - ivars->polyreader = latest_snapfile - ? PolyReader_open((Obj*)folder, NULL, NULL) + ivars->polyreader = ivars->latest_snapfile + ? PolyReader_open((Obj*)folder, latest_snapshot, + NULL) : PolyReader_new(schema, folder, NULL, NULL, NULL); - if (latest_snapfile) { + if (ivars->latest_snapfile) { // Make sure than any existing fields which may have been // dynamically added during past indexing sessions get added. Schema *old_schema = PolyReader_Get_Schema(ivars->polyreader); @@ -215,7 +215,6 @@ Indexer_init(Indexer *self, Schema *schema, Obj *index, ivars->del_writer = (DeletionsWriter*)INCREF( SegWriter_Get_Del_Writer(ivars->seg_writer)); - DECREF(latest_snapfile); DECREF(latest_snapshot); return self; @@ -231,6 +230,7 @@ Indexer_Destroy_IMP(Indexer *self) { DECREF(ivars->segment); DECREF(ivars->manager); DECREF(ivars->stock_doc); + DECREF(ivars->latest_snapfile); DECREF(ivars->polyreader); DECREF(ivars->del_writer); DECREF(ivars->snapshot); @@ -509,9 +509,10 @@ Indexer_Prepare_Commit_IMP(Indexer *self) { // Derive snapshot and schema file names. DECREF(ivars->snapfile); - String *snapfile = IxManager_Make_Snapshot_Filename(ivars->manager); - ivars->snapfile = Str_Cat_Trusted_Utf8(snapfile, ".temp", 5); - DECREF(snapfile); + uint64_t gen = ivars->latest_snapfile + ? 
IxFileNames_extract_gen(ivars->latest_snapfile) + : 0; + ivars->snapfile = IxFileNames_make_temp_snapshot(gen + 1); uint64_t schema_gen = IxFileNames_extract_gen(ivars->snapfile); char base36[StrHelp_MAX_BASE36_BYTES]; StrHelp_to_base36(schema_gen, &base36); http://git-wip-us.apache.org/repos/asf/lucy/blob/291a3269/core/Lucy/Index/Indexer.cfh ---------------------------------------------------------------------- diff --git a/core/Lucy/Index/Indexer.cfh b/core/Lucy/Index/Indexer.cfh index 5a01f8b..3f75901 100644 --- a/core/Lucy/Index/Indexer.cfh +++ b/core/Lucy/Index/Indexer.cfh @@ -43,6 +43,7 @@ public class Lucy::Index::Indexer inherits Clownfish::Obj { Folder *folder; Segment *segment; IndexManager *manager; + String *latest_snapfile; PolyReader *polyreader; Snapshot *snapshot; SegWriter *seg_writer; http://git-wip-us.apache.org/repos/asf/lucy/blob/291a3269/core/Lucy/Util/IndexFileNames.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Util/IndexFileNames.c b/core/Lucy/Util/IndexFileNames.c index a1a063b..2298b3b 100644 --- a/core/Lucy/Util/IndexFileNames.c +++ b/core/Lucy/Util/IndexFileNames.c @@ -20,6 +20,7 @@ #include "Lucy/Util/IndexFileNames.h" #include "Lucy/Store/DirHandle.h" #include "Lucy/Store/Folder.h" +#include "Lucy/Util/StringHelper.h" String* IxFileNames_latest_snapshot(Folder *folder) { @@ -69,6 +70,13 @@ IxFileNames_extract_gen(String *name) { } String* +IxFileNames_make_temp_snapshot(uint64_t gen) { + char base36[StrHelp_MAX_BASE36_BYTES]; + StrHelp_to_base36(gen, &base36); + return Str_newf("snapshot_%s.json.temp", &base36); +} + +String* IxFileNames_local_part(String *path) { StringIterator *top = Str_Tail(path); int32_t code_point = StrIter_Prev(top); http://git-wip-us.apache.org/repos/asf/lucy/blob/291a3269/core/Lucy/Util/IndexFileNames.cfh ---------------------------------------------------------------------- diff --git a/core/Lucy/Util/IndexFileNames.cfh b/core/Lucy/Util/IndexFileNames.cfh 
index e1bbff3..ea85a9a 100644 --- a/core/Lucy/Util/IndexFileNames.cfh +++ b/core/Lucy/Util/IndexFileNames.cfh @@ -35,6 +35,12 @@ inert class Lucy::Util::IndexFileNames nickname IxFileNames { inert incremented nullable String* latest_snapshot(Folder *folder); + /** Return the name of a new temporary snapshot file, which shall contain + * a base-36 "generation" embedded inside it. + */ + inert incremented String* + make_temp_snapshot(uint64_t gen); + /** Split the `path` on '/' and return the last component. * Trailing slashes will be stripped. */ http://git-wip-us.apache.org/repos/asf/lucy/blob/291a3269/go/build.go ---------------------------------------------------------------------- diff --git a/go/build.go b/go/build.go index b751ba5..553b7a9 100644 --- a/go/build.go +++ b/go/build.go @@ -259,7 +259,6 @@ func specClasses(parcel *cfc.Parcel) { managerBinding.SpecMethod("Write_Merge_Data", "WriteMergeData(int64) error") managerBinding.SpecMethod("Read_Merge_Data", "ReadMergeData() (map[string]interface{}, error)") managerBinding.SpecMethod("Remove_Merge_Data", "RemoveMergeData() error") - managerBinding.SpecMethod("Make_Snapshot_Filename", "MakeSnapshotFilename() (string, error)") managerBinding.SpecMethod("Recycle", "Recycle(PolyReader, DeletionsWriter, int64, bool) ([]SegReader, error)") managerBinding.Register() http://git-wip-us.apache.org/repos/asf/lucy/blob/291a3269/go/lucy/index.go ---------------------------------------------------------------------- diff --git a/go/lucy/index.go b/go/lucy/index.go index 6f627f0..6e8142e 100644 --- a/go/lucy/index.go +++ b/go/lucy/index.go @@ -396,18 +396,6 @@ func (im *IndexManagerIMP) RemoveMergeData() error { }) } -func (im *IndexManagerIMP) MakeSnapshotFilename() (retval string, err error) { - err = clownfish.TrapErr(func() { - self := (*C.lucy_IndexManager)(clownfish.Unwrap(im, "im")) - retvalC := C.LUCY_IxManager_Make_Snapshot_Filename(self) - if retvalC != nil { - defer C.cfish_decref(unsafe.Pointer(retvalC)) - 
retval = clownfish.ToGo(unsafe.Pointer(retvalC)).(string) - } - }) - return retval, err -} - func (im *IndexManagerIMP) Recycle(reader PolyReader, delWriter DeletionsWriter, cutoff int64, optimize bool) (retval []SegReader, err error) { err = clownfish.TrapErr(func() { http://git-wip-us.apache.org/repos/asf/lucy/blob/291a3269/go/lucy/index_test.go ---------------------------------------------------------------------- diff --git a/go/lucy/index_test.go b/go/lucy/index_test.go index b8823e8..0049ee4 100644 --- a/go/lucy/index_test.go +++ b/go/lucy/index_test.go @@ -19,7 +19,6 @@ package lucy import "testing" import "os" import "reflect" -import "strings" import "git-wip-us.apache.org/repos/asf/lucy-clownfish.git/runtime/go/clownfish" @@ -224,10 +223,6 @@ func TestIndexManagerMergeData(t *testing.T) { func TestIndexManagerMisc(t *testing.T) { manager := NewIndexManager("", nil) - manager.SetFolder(NewRAMFolder("")) - if got, err := manager.MakeSnapshotFilename(); !strings.Contains(got, "snapshot") || err != nil { - t.Errorf("MakeSnapshotFilename: %s, %v", got, err) - } snapshot := NewSnapshot() snapshot.AddEntry("seg_4") snapshot.AddEntry("seg_5")