Fix IndexManager with many small segments IxManager_Choose_Sparse would throw when handling more than about 40 segments because of the integer overflow test in S_fibonacci.
Also, the naive recursive implementation of S_fibonacci had abysmal performance. After all, the result was created by adding only 0s and 1s. The number of calls to S_fibonacci when computing fib(n) was fib(n+1), resulting in more than a billion of iterations with n ~ 45. On my (heavily loaded and low-end) test machine, Choose_Sparse could take up to 40 seconds to complete. On the positive side, if anyone was hit by this performance problem, it's likely they also saw the "index n too large" exception and we only had two reports so far. Use a precomputed lookup table and fix the test for large values of n. Fixes LUCY-318. Project: http://git-wip-us.apache.org/repos/asf/lucy/repo Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/e2e428f2 Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/e2e428f2 Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/e2e428f2 Branch: refs/heads/0.5 Commit: e2e428f292397d47e83b6a76c237791757ff830a Parents: 2671da3 Author: Nick Wellnhofer <wellnho...@aevum.de> Authored: Wed Dec 7 16:44:02 2016 +0100 Committer: Nick Wellnhofer <wellnho...@aevum.de> Committed: Wed Dec 7 17:28:53 2016 +0100 ---------------------------------------------------------------------- core/Lucy/Index/IndexManager.c | 69 +++++++++++++++++++++------- core/Lucy/Test/Index/TestIndexManager.c | 13 +++++- 2 files changed, 65 insertions(+), 17 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucy/blob/e2e428f2/core/Lucy/Index/IndexManager.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Index/IndexManager.c b/core/Lucy/Index/IndexManager.c index 7823499..b20b270 100644 --- a/core/Lucy/Index/IndexManager.c +++ b/core/Lucy/Index/IndexManager.c @@ -33,6 +33,56 @@ #include <stdlib.h> +static const int32_t S_fibonacci[47] = { + 0, + 1, + 1, + 2, + 3, + 5, + 8, + 13, + 21, + 34, + 55, + 89, + 144, + 233, + 377, + 610, + 987, + 1597, + 2584, + 4181, + 6765, + 10946, + 17711, + 28657, + 46368, + 75025, + 121393, + 196418, + 317811, + 514229, + 832040, + 1346269, + 2178309, + 3524578, + 5702887, + 9227465, + 14930352, + 24157817, + 39088169, + 63245986, + 102334155, + 165580141, + 267914296, + 433494437, + 701408733, + 1134903170, + 1836311903 +}; + IndexManager* IxManager_new(String *host, LockFactory *lock_factory) { IndexManager *self = (IndexManager*)Class_Make_Obj(INDEXMANAGER); @@ -116,21 +166,6 @@ S_compare_doc_count(const void *va, const void *vb) { return SegReader_Doc_Count(a) - SegReader_Doc_Count(b); } -static uint32_t -S_fibonacci(uint32_t n) { - uint32_t result = 0; - if (n > 46) { - THROW(ERR, "input %u32 too high", n); - } - else if (n < 2) { - result = n; - } - else { - result = S_fibonacci(n - 1) + S_fibonacci(n - 2); - } - return result; -} - Vector* IxManager_Recycle_IMP(IndexManager *self, PolyReader *reader, DeletionsWriter *del_writer, int64_t cutoff, @@ -199,7 +234,9 @@ IxManager_Choose_Sparse_IMP(IndexManager *self, I32Array *doc_counts) { for (uint32_t i = 0; i < num_candidates; i++) { uint32_t num_segs_when_done = num_candidates - threshold + 1; total_docs += I32Arr_Get(doc_counts, i); - if (total_docs < S_fibonacci(num_segs_when_done + 5)) { + uint32_t n = num_segs_when_done + 5; + if (n >= sizeof(S_fibonacci) / sizeof(S_fibonacci[0]) + || total_docs < S_fibonacci[n]) { threshold = i + 1; } } http://git-wip-us.apache.org/repos/asf/lucy/blob/e2e428f2/core/Lucy/Test/Index/TestIndexManager.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Test/Index/TestIndexManager.c b/core/Lucy/Test/Index/TestIndexManager.c index 87b1247..7bc0661 100644 --- a/core/Lucy/Test/Index/TestIndexManager.c +++ b/core/Lucy/Test/Index/TestIndexManager.c @@ -52,12 +52,23 @@ test_Choose_Sparse(TestBatchRunner *runner) { DECREF(doc_counts); } + { + size_t num_segs = 1000; + I32Array *doc_counts = I32Arr_new_blank(num_segs); + for (size_t j = 0; j < num_segs; j++) { + I32Arr_Set(doc_counts, j, 1); + } + uint32_t threshold = IxManager_Choose_Sparse(manager, doc_counts); + TEST_TRUE(runner, threshold > 500, "Many small segments"); + DECREF(doc_counts); + } + DECREF(manager); } void TestIxManager_Run_IMP(TestIndexManager *self, TestBatchRunner *runner) { - TestBatchRunner_Plan(runner, (TestBatch*)self, 34); + TestBatchRunner_Plan(runner, (TestBatch*)self, 35); test_Choose_Sparse(runner); }