Port Facet.Sampling
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/ff4fe045 Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/ff4fe045 Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/ff4fe045 Branch: refs/heads/branch_4x Commit: ff4fe04539f3cba515d09ab67a86e000069326eb Parents: 71e218c Author: Paul Irwin <[email protected]> Authored: Wed Nov 6 11:35:59 2013 -0500 Committer: Paul Irwin <[email protected]> Committed: Wed Nov 6 11:35:59 2013 -0500 ---------------------------------------------------------------------- src/contrib/Facet/Contrib.Facet.csproj | 9 + src/contrib/Facet/Sampling/ISampleFixer.cs | 13 + src/contrib/Facet/Sampling/RandomSampler.cs | 56 ++++ src/contrib/Facet/Sampling/RepeatableSampler.cs | 294 +++++++++++++++++++ src/contrib/Facet/Sampling/Sampler.cs | 170 +++++++++++ .../Facet/Sampling/SamplingAccumulator.cs | 72 +++++ src/contrib/Facet/Sampling/SamplingParams.cs | 86 ++++++ src/contrib/Facet/Sampling/SamplingWrapper.cs | 58 ++++ src/contrib/Facet/Sampling/TakmiSampleFixer.cs | 125 ++++++++ src/contrib/Facet/Search/DrillDownQuery.cs | 201 +++++++++++++ src/core/Lucene.Net.csproj | 2 + src/core/Support/DateTimeExtensions.cs | 17 ++ src/core/Support/IDictionaryExtensions.cs | 18 ++ 13 files changed, 1121 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ff4fe045/src/contrib/Facet/Contrib.Facet.csproj ---------------------------------------------------------------------- diff --git a/src/contrib/Facet/Contrib.Facet.csproj b/src/contrib/Facet/Contrib.Facet.csproj index 5c15e5b..9f15e12 100644 --- a/src/contrib/Facet/Contrib.Facet.csproj +++ b/src/contrib/Facet/Contrib.Facet.csproj @@ -86,11 +86,20 @@ <Compile Include="Partitions\IIntermediateFacetResult.cs" /> <Compile Include="Partitions\PartitionsFacetResultsHandler.cs" /> <Compile Include="Properties\AssemblyInfo.cs" /> + <Compile Include="Sampling\ISampleFixer.cs" /> + <Compile Include="Sampling\RandomSampler.cs" /> + <Compile Include="Sampling\RepeatableSampler.cs" /> + <Compile Include="Sampling\Sampler.cs" /> + <Compile Include="Sampling\SamplingAccumulator.cs" /> + <Compile Include="Sampling\SamplingParams.cs" /> + <Compile Include="Sampling\SamplingWrapper.cs" /> + <Compile Include="Sampling\TakmiSampleFixer.cs" /> <Compile Include="Search\CountFacetRequest.cs" /> <Compile Include="Search\CountingAggregator.cs" /> <Compile Include="Search\CountingFacetsAggregator.cs" /> <Compile Include="Search\DepthOneFacetResultsHandler.cs" /> <Compile Include="Search\DocValuesCategoryListIterator.cs" /> + <Compile Include="Search\DrillDownQuery.cs" /> <Compile Include="Search\FacetArrays.cs" /> <Compile Include="Search\FacetRequest.cs" /> <Compile Include="Search\FacetResult.cs" /> http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ff4fe045/src/contrib/Facet/Sampling/ISampleFixer.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Facet/Sampling/ISampleFixer.cs b/src/contrib/Facet/Sampling/ISampleFixer.cs new file mode 100644 index 0000000..57146b5 --- /dev/null +++ b/src/contrib/Facet/Sampling/ISampleFixer.cs @@ -0,0 +1,13 @@ +using Lucene.Net.Facet.Search; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace Lucene.Net.Facet.Sampling +{ + public interface ISampleFixer + { + void FixResult(IScoredDocIDs origDocIds, FacetResult fres); + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ff4fe045/src/contrib/Facet/Sampling/RandomSampler.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Facet/Sampling/RandomSampler.cs b/src/contrib/Facet/Sampling/RandomSampler.cs new file mode 100644 index 0000000..1dc7f1c --- /dev/null +++ b/src/contrib/Facet/Sampling/RandomSampler.cs @@ -0,0 +1,56 @@ +using Lucene.Net.Facet.Search; +using Lucene.Net.Facet.Util; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace Lucene.Net.Facet.Sampling +{ + public class RandomSampler : Sampler + { + private readonly Random random; + + public RandomSampler() + : base() + { + this.random = new Random(); + } + + public RandomSampler(SamplingParams params_renamed, Random random) + : base(params_renamed) + { + this.random = random; + } + + protected override SampleResult CreateSample(IScoredDocIDs docids, int actualSize, int sampleSetSize) + { + int[] sample = new int[sampleSetSize]; + int maxStep = (actualSize * 2) / sampleSetSize; + int remaining = actualSize; + IScoredDocIDsIterator it = docids.Iterator(); + int i = 0; + while (i < sample.Length && remaining > (sampleSetSize - maxStep - i)) + { + int skipStep = 1 + random.Next(maxStep); + for (int j = 0; j < skipStep; j++) + { + it.Next(); + --remaining; + } + + sample[i++] = it.DocID; + } + + while (i < sample.Length) + { + it.Next(); + sample[i++] = it.DocID; + } + + IScoredDocIDs sampleRes = ScoredDocIdsUtils.CreateScoredDocIDsSubset(docids, sample); + SampleResult res = new SampleResult(sampleRes, sampleSetSize / (double)actualSize); + return res; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ff4fe045/src/contrib/Facet/Sampling/RepeatableSampler.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Facet/Sampling/RepeatableSampler.cs b/src/contrib/Facet/Sampling/RepeatableSampler.cs new file mode 100644 index 0000000..d0f10ee --- /dev/null +++ b/src/contrib/Facet/Sampling/RepeatableSampler.cs @@ -0,0 +1,294 @@ +using Lucene.Net.Facet.Search; +using Lucene.Net.Facet.Util; +using Lucene.Net.Support; +using Lucene.Net.Util; +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Linq; +using System.Text; + +namespace Lucene.Net.Facet.Sampling +{ + public class RepeatableSampler : Sampler + { + public RepeatableSampler(SamplingParams params_renamed) + : base(params_renamed) + { + } + + protected override SampleResult CreateSample(IScoredDocIDs docids, int actualSize, int sampleSetSize) + { + int[] sampleSet = null; + try + { + sampleSet = RepeatableSample(docids, actualSize, sampleSetSize); + } + catch (IOException e) + { + Trace.TraceWarning(@"sampling failed: " + e.Message + @" - falling back to no sampling!", e); + + return new SampleResult(docids, 1.0); + } + + IScoredDocIDs sampled = ScoredDocIdsUtils.CreateScoredDocIDsSubset(docids, sampleSet); + Debug.WriteLine(@"******************** " + sampled.Size); + + return new SampleResult(sampled, sampled.Size / (double)docids.Size); + } + + private static int[] RepeatableSample(IScoredDocIDs collection, int collectionSize, int sampleSize) + { + return RepeatableSample(collection, collectionSize, sampleSize, Algorithm.HASHING, Sorted.NO); + } + + private static int[] RepeatableSample(IScoredDocIDs collection, int collectionSize, int sampleSize, Algorithm algorithm, Sorted sorted) + { + if (collection == null) + { + throw new IOException(@"docIdSet is null"); + } + + if (sampleSize < 1) + { + throw new IOException(@"sampleSize < 1 (" + sampleSize + @")"); + } + + if (collectionSize < sampleSize) + { + throw new IOException(@"collectionSize (" + collectionSize + @") less than sampleSize (" + sampleSize + @")"); + } + + int[] sample = new int[sampleSize]; + long[] times = new long[4]; + if (algorithm == Algorithm.TRAVERSAL) + { + Sample1(collection, collectionSize, sample, times); + } + else if (algorithm == Algorithm.HASHING) + { + Sample2(collection, collectionSize, sample, times); + } + else + { + throw new ArgumentException(@"Invalid algorithm selection"); + } + + if (sorted == Sorted.YES) + { + Array.Sort(sample); + } + + if (returnTimings) + { + times[3] = DateTime.UtcNow.CurrentTimeMillis(); + Debug.WriteLine(@"Times: " + (times[1] - times[0]) + @"ms, " + (times[2] - times[1]) + @"ms, " + (times[3] - times[2]) + @"ms"); + } + + return sample; + } + + private static void Sample1(IScoredDocIDs collection, int collectionSize, int[] sample, long[] times) + { + IScoredDocIDsIterator it = collection.Iterator(); + if (returnTimings) + { + times[0] = DateTime.UtcNow.CurrentTimeMillis(); + } + + int sampleSize = sample.Length; + int prime = FindGoodStepSize(collectionSize, sampleSize); + int mod = prime % collectionSize; + if (returnTimings) + { + times[1] = DateTime.UtcNow.CurrentTimeMillis(); + } + + int sampleCount = 0; + int index = 0; + for (; sampleCount < sampleSize; ) + { + if (index + mod < collectionSize) + { + for (int i = 0; i < mod; i++, index++) + { + it.Next(); + } + } + else + { + index = index + mod - collectionSize; + it = collection.Iterator(); + for (int i = 0; i < index; i++) + { + it.Next(); + } + } + + sample[sampleCount++] = it.DocID; + } + + if (returnTimings) + { + times[2] = DateTime.UtcNow.CurrentTimeMillis(); + } + } + + private static int FindGoodStepSize(int collectionSize, int sampleSize) + { + int i = (int)Math.Sqrt(collectionSize); + if (sampleSize < i) + { + i = collectionSize / sampleSize; + } + + do + { + i = FindNextPrimeAfter(i); + } + while (collectionSize % i == 0); + return i; + } + + private static int FindNextPrimeAfter(int n) + { + n += (n % 2 == 0) ? 1 : 2; + + for (; ; n += 2) + { + bool shouldContinueOuter = false; + + int sri = (int)(Math.Sqrt(n)); + + for (int primeIndex = 0; primeIndex < N_PRIMES; primeIndex++) + { + int p = primes[primeIndex]; + if (p > sri) + { + return n; + } + + if (n % p == 0) + { + shouldContinueOuter = true; + break; + } + } + + if (shouldContinueOuter) + continue; + + for (int p = primes[N_PRIMES - 1] + 2; ; p += 2) + { + if (p > sri) + { + return n; + } + + if (n % p == 0) + { + shouldContinueOuter = true; + break; + } + } + + if (shouldContinueOuter) + continue; + } + } + + private static readonly int N_PRIMES = 4000; + private static int[] primes = new int[N_PRIMES]; + static RepeatableSampler() + { + primes[0] = 3; + for (int count = 1; count < N_PRIMES; count++) + { + primes[count] = FindNextPrimeAfter(primes[count - 1]); + } + } + + private static void Sample2(IScoredDocIDs collection, int collectionSize, int[] sample, long[] times) + { + if (returnTimings) + { + times[0] = DateTime.UtcNow.CurrentTimeMillis(); + } + + int sampleSize = sample.Length; + IntPriorityQueue pq = new IntPriorityQueue(sampleSize); + IScoredDocIDsIterator it = collection.Iterator(); + MI mi = null; + while (it.Next()) + { + if (mi == null) + { + mi = new MI(); + } + + mi.value = (int)(it.DocID * PHI_32) & 0x7FFFFFFF; + mi = pq.InsertWithOverflow(mi); + } + + if (returnTimings) + { + times[1] = DateTime.UtcNow.CurrentTimeMillis(); + } + + Object[] heap = pq.GetHeap(); + for (int si = 0; si < sampleSize; si++) + { + sample[si] = (int)(((MI)heap[si + 1]).value * PHI_32I) & 0x7FFFFFFF; + } + + if (returnTimings) + { + times[2] = DateTime.UtcNow.CurrentTimeMillis(); + } + } + + private class MI + { + internal MI() + { + } + + public int value; + } + + private class IntPriorityQueue : Lucene.Net.Util.PriorityQueue<MI> + { + public IntPriorityQueue(int size) + : base(size) + { + } + + public virtual Object[] GetHeap() + { + return GetHeapArray(); + } + + public override bool LessThan(MI o1, MI o2) + { + return o1.value < o2.value; + } + } + + private enum Algorithm + { + TRAVERSAL, + HASHING + } + + private enum Sorted + { + YES, + NO + } + + private static readonly long PHI_32 = 2654435769L; + private static readonly long PHI_32I = 340573321L; + private static bool returnTimings = false; + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ff4fe045/src/contrib/Facet/Sampling/Sampler.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Facet/Sampling/Sampler.cs b/src/contrib/Facet/Sampling/Sampler.cs new file mode 100644 index 0000000..1941a70 --- /dev/null +++ b/src/contrib/Facet/Sampling/Sampler.cs @@ -0,0 +1,170 @@ +using Lucene.Net.Facet.Params; +using Lucene.Net.Facet.Search; +using Lucene.Net.Facet.Taxonomy; +using Lucene.Net.Index; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace Lucene.Net.Facet.Sampling +{ + public abstract class Sampler + { + protected readonly SamplingParams samplingParams; + public Sampler() + : this(new SamplingParams()) + { + } + + public Sampler(SamplingParams params_renamed) + { + if (!params_renamed.Validate()) + { + throw new ArgumentException(@"The provided SamplingParams are not valid!!"); + } + + this.samplingParams = params_renamed; + } + + public virtual bool ShouldSample(IScoredDocIDs docIds) + { + return docIds.Size > samplingParams.SamplingThreshold; + } + + public virtual SampleResult GetSampleSet(IScoredDocIDs docids) + { + if (!ShouldSample(docids)) + { + return new SampleResult(docids, 1.0); + } + + int actualSize = docids.Size; + int sampleSetSize = (int)(actualSize * samplingParams.SampleRatio); + sampleSetSize = Math.Max(sampleSetSize, samplingParams.MinSampleSize); + sampleSetSize = Math.Min(sampleSetSize, samplingParams.MaxSampleSize); + return CreateSample(docids, actualSize, sampleSetSize); + } + + protected abstract SampleResult CreateSample(IScoredDocIDs docids, int actualSize, int sampleSetSize); + + public virtual ISampleFixer GetSampleFixer(IndexReader indexReader, TaxonomyReader taxonomyReader, FacetSearchParams searchParams) + { + return new TakmiSampleFixer(indexReader, taxonomyReader, searchParams); + } + + public sealed class SampleResult + { + public readonly IScoredDocIDs docids; + public readonly double actualSampleRatio; + + internal SampleResult(IScoredDocIDs docids, double actualSampleRatio) + { + this.docids = docids; + this.actualSampleRatio = actualSampleRatio; + } + } + + public SamplingParams SamplingParams + { + get + { + return samplingParams; + } + } + + public virtual FacetResult TrimResult(FacetResult facetResult) + { + double overSampleFactor = SamplingParams.OversampleFactor; + if (overSampleFactor <= 1) + { + return facetResult; + } + + OverSampledFacetRequest sampledFreq = null; + try + { + sampledFreq = (OverSampledFacetRequest)facetResult.FacetRequest; + } + catch (InvalidCastException e) + { + throw new ArgumentException(@"It is only valid to call this method with result obtained for a " + @"facet request created through sampler.overSamlpingSearchParams()", e); + } + + FacetRequest origFrq = sampledFreq.orig; + FacetResultNode trimmedRootNode = facetResult.FacetResultNode; + TrimSubResults(trimmedRootNode, origFrq.numResults); + return new FacetResult(origFrq, trimmedRootNode, facetResult.NumValidDescendants); + } + + private void TrimSubResults(FacetResultNode node, int size) + { + if (node.subResults == FacetResultNode.EMPTY_SUB_RESULTS || node.subResults.Count == 0) + { + return; + } + + List<FacetResultNode> trimmed = new List<FacetResultNode>(size); + for (int i = 0; i < node.subResults.Count && i < size; i++) + { + FacetResultNode trimmedNode = node.subResults[i]; + TrimSubResults(trimmedNode, size); + trimmed.Add(trimmedNode); + } + + node.subResults = trimmed; + } + + public virtual FacetSearchParams OverSampledSearchParams(FacetSearchParams original) + { + FacetSearchParams res = original; + double overSampleFactor = SamplingParams.OversampleFactor; + if (overSampleFactor > 1) + { + List<FacetRequest> facetRequests = new List<FacetRequest>(); + foreach (FacetRequest frq in original.facetRequests) + { + int overSampledNumResults = (int)Math.Ceiling(frq.numResults * overSampleFactor); + facetRequests.Add(new OverSampledFacetRequest(frq, overSampledNumResults)); + } + + res = new FacetSearchParams(original.indexingParams, facetRequests); + } + + return res; + } + + private class OverSampledFacetRequest : FacetRequest + { + internal readonly FacetRequest orig; + + public OverSampledFacetRequest(FacetRequest orig, int num) + : base(orig.categoryPath, num) + { + this.orig = orig; + Depth = orig.Depth; + NumLabel = orig.NumLabel; + ResultModeValue = orig.ResultModeValue; + SortOrderValue = orig.SortOrderValue; + } + + public override IAggregator CreateAggregator(bool useComplements, FacetArrays arrays, TaxonomyReader taxonomy) + { + return orig.CreateAggregator(useComplements, arrays, taxonomy); + } + + public override FacetArraysSource FacetArraysSourceValue + { + get + { + return orig.FacetArraysSourceValue; + } + } + + public override double GetValueOf(FacetArrays arrays, int idx) + { + return orig.GetValueOf(arrays, idx); + } + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ff4fe045/src/contrib/Facet/Sampling/SamplingAccumulator.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Facet/Sampling/SamplingAccumulator.cs b/src/contrib/Facet/Sampling/SamplingAccumulator.cs new file mode 100644 index 0000000..8aefb4b --- /dev/null +++ b/src/contrib/Facet/Sampling/SamplingAccumulator.cs @@ -0,0 +1,72 @@ +using Lucene.Net.Facet.Params; +using Lucene.Net.Facet.Partitions; +using Lucene.Net.Facet.Search; +using Lucene.Net.Facet.Taxonomy; +using Lucene.Net.Index; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace Lucene.Net.Facet.Sampling +{ + public class SamplingAccumulator : StandardFacetsAccumulator + { + private double samplingRatio = -1.0; + private readonly Sampler sampler; + + public SamplingAccumulator(Sampler sampler, FacetSearchParams searchParams, IndexReader indexReader, TaxonomyReader taxonomyReader, FacetArrays facetArrays) + : base(searchParams, indexReader, taxonomyReader, facetArrays) + { + this.sampler = sampler; + } + + public SamplingAccumulator(Sampler sampler, FacetSearchParams searchParams, IndexReader indexReader, TaxonomyReader taxonomyReader) + : base(searchParams, indexReader, taxonomyReader) + { + this.sampler = sampler; + } + + public override List<FacetResult> Accumulate(IScoredDocIDs docids) + { + FacetSearchParams original = searchParams; + searchParams = sampler.OverSampledSearchParams(original); + List<FacetResult> sampleRes = base.Accumulate(docids); + List<FacetResult> fixedRes = new List<FacetResult>(); + foreach (FacetResult fres in sampleRes) + { + var freswritable = fres; + + PartitionsFacetResultsHandler frh = (PartitionsFacetResultsHandler)CreateFacetResultsHandler(freswritable.FacetRequest); + sampler.GetSampleFixer(indexReader, taxonomyReader, searchParams).FixResult(docids, freswritable); + freswritable = frh.RearrangeFacetResult(freswritable); + freswritable = sampler.TrimResult(freswritable); + frh.LabelResult(freswritable); + fixedRes.Add(freswritable); + } + + searchParams = original; + return fixedRes; + } + + protected override IScoredDocIDs ActualDocsToAccumulate(IScoredDocIDs docids) + { + Sampler.SampleResult sampleRes = sampler.GetSampleSet(docids); + samplingRatio = sampleRes.actualSampleRatio; + return sampleRes.docids; + } + + protected override double TotalCountsFactor + { + get + { + if (samplingRatio < 0) + { + throw new InvalidOperationException(@"Total counts ratio unavailable because actualDocsToAccumulate() was not invoked"); + } + + return samplingRatio; + } + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ff4fe045/src/contrib/Facet/Sampling/SamplingParams.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Facet/Sampling/SamplingParams.cs b/src/contrib/Facet/Sampling/SamplingParams.cs new file mode 100644 index 0000000..7ffc68e --- /dev/null +++ b/src/contrib/Facet/Sampling/SamplingParams.cs @@ -0,0 +1,86 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace Lucene.Net.Facet.Sampling +{ + public class SamplingParams + { + public static readonly double DEFAULT_OVERSAMPLE_FACTOR = 2.0; + public static readonly double DEFAULT_SAMPLE_RATIO = 0.0; + public static readonly int DEFAULT_MAX_SAMPLE_SIZE = 10000; + public static readonly int DEFAULT_MIN_SAMPLE_SIZE = 100; + public static readonly int DEFAULT_SAMPLING_THRESHOLD = 75000; + private int maxSampleSize = DEFAULT_MAX_SAMPLE_SIZE; + private int minSampleSize = DEFAULT_MIN_SAMPLE_SIZE; + private double sampleRatio = DEFAULT_SAMPLE_RATIO; + private int samplingThreshold = DEFAULT_SAMPLING_THRESHOLD; + private double oversampleFactor = DEFAULT_OVERSAMPLE_FACTOR; + + public int MaxSampleSize + { + get + { + return maxSampleSize; + } + set + { + maxSampleSize = value; + } + } + + public int MinSampleSize + { + get + { + return minSampleSize; + } + set + { + minSampleSize = value; + } + } + + public double SampleRatio + { + get + { + return sampleRatio; + } + set + { + sampleRatio = value; + } + } + + public int SamplingThreshold + { + get + { + return samplingThreshold; + } + set + { + samplingThreshold = value; + } + } + + public virtual bool Validate() + { + return samplingThreshold >= maxSampleSize && maxSampleSize >= minSampleSize && sampleRatio > 0 && sampleRatio < 1; + } + + public double OversampleFactor + { + get + { + return oversampleFactor; + } + set + { + oversampleFactor = value; + } + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ff4fe045/src/contrib/Facet/Sampling/SamplingWrapper.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Facet/Sampling/SamplingWrapper.cs b/src/contrib/Facet/Sampling/SamplingWrapper.cs new file mode 100644 index 0000000..ce9400a --- /dev/null +++ b/src/contrib/Facet/Sampling/SamplingWrapper.cs @@ -0,0 +1,58 @@ +using Lucene.Net.Facet.Params; +using Lucene.Net.Facet.Partitions; +using Lucene.Net.Facet.Search; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace Lucene.Net.Facet.Sampling +{ + public class SamplingWrapper : StandardFacetsAccumulator + { + private StandardFacetsAccumulator delegee; + private Sampler sampler; + + public SamplingWrapper(StandardFacetsAccumulator delegee, Sampler sampler) + : base(delegee.searchParams, delegee.indexReader, delegee.taxonomyReader) + { + this.delegee = delegee; + this.sampler = sampler; + } + + public override List<FacetResult> Accumulate(IScoredDocIDs docids) + { + FacetSearchParams original = delegee.searchParams; + delegee.searchParams = sampler.OverSampledSearchParams(original); + Sampler.SampleResult sampleSet = sampler.GetSampleSet(docids); + List<FacetResult> sampleRes = delegee.Accumulate(sampleSet.docids); + List<FacetResult> fixedRes = new List<FacetResult>(); + foreach (FacetResult fres in sampleRes) + { + var freswritable = fres; + + PartitionsFacetResultsHandler frh = (PartitionsFacetResultsHandler)CreateFacetResultsHandler(freswritable.FacetRequest); + sampler.GetSampleFixer(indexReader, taxonomyReader, searchParams).FixResult(docids, freswritable); + freswritable = frh.RearrangeFacetResult(freswritable); + freswritable = sampler.TrimResult(freswritable); + frh.LabelResult(freswritable); + fixedRes.Add(freswritable); + } + + delegee.searchParams = original; + return fixedRes; + } + + public override double ComplementThreshold + { + get + { + return delegee.ComplementThreshold; + } + set + { + delegee.ComplementThreshold = value; + } + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ff4fe045/src/contrib/Facet/Sampling/TakmiSampleFixer.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Facet/Sampling/TakmiSampleFixer.cs b/src/contrib/Facet/Sampling/TakmiSampleFixer.cs new file mode 100644 index 0000000..1a6af01 --- /dev/null +++ b/src/contrib/Facet/Sampling/TakmiSampleFixer.cs @@ -0,0 +1,125 @@ +using Lucene.Net.Facet.Params; +using Lucene.Net.Facet.Search; +using Lucene.Net.Facet.Taxonomy; +using Lucene.Net.Index; +using Lucene.Net.Search; +using Lucene.Net.Util; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace Lucene.Net.Facet.Sampling +{ + internal class TakmiSampleFixer : ISampleFixer + { + private TaxonomyReader taxonomyReader; + private IndexReader indexReader; + private FacetSearchParams searchParams; + + public TakmiSampleFixer(IndexReader indexReader, TaxonomyReader taxonomyReader, FacetSearchParams searchParams) + { + this.indexReader = indexReader; + this.taxonomyReader = taxonomyReader; + this.searchParams = searchParams; + } + + public void FixResult(IScoredDocIDs origDocIds, FacetResult fres) + { + FacetResultNode topRes = fres.FacetResultNode; + FixResultNode(topRes, origDocIds); + } + + private void FixResultNode(FacetResultNode facetResNode, IScoredDocIDs docIds) + { + Recount(facetResNode, docIds); + foreach (FacetResultNode frn in facetResNode.subResults) + { + FixResultNode(frn, docIds); + } + } + + private void Recount(FacetResultNode fresNode, IScoredDocIDs docIds) + { + if (fresNode.label == null) + { + fresNode.label = taxonomyReader.GetPath(fresNode.ordinal); + } + + CategoryPath catPath = fresNode.label; + Term drillDownTerm = DrillDownQuery.Term(searchParams.indexingParams, catPath); + IBits liveDocs = MultiFields.GetLiveDocs(indexReader); + int updatedCount = CountIntersection(MultiFields.GetTermDocsEnum(indexReader, liveDocs, drillDownTerm.Field, drillDownTerm.Bytes, 0), docIds.Iterator()); + fresNode.value = updatedCount; + } + + private static int CountIntersection(DocsEnum p1, IScoredDocIDsIterator p2) + { + if (p1 == null || p1.NextDoc() == DocIdSetIterator.NO_MORE_DOCS) + { + return 0; + } + + if (!p2.Next()) + { + return 0; + } + + int d1 = p1.DocID; + int d2 = p2.DocID; + int count = 0; + for (; ; ) + { + if (d1 == d2) + { + ++count; + if (p1.NextDoc() == DocIdSetIterator.NO_MORE_DOCS) + { + break; + } + + d1 = p1.DocID; + if (!Advance(p2, d1)) + { + break; + } + + d2 = p2.DocID; + } + else if (d1 < d2) + { + if (p1.Advance(d2) == DocIdSetIterator.NO_MORE_DOCS) + { + break; + } + + d1 = p1.DocID; + } + else + { + if (!Advance(p2, d1)) + { + break; + } + + d2 = p2.DocID; + } + } + + return count; + } + + private static bool Advance(IScoredDocIDsIterator iterator, int targetDoc) + { + while (iterator.Next()) + { + if (iterator.DocID >= targetDoc) + { + return true; + } + } + + return false; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ff4fe045/src/contrib/Facet/Search/DrillDownQuery.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Facet/Search/DrillDownQuery.cs b/src/contrib/Facet/Search/DrillDownQuery.cs new file mode 100644 index 0000000..a9cf63a --- /dev/null +++ b/src/contrib/Facet/Search/DrillDownQuery.cs @@ -0,0 +1,201 @@ +using Lucene.Net.Facet.Params; +using Lucene.Net.Facet.Taxonomy; +using Lucene.Net.Index; +using Lucene.Net.Search; +using Lucene.Net.Support; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Text.RegularExpressions; + +namespace Lucene.Net.Facet.Search +{ + public sealed class DrillDownQuery : Query + { + public static Term Term(FacetIndexingParams iParams, CategoryPath path) + { + CategoryListParams clp = iParams.GetCategoryListParams(path); + char[] buffer = new char[path.FullPathLength()]; + iParams.DrillDownTermText(path, buffer); + return new Term(clp.field, new string(buffer)); + } + + private readonly BooleanQuery query; + private readonly IDictionary<string, int?> drillDownDims = new HashMap<string, int?>(); + readonly FacetIndexingParams fip; + + internal DrillDownQuery(FacetIndexingParams fip, BooleanQuery query, IDictionary<string, int?> drillDownDims) + { + this.fip = fip; + this.query = (BooleanQuery)query.Clone(); + this.drillDownDims.PutAll(drillDownDims); + } + + internal DrillDownQuery(Filter filter, DrillDownQuery other) + { + query = new BooleanQuery(true); + BooleanClause[] clauses = other.query.Clauses; + if (clauses.Length == other.drillDownDims.Count) + { + throw new ArgumentException(@"cannot apply filter unless baseQuery isn't null; pass ConstantScoreQuery instead"); + } + + drillDownDims.PutAll(other.drillDownDims); + query.Add(new FilteredQuery(clauses[0].Query, filter), Occur.MUST); + for (int i = 1; i < clauses.Length; i++) + { + query.Add(clauses[i].Query, Occur.MUST); + } + + fip = other.fip; + } + + internal DrillDownQuery(FacetIndexingParams fip, Query baseQuery, List<Query> clauses) + { + this.fip = fip; + this.query = new BooleanQuery(true); + if (baseQuery != null) + { + query.Add(baseQuery, Occur.MUST); + } + + foreach (Query clause in clauses) + { + query.Add(clause, Occur.MUST); + drillDownDims[GetDim(clause)] = drillDownDims.Count; + } + } + + internal string GetDim(Query clause) + { + clause = ((ConstantScoreQuery)clause).Query; + string term; + if (clause is TermQuery) + { + term = ((TermQuery)clause).Term.Text; + } + else + { + term = ((TermQuery)((BooleanQuery)clause).Clauses[0].Query).Term.Text; + } + + return term.Split(new[] { Regex.Escape(fip.FacetDelimChar.ToString()) }, StringSplitOptions.None)[0]; + } + + public DrillDownQuery(FacetIndexingParams fip) + : this(fip, null) + { + } + + public DrillDownQuery(FacetIndexingParams fip, Query baseQuery) + { + query = new BooleanQuery(true); + if (baseQuery != null) + { + query.Add(baseQuery, Occur.MUST); + } + + this.fip = fip; + } + + public void Add(params CategoryPath[] paths) + { + Query q; + if (paths[0].length == 0) + { + throw new ArgumentException(@"all CategoryPaths must have length > 0"); + } + + string dim = paths[0].components[0]; + if (drillDownDims.ContainsKey(dim)) + { + throw new ArgumentException(@"dimension '" + dim + @"' was already added"); + } + + if (paths.Length == 1) + { + q = new TermQuery(Term(fip, paths[0])); + } + else + { + BooleanQuery bq = new BooleanQuery(true); + foreach (CategoryPath cp in paths) + { + if (cp.length == 0) + { + throw new ArgumentException(@"all CategoryPaths must have length > 0"); + } + + if (!cp.components[0].Equals(dim)) + { + throw new ArgumentException(@"multiple (OR'd) drill-down paths must be under same dimension; got '" + dim + @"' and '" + cp.components[0] + @"'"); + } + + bq.Add(new TermQuery(Term(fip, cp)), Occur.SHOULD); + } + + q = bq; + } + + drillDownDims[dim] = drillDownDims.Count; + ConstantScoreQuery drillDownQuery = new ConstantScoreQuery(q); + drillDownQuery.Boost = 0F; + query.Add(drillDownQuery, Occur.MUST); + } + + public override object Clone() + { + return new DrillDownQuery(fip, query, drillDownDims); + } + + public override int GetHashCode() + { + int prime = 31; + int result = base.GetHashCode(); + return prime * result + query.GetHashCode(); + } + + public override bool Equals(Object obj) + { + if (!(obj is DrillDownQuery)) + { + return false; + } + + DrillDownQuery other = (DrillDownQuery)obj; + return query.Equals(other.query) && base.Equals(other); + } + + public override Query Rewrite(IndexReader r) + { + if (query.Clauses.Count() == 0) + { + throw new InvalidOperationException(@"no base query or drill-down categories given"); + } + + return query; + } + + public override string ToString(string field) + { + return query.ToString(field); + } + + internal BooleanQuery BooleanQuery + { + get + { + return query; + } + } + + internal IDictionary<string, int?> Dims + { + get + { + return drillDownDims; + } + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ff4fe045/src/core/Lucene.Net.csproj ---------------------------------------------------------------------- diff --git a/src/core/Lucene.Net.csproj b/src/core/Lucene.Net.csproj index 47713a1..5941972 100644 --- a/src/core/Lucene.Net.csproj +++ b/src/core/Lucene.Net.csproj @@ -880,6 +880,7 @@ <Compile Include="Support\BufferUnderflowException.cs" /> <Compile Include="Support\BuildType.cs" /> <Compile Include="Support\ByteBuffer.cs" /> + <Compile Include="Support\DateTimeExtensions.cs" /> <Compile Include="Support\ICallable.cs" /> <Compile Include="Support\Character.cs" /> <Compile Include="Support\CloseableThreadLocalProfiler.cs" /> @@ -911,6 +912,7 @@ <Compile Include="Support\IdentityHashMap.cs" /> <Compile Include="Support\IdentityHashSet.cs" /> <Compile Include="Support\IdentityWeakReferenceT.cs" /> + <Compile Include="Support\IDictionaryExtensions.cs" /> <Compile Include="Support\IndexedLinkedList.cs" /> <Compile Include="Support\Inflater.cs" /> <Compile Include="Support\IThreadRunnable.cs" /> http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ff4fe045/src/core/Support/DateTimeExtensions.cs ---------------------------------------------------------------------- diff --git a/src/core/Support/DateTimeExtensions.cs b/src/core/Support/DateTimeExtensions.cs new file mode 100644 index 0000000..aebaf32 --- /dev/null +++ b/src/core/Support/DateTimeExtensions.cs @@ -0,0 +1,17 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace Lucene.Net.Support +{ + public static class DateTimeExtensions + { + private static readonly DateTime Jan1st1970 = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc); + + public static long CurrentTimeMillis(this DateTime dt) + { + return (long)(dt - Jan1st1970).TotalMilliseconds; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ff4fe045/src/core/Support/IDictionaryExtensions.cs ---------------------------------------------------------------------- diff --git a/src/core/Support/IDictionaryExtensions.cs b/src/core/Support/IDictionaryExtensions.cs new file mode 100644 index 0000000..121c188 --- /dev/null +++ b/src/core/Support/IDictionaryExtensions.cs @@ -0,0 +1,18 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace Lucene.Net.Support +{ + public static class IDictionaryExtensions + { + public static void PutAll<TKey, TValue>(this IDictionary<TKey, TValue> dict, IEnumerable<KeyValuePair<TKey, TValue>> kvps) + { + foreach (var kvp in kvps) + { + dict[kvp.Key] = kvp.Value; + } + } + } +}
