Port Facet.SortedSet
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/4a21b967 Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/4a21b967 Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/4a21b967 Branch: refs/heads/branch_4x Commit: 4a21b967e1f38ee03c162f62ef9d299d3185c63a Parents: ff4fe04 Author: Paul Irwin <[email protected]> Authored: Wed Nov 6 11:55:37 2013 -0500 Committer: Paul Irwin <[email protected]> Committed: Wed Nov 6 11:55:37 2013 -0500 ---------------------------------------------------------------------- src/contrib/Facet/Contrib.Facet.csproj | 3 + .../SortedSet/SortedSetDocValuesAccumulator.cs | 299 +++++++++++++++++++ .../SortedSet/SortedSetDocValuesFacetFields.cs | 57 ++++ .../SortedSet/SortedSetDocValuesReaderState.cs | 125 ++++++++ 4 files changed, 484 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4a21b967/src/contrib/Facet/Contrib.Facet.csproj ---------------------------------------------------------------------- diff --git a/src/contrib/Facet/Contrib.Facet.csproj b/src/contrib/Facet/Contrib.Facet.csproj index 9f15e12..5e1acae 100644 --- a/src/contrib/Facet/Contrib.Facet.csproj +++ b/src/contrib/Facet/Contrib.Facet.csproj @@ -121,6 +121,9 @@ <Compile Include="Search\StandardFacetsAccumulator.cs" /> <Compile Include="Search\TopKFacetResultsHandler.cs" /> <Compile Include="Search\TopKInEachNodeHandler.cs" /> + <Compile Include="SortedSet\SortedSetDocValuesAccumulator.cs" /> + <Compile Include="SortedSet\SortedSetDocValuesFacetFields.cs" /> + <Compile Include="SortedSet\SortedSetDocValuesReaderState.cs" /> <Compile Include="Taxonomy\CategoryPath.cs" /> <Compile Include="Taxonomy\Directory\Consts.cs" /> <Compile Include="Taxonomy\Directory\DirectoryTaxonomyReader.cs" /> 
// -----------------------------------------------------------------------------
// File: src/contrib/Facet/SortedSet/SortedSetDocValuesAccumulator.cs
// -----------------------------------------------------------------------------
namespace Lucene.Net.Facet.SortedSet
{
    using Lucene.Net.Facet.Params;
    using Lucene.Net.Facet.Search;
    using Lucene.Net.Facet.Taxonomy;
    using Lucene.Net.Index;
    using Lucene.Net.Util;
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;

    /// <summary>
    /// A <see cref="FacetsAccumulator"/> that counts facet values read from the
    /// <see cref="SortedSetDocValues"/> exposed by a <see cref="SortedSetDocValuesReaderState"/>.
    /// Only <see cref="CountFacetRequest"/>s over a single-component category path
    /// with depth 1 are accepted (enforced by the constructor).
    /// </summary>
    public class SortedSetDocValuesAccumulator : FacetsAccumulator
    {
        readonly SortedSetDocValuesReaderState state;
        readonly SortedSetDocValues dv;
        readonly string field;

        /// <summary>
        /// Validates every facet request in <paramref name="fsp"/> against the
        /// restrictions of this accumulator and against the dimensions known to
        /// <paramref name="state"/>.
        /// </summary>
        /// <param name="fsp">Search parameters holding the facet requests.</param>
        /// <param name="state">Reader state providing the doc values, field name and ordinal ranges.</param>
        /// <exception cref="ArgumentException">
        /// A request is not a <see cref="CountFacetRequest"/>, its category path does not
        /// have exactly one component, its depth is not 1, or its dimension has no ordinal range.
        /// </exception>
        public SortedSetDocValuesAccumulator(FacetSearchParams fsp, SortedSetDocValuesReaderState state)
            : base(fsp, null, null, new FacetArrays((int)state.DocValues.ValueCount))
        {
            this.state = state;
            this.field = state.Field;
            dv = state.DocValues;

            foreach (FacetRequest request in fsp.facetRequests)
            {
                if (!(request is CountFacetRequest))
                {
                    throw new ArgumentException(@"this collector only supports CountFacetRequest; got " + request);
                }

                if (request.categoryPath.length != 1)
                {
                    throw new ArgumentException(@"this collector only supports depth 1 CategoryPath; got " + request.categoryPath);
                }

                if (request.Depth != 1)
                {
                    throw new ArgumentException(@"this collector only supports depth=1; got " + request.Depth);
                }

                string dim = request.categoryPath.components[0];
                SortedSetDocValuesReaderState.OrdRange ordRange = state.GetOrdRange(dim);
                if (ordRange == null)
                {
                    throw new ArgumentException("dim \"" + dim + "\" does not exist");
                }
            }
        }

        /// <summary>Returns a new aggregator that counts ordinals into the facet arrays' int array.</summary>
        public override IFacetsAggregator Aggregator
        {
            get
            {
                return new AnonymousFacetsAggregator(this);
            }
        }

        /// <summary>
        /// Counts, per matching document, every ordinal of the accumulator's field.
        /// When the top-level doc values are a <see cref="MultiDocValues.MultiSortedSetDocValues"/>,
        /// segment ordinals are translated to global ordinals through its ordinal map.
        /// </summary>
        private sealed class AnonymousFacetsAggregator : IFacetsAggregator
        {
            private readonly SortedSetDocValuesAccumulator parent;

            public AnonymousFacetsAggregator(SortedSetDocValuesAccumulator parent)
            {
                this.parent = parent;
            }

            /// <summary>
            /// Adds the ordinal counts of one segment's matching documents to the int array
            /// of <paramref name="facetArrays"/>. Does nothing when the segment has no
            /// sorted-set doc values for the field.
            /// </summary>
            public void Aggregate(FacetsCollector.MatchingDocs matchingDocs, CategoryListParams clp, FacetArrays facetArrays)
            {
                SortedSetDocValues segValues = matchingDocs.context.AtomicReader.GetSortedSetDocValues(parent.field);
                if (segValues == null)
                {
                    return;
                }

                int[] counts = facetArrays.GetIntArray();
                int maxDoc = matchingDocs.context.AtomicReader.MaxDoc;

                MultiDocValues.MultiSortedSetDocValues multiValues = parent.dv as MultiDocValues.MultiSortedSetDocValues;
                if (multiValues == null)
                {
                    // Not a multi-segment view: segment ordinals are used directly as indexes into counts.
                    CountOrds(matchingDocs, segValues, maxDoc, counts);
                    return;
                }

                MultiDocValues.OrdinalMap ordinalMap = multiValues.mapping;
                int segOrd = matchingDocs.context.ord;
                int numSegOrds = (int)segValues.ValueCount;

                if (matchingDocs.totalHits < numSegOrds / 10)
                {
                    // Few hits relative to the number of segment ordinals:
                    // map each ordinal to its global ordinal as it is seen.
                    int doc = 0;
                    while (doc < maxDoc && (doc = matchingDocs.bits.NextSetBit(doc)) != -1)
                    {
                        segValues.SetDocument(doc);
                        int term = (int)segValues.NextOrd();
                        while (term != SortedSetDocValues.NO_MORE_ORDS)
                        {
                            counts[(int)ordinalMap.GetGlobalOrd(segOrd, term)]++;
                            term = (int)segValues.NextOrd();
                        }

                        ++doc;
                    }
                }
                else
                {
                    // Many hits: count in segment-ordinal space first, then map each
                    // non-zero segment ordinal to its global ordinal exactly once.
                    int[] segCounts = new int[numSegOrds];
                    CountOrds(matchingDocs, segValues, maxDoc, segCounts);

                    for (int ord = 0; ord < numSegOrds; ord++)
                    {
                        int count = segCounts[ord];
                        if (count != 0)
                        {
                            counts[(int)ordinalMap.GetGlobalOrd(segOrd, ord)] += count;
                        }
                    }
                }
            }

            /// <summary>
            /// Walks the set bits of <c>matchingDocs.bits</c> below <paramref name="maxDoc"/> and
            /// increments <paramref name="target"/> at every ordinal of each visited document.
            /// </summary>
            private static void CountOrds(FacetsCollector.MatchingDocs matchingDocs, SortedSetDocValues segValues, int maxDoc, int[] target)
            {
                int doc = 0;
                while (doc < maxDoc && (doc = matchingDocs.bits.NextSetBit(doc)) != -1)
                {
                    segValues.SetDocument(doc);
                    int term = (int)segValues.NextOrd();
                    while (term != SortedSetDocValues.NO_MORE_ORDS)
                    {
                        target[term]++;
                        term = (int)segValues.NextOrd();
                    }

                    ++doc;
                }
            }

            /// <summary>No-op: this aggregator performs no rollup.</summary>
            public void RollupValues(FacetRequest fr, int ordinal, int[] children, int[] siblings, FacetArrays facetArrays)
            {
            }

            /// <summary>Always <c>false</c>.</summary>
            public bool RequiresDocScores
            {
                get
                {
                    return false;
                }
            }
        }

        /// <summary>
        /// Priority queue of result nodes ordered by value; on equal values the node
        /// with the larger ordinal is considered "less".
        /// </summary>
        class TopCountPQ : Lucene.Net.Util.PriorityQueue<FacetResultNode>
        {
            public TopCountPQ(int topN)
                : base(topN, false)
            {
            }

            public override bool LessThan(FacetResultNode a, FacetResultNode b)
            {
                if (a.value < b.value)
                {
                    return true;
                }

                if (a.value > b.value)
                {
                    return false;
                }

                return a.ordinal > b.ordinal;
            }
        }

        /// <summary>
        /// Aggregates counts for all matching documents and builds one
        /// <see cref="FacetResult"/> per facet request, in request order.
        /// </summary>
        /// <param name="matchingDocs">Per-segment matching documents to aggregate.</param>
        /// <returns>The facet results, one per request in the search parameters.</returns>
        public override List<FacetResult> Accumulate(List<FacetsCollector.MatchingDocs> matchingDocs)
        {
            IFacetsAggregator aggregator = Aggregator;
            foreach (CategoryListParams clp in GetCategoryLists())
            {
                foreach (FacetsCollector.MatchingDocs md in matchingDocs)
                {
                    aggregator.Aggregate(md, clp, facetArrays);
                }
            }

            List<FacetResult> results = new List<FacetResult>();
            int[] counts = facetArrays.GetIntArray();
            BytesRef scratch = new BytesRef();
            foreach (FacetRequest request in searchParams.facetRequests)
            {
                string dim = request.categoryPath.components[0];
                SortedSetDocValuesReaderState.OrdRange ordRange = state.GetOrdRange(dim);

                // When the request asks for at least as many results as the dimension has
                // ordinals, no priority queue is needed: collect everything and sort.
                if (request.numResults >= ordRange.end - ordRange.start + 1)
                {
                    results.Add(CollectAll(request, dim, ordRange, counts, scratch));
                }
                else
                {
                    results.Add(CollectTopN(request, dim, ordRange, counts, scratch));
                }
            }

            return results;
        }

        /// <summary>Builds a result holding every ordinal of the range with a non-zero count, sorted.</summary>
        private FacetResult CollectAll(FacetRequest request, string dim, SortedSetDocValuesReaderState.OrdRange ordRange, int[] counts, BytesRef scratch)
        {
            List<FacetResultNode> nodes = new List<FacetResultNode>();
            int dimCount = 0;
            for (int ord = ordRange.start; ord <= ordRange.end; ord++)
            {
                if (counts[ord] != 0)
                {
                    dimCount += counts[ord];
                    FacetResultNode node = new FacetResultNode(ord, counts[ord]);
                    node.label = LabelFor(ord, scratch);
                    nodes.Add(node);
                }
            }

            nodes.Sort(new AnonymousComparator());

            FacetResultNode rootNode = CreateRootNode(request, dim, dimCount);
            rootNode.subResults = nodes;
            return new FacetResult(request, rootNode, nodes.Count);
        }

        /// <summary>Builds a result holding the top <c>request.numResults</c> ordinals of the range.</summary>
        private FacetResult CollectTopN(FacetRequest request, string dim, SortedSetDocValuesReaderState.OrdRange ordRange, int[] counts, BytesRef scratch)
        {
            TopCountPQ q = new TopCountPQ(request.numResults);
            int bottomCount = 0;
            int dimCount = 0;
            int childCount = 0;
            FacetResultNode reuse = null;
            for (int ord = ordRange.start; ord <= ordRange.end; ord++)
            {
                if (counts[ord] > 0)
                {
                    childCount++;

                    // Only counts above the current queue bottom can enter the queue.
                    if (counts[ord] > bottomCount)
                    {
                        dimCount += counts[ord];
                        if (reuse == null)
                        {
                            reuse = new FacetResultNode(ord, counts[ord]);
                        }
                        else
                        {
                            reuse.ordinal = ord;
                            reuse.value = counts[ord];
                        }

                        // The node handed back by the queue (if any) is recycled for the next candidate.
                        reuse = q.InsertWithOverflow(reuse);
                        if (q.Size == request.numResults)
                        {
                            bottomCount = (int)q.Top().value;
                        }
                    }
                }
            }

            FacetResultNode rootNode = CreateRootNode(request, dim, dimCount);

            // Pop() yields the least node first, so fill the array from the back.
            FacetResultNode[] childNodes = new FacetResultNode[q.Size];
            for (int i = childNodes.Length - 1; i >= 0; i--)
            {
                childNodes[i] = q.Pop();
                childNodes[i].label = LabelFor(childNodes[i].ordinal, scratch);
            }

            rootNode.subResults = childNodes;
            return new FacetResult(request, rootNode, childCount);
        }

        /// <summary>
        /// Creates the root node (ordinal -1) for a dimension. The value is
        /// <paramref name="dimCount"/>, or 0 when the dimension's ordinal policy is
        /// <c>ALL_BUT_DIMENSION</c>.
        /// </summary>
        private FacetResultNode CreateRootNode(FacetRequest request, string dim, int dimCount)
        {
            CategoryListParams.OrdinalPolicy op = searchParams.indexingParams.GetCategoryListParams(request.categoryPath).GetOrdinalPolicy(dim);
            if (op == CategoryListParams.OrdinalPolicy.ALL_BUT_DIMENSION)
            {
                dimCount = 0;
            }

            FacetResultNode rootNode = new FacetResultNode(-1, dimCount);
            rootNode.label = new CategoryPath(new string[] { dim });
            return rootNode;
        }

        /// <summary>
        /// Looks up the term for <paramref name="ord"/> and splits it on the state's
        /// delimiter character into a <see cref="CategoryPath"/>.
        /// </summary>
        private CategoryPath LabelFor(int ord, BytesRef scratch)
        {
            dv.LookupOrd(ord, scratch);

            // Split on the raw delimiter character. String.Split treats its separators as
            // literal text, so the regex-escaped form must not be used here.
            return new CategoryPath(scratch.Utf8ToString().Split(state.separator));
        }

        /// <summary>Orders nodes by descending value, then by ascending ordinal.</summary>
        private sealed class AnonymousComparator : IComparer<FacetResultNode>
        {
            public int Compare(FacetResultNode o1, FacetResultNode o2)
            {
                // CompareTo instead of casting a difference to int: no truncation, no overflow.
                int byValue = o2.value.CompareTo(o1.value);
                if (byValue != 0)
                {
                    return byValue;
                }

                return o1.ordinal.CompareTo(o2.ordinal);
            }
        }
    }
}

// -----------------------------------------------------------------------------
// File: src/contrib/Facet/SortedSet/SortedSetDocValuesFacetFields.cs
// -----------------------------------------------------------------------------
namespace Lucene.Net.Facet.SortedSet
{
    using Lucene.Net.Documents;
    using Lucene.Net.Facet.Index;
    using Lucene.Net.Facet.Params;
    using Lucene.Net.Facet.Taxonomy;
    using Lucene.Net.Util;
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;

    /// <summary>
    /// A <see cref="FacetFields"/> variant that indexes each two-component category
    /// (dimension + label) as a <see cref="SortedSetDocValuesField"/>, plus a drill-down field.
    /// </summary>
    public class SortedSetDocValuesFacetFields : FacetFields
    {
        /// <summary>Uses <see cref="FacetIndexingParams.DEFAULT"/>.</summary>
        public SortedSetDocValuesFacetFields()
            : this(FacetIndexingParams.DEFAULT)
        {
        }

        /// <param name="fip">Indexing parameters; its partition size must be <see cref="int.MaxValue"/>.</param>
        /// <exception cref="ArgumentException">The indexing parameters use partitions.</exception>
        public SortedSetDocValuesFacetFields(FacetIndexingParams fip)
            : base(null, fip)
        {
            if (fip.PartitionSize != int.MaxValue)
            {
                throw new ArgumentException(@"partitions are not supported");
            }
        }

        /// <summary>
        /// Adds, for every category list, one sorted-set doc-values field per category and
        /// one drill-down field to <paramref name="doc"/>.
        /// </summary>
        /// <param name="doc">Document receiving the fields.</param>
        /// <param name="categories">Categories to index; each must have exactly two components.</param>
        /// <exception cref="ArgumentNullException"><paramref name="categories"/> is null.</exception>
        /// <exception cref="ArgumentException">A category does not have exactly two components.</exception>
        public override void AddFields(Document doc, IEnumerable<CategoryPath> categories)
        {
            if (categories == null)
            {
                // ArgumentNullException derives from ArgumentException, so existing catch blocks still apply.
                throw new ArgumentNullException("categories", @"categories should not be null");
            }

            IDictionary<CategoryListParams, IEnumerable<CategoryPath>> categoryLists = CreateCategoryListMapping(categories);
            foreach (KeyValuePair<CategoryListParams, IEnumerable<CategoryPath>> e in categoryLists)
            {
                CategoryListParams clp = e.Key;
                string dvField = clp.field + SortedSetDocValuesReaderState.FACET_FIELD_EXTENSION;
                foreach (CategoryPath cp in e.Value)
                {
                    if (cp.length != 2)
                    {
                        throw new ArgumentException(@"only flat facets (dimension + label) are currently supported; got " + cp);
                    }

                    doc.Add(new SortedSetDocValuesField(dvField, new BytesRef(cp.ToString(indexingParams.FacetDelimChar))));
                }

                // NOTE(review): e.Value is enumerated a second time here; confirm the mapping
                // returns a re-enumerable collection.
                DrillDownStream drillDownStream = GetDrillDownStream(e.Value);
                Field drillDown = new Field(clp.field, drillDownStream, DrillDownFieldType());
                doc.Add(drillDown);
            }
        }
    }
}

// -----------------------------------------------------------------------------
// File: src/contrib/Facet/SortedSet/SortedSetDocValuesReaderState.cs
// -----------------------------------------------------------------------------
namespace Lucene.Net.Facet.SortedSet
{
    using Lucene.Net.Facet.Params;
    using Lucene.Net.Index;
    using Lucene.Net.Support;
    using Lucene.Net.Util;
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Text.RegularExpressions;

    /// <summary>
    /// Wraps an <see cref="IndexReader"/> and records, for every dimension found in the
    /// facet doc-values field, the inclusive range of ordinals belonging to that dimension.
    /// </summary>
    public sealed class SortedSetDocValuesReaderState
    {
        private readonly string field;
        private readonly AtomicReader topReader;
        private readonly int valueCount;

        /// <summary>Delimiter character separating dimension and label inside an indexed term.</summary>
        internal readonly char separator;

        /// <summary>
        /// Regex-escaped form of <see cref="separator"/>. Only meaningful as a
        /// <see cref="Regex"/> pattern; it must not be passed to <c>String.Split</c>,
        /// which matches separators literally.
        /// </summary>
        internal readonly string separatorRegex;

        /// <summary>Suffix appended to the category list field to name the doc-values field.</summary>
        public static readonly string FACET_FIELD_EXTENSION = @"_sorted_doc_values";

        /// <summary>Inclusive ordinal range [<see cref="start"/>, <see cref="end"/>] of one dimension.</summary>
        internal sealed class OrdRange
        {
            public readonly int start;
            public readonly int end;

            public OrdRange(int start, int end)
            {
                this.start = start;
                this.end = end;
            }
        }

        private readonly IDictionary<String, OrdRange> prefixToOrdRange = new HashMap<String, OrdRange>();

        /// <summary>Creates the state using <see cref="FacetIndexingParams.DEFAULT"/>.</summary>
        public SortedSetDocValuesReaderState(IndexReader reader)
            : this(FacetIndexingParams.DEFAULT, reader)
        {
        }

        /// <summary>
        /// Reads every term of the facet doc-values field once and records the ordinal
        /// range of each dimension (the first component of each term).
        /// </summary>
        /// <param name="fip">Indexing parameters supplying the field name and delimiter character.</param>
        /// <param name="reader">Reader to inspect; non-atomic readers are wrapped in a <see cref="SlowCompositeReaderWrapper"/>.</param>
        /// <exception cref="ArgumentNullException"><paramref name="fip"/> or <paramref name="reader"/> is null.</exception>
        /// <exception cref="ArgumentException">
        /// The field has no sorted-set doc values, has more than <see cref="int.MaxValue"/> values,
        /// or contains a term that does not split into exactly two components.
        /// </exception>
        public SortedSetDocValuesReaderState(FacetIndexingParams fip, IndexReader reader)
        {
            if (fip == null)
            {
                throw new ArgumentNullException("fip");
            }

            if (reader == null)
            {
                throw new ArgumentNullException("reader");
            }

            this.field = fip.GetCategoryListParams(null).field + FACET_FIELD_EXTENSION;
            this.separator = fip.FacetDelimChar;
            this.separatorRegex = Regex.Escape(separator.ToString());

            if (reader is AtomicReader)
            {
                topReader = (AtomicReader)reader;
            }
            else
            {
                topReader = new SlowCompositeReaderWrapper((CompositeReader)reader);
            }

            SortedSetDocValues dv = topReader.GetSortedSetDocValues(field);
            if (dv == null)
            {
                throw new ArgumentException("field \"" + field + "\" was not indexed with SortedSetDocValues");
            }

            if (dv.ValueCount > int.MaxValue)
            {
                throw new ArgumentException(@"can only handle valueCount < Integer.MAX_VALUE; got " + dv.ValueCount);
            }

            valueCount = (int)dv.ValueCount;

            // Ordinals are visited in increasing order; a new range starts whenever the
            // dimension differs from the previous term's dimension.
            string lastDim = null;
            int startOrd = -1;
            BytesRef spare = new BytesRef();
            for (int ord = 0; ord < valueCount; ord++)
            {
                dv.LookupOrd(ord, spare);

                // Split on the raw delimiter character: String.Split matches literally, so the
                // regex-escaped string would never match a metacharacter delimiter such as '.'.
                String[] components = spare.Utf8ToString().Split(separator);
                if (components.Length != 2)
                {
                    throw new ArgumentException(@"this class can only handle 2 level hierarchy (dim/value); got: " + spare.Utf8ToString());
                }

                if (!components[0].Equals(lastDim))
                {
                    if (lastDim != null)
                    {
                        prefixToOrdRange[lastDim] = new OrdRange(startOrd, ord - 1);
                    }

                    startOrd = ord;
                    lastDim = components[0];
                }
            }

            // Close the range of the final dimension, if any term was seen.
            if (lastDim != null)
            {
                prefixToOrdRange[lastDim] = new OrdRange(startOrd, valueCount - 1);
            }
        }

        /// <summary>Fetches the sorted-set doc values of the facet field from the top-level reader on every access.</summary>
        internal SortedSetDocValues DocValues
        {
            get
            {
                return topReader.GetSortedSetDocValues(field);
            }
        }

        /// <summary>
        /// Returns the ordinal range recorded for <paramref name="dim"/>.
        /// NOTE(review): callers null-check the result, which presumes the HashMap indexer
        /// yields null (rather than throwing) for an unknown key — confirm against Lucene.Net.Support.HashMap.
        /// </summary>
        internal OrdRange GetOrdRange(string dim)
        {
            return prefixToOrdRange[dim];
        }

        /// <summary>Name of the doc-values field (category list field + <see cref="FACET_FIELD_EXTENSION"/>).</summary>
        internal string Field
        {
            get
            {
                return field;
            }
        }

        /// <summary>Number of distinct values in the doc-values field, captured at construction.</summary>
        internal int Size
        {
            get
            {
                return valueCount;
            }
        }
    }
}
