Port Facet.SortedSet

Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/4a21b967
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/4a21b967
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/4a21b967

Branch: refs/heads/branch_4x
Commit: 4a21b967e1f38ee03c162f62ef9d299d3185c63a
Parents: ff4fe04
Author: Paul Irwin <[email protected]>
Authored: Wed Nov 6 11:55:37 2013 -0500
Committer: Paul Irwin <[email protected]>
Committed: Wed Nov 6 11:55:37 2013 -0500

----------------------------------------------------------------------
 src/contrib/Facet/Contrib.Facet.csproj          |   3 +
 .../SortedSet/SortedSetDocValuesAccumulator.cs  | 299 +++++++++++++++++++
 .../SortedSet/SortedSetDocValuesFacetFields.cs  |  57 ++++
 .../SortedSet/SortedSetDocValuesReaderState.cs  | 125 ++++++++
 4 files changed, 484 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4a21b967/src/contrib/Facet/Contrib.Facet.csproj
----------------------------------------------------------------------
diff --git a/src/contrib/Facet/Contrib.Facet.csproj 
b/src/contrib/Facet/Contrib.Facet.csproj
index 9f15e12..5e1acae 100644
--- a/src/contrib/Facet/Contrib.Facet.csproj
+++ b/src/contrib/Facet/Contrib.Facet.csproj
@@ -121,6 +121,9 @@
     <Compile Include="Search\StandardFacetsAccumulator.cs" />
     <Compile Include="Search\TopKFacetResultsHandler.cs" />
     <Compile Include="Search\TopKInEachNodeHandler.cs" />
+    <Compile Include="SortedSet\SortedSetDocValuesAccumulator.cs" />
+    <Compile Include="SortedSet\SortedSetDocValuesFacetFields.cs" />
+    <Compile Include="SortedSet\SortedSetDocValuesReaderState.cs" />
     <Compile Include="Taxonomy\CategoryPath.cs" />
     <Compile Include="Taxonomy\Directory\Consts.cs" />
     <Compile Include="Taxonomy\Directory\DirectoryTaxonomyReader.cs" />

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4a21b967/src/contrib/Facet/SortedSet/SortedSetDocValuesAccumulator.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Facet/SortedSet/SortedSetDocValuesAccumulator.cs 
b/src/contrib/Facet/SortedSet/SortedSetDocValuesAccumulator.cs
new file mode 100644
index 0000000..ea208ac
--- /dev/null
+++ b/src/contrib/Facet/SortedSet/SortedSetDocValuesAccumulator.cs
@@ -0,0 +1,299 @@
+using Lucene.Net.Facet.Params;
+using Lucene.Net.Facet.Search;
+using Lucene.Net.Facet.Taxonomy;
+using Lucene.Net.Index;
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Facet.SortedSet
+{
+    /// <summary>
+    /// A <see cref="FacetsAccumulator"/> that computes facet counts from the
+    /// <see cref="SortedSetDocValues"/> field described by a
+    /// <see cref="SortedSetDocValuesReaderState"/>.
+    /// Only <see cref="CountFacetRequest"/>s with a one-component category path
+    /// (the dimension) and a depth of 1 are accepted.
+    /// </summary>
+    public class SortedSetDocValuesAccumulator : FacetsAccumulator
+    {
+        readonly SortedSetDocValuesReaderState state;
+        readonly SortedSetDocValues dv;
+        readonly string field;
+
+        /// <summary>
+        /// Creates the accumulator and validates every facet request up front.
+        /// The counting array is sized to the number of distinct values in the
+        /// doc values field; two base-constructor arguments are passed as
+        /// <c>null</c> (presumably the index/taxonomy readers - TODO confirm
+        /// against <see cref="FacetsAccumulator"/>).
+        /// </summary>
+        /// <param name="fsp">The search parameters holding the facet requests.</param>
+        /// <param name="state">Reader state providing the doc values and the per-dimension ord ranges.</param>
+        /// <exception cref="ArgumentException">
+        /// A request is not a <see cref="CountFacetRequest"/>, its category path
+        /// does not have exactly one component, its depth is not 1, or its
+        /// dimension has no ord range in <paramref name="state"/>.
+        /// </exception>
+        public SortedSetDocValuesAccumulator(FacetSearchParams fsp, SortedSetDocValuesReaderState state)
+            : base(fsp, null, null, new FacetArrays((int)state.DocValues.ValueCount))
+        {
+            this.state = state;
+            this.field = state.Field;
+            dv = state.DocValues;
+
+            foreach (FacetRequest request in fsp.facetRequests)
+            {
+                if (!(request is CountFacetRequest))
+                {
+                    throw new ArgumentException(@"this collector only supports CountFacetRequest; got " + request);
+                }
+
+                if (request.categoryPath.length != 1)
+                {
+                    throw new ArgumentException(@"this collector only supports depth 1 CategoryPath; got " + request.categoryPath);
+                }
+
+                if (request.Depth != 1)
+                {
+                    throw new ArgumentException(@"this collector only supports depth=1; got " + request.Depth);
+                }
+
+                string dim = request.categoryPath.components[0];
+                SortedSetDocValuesReaderState.OrdRange ordRange = state.GetOrdRange(dim);
+                if (ordRange == null)
+                {
+                    throw new ArgumentException("dim \"" + dim + "\" does not exist");
+                }
+            }
+        }
+
+        /// <summary>
+        /// Returns a new aggregator that counts ordinals of this accumulator's field.
+        /// </summary>
+        public override IFacetsAggregator Aggregator
+        {
+            get
+            {
+                return new AnonymousFacetsAggregator(this);
+            }
+        }
+
+        /// <summary>
+        /// Counts, per matching document, every ordinal of the doc values field
+        /// into the int array of the supplied <see cref="FacetArrays"/>.
+        /// </summary>
+        private sealed class AnonymousFacetsAggregator : IFacetsAggregator
+        {
+            public AnonymousFacetsAggregator(SortedSetDocValuesAccumulator parent)
+            {
+                this.parent = parent;
+            }
+
+            private readonly SortedSetDocValuesAccumulator parent;
+
+            /// <summary>
+            /// Adds the ordinal counts of one segment's matching documents to
+            /// <paramref name="facetArrays"/>. Does nothing when the segment has
+            /// no doc values for the field.
+            /// </summary>
+            public void Aggregate(FacetsCollector.MatchingDocs matchingDocs, CategoryListParams clp, FacetArrays facetArrays)
+            {
+                SortedSetDocValues segValues = matchingDocs.context.AtomicReader.GetSortedSetDocValues(parent.field);
+                if (segValues == null)
+                {
+                    return;
+                }
+
+                int[] counts = facetArrays.GetIntArray();
+                int maxDoc = matchingDocs.context.AtomicReader.MaxDoc;
+                if (parent.dv is MultiDocValues.MultiSortedSetDocValues)
+                {
+                    // Top-level values span several segments: segment-local ords
+                    // must be translated to global ords through the ordinal map.
+                    MultiDocValues.OrdinalMap ordinalMap = ((MultiDocValues.MultiSortedSetDocValues)parent.dv).mapping;
+                    int segOrd = matchingDocs.context.ord;
+                    int numSegOrds = (int)segValues.ValueCount;
+                    if (matchingDocs.totalHits < numSegOrds / 10)
+                    {
+                        // Few hits relative to the number of segment ords:
+                        // remap each ord to its global ord as it is seen.
+                        int doc = 0;
+                        while (doc < maxDoc && (doc = matchingDocs.bits.NextSetBit(doc)) != -1)
+                        {
+                            segValues.SetDocument(doc);
+                            int term = (int)segValues.NextOrd();
+                            while (term != SortedSetDocValues.NO_MORE_ORDS)
+                            {
+                                counts[(int)ordinalMap.GetGlobalOrd(segOrd, term)]++;
+                                term = (int)segValues.NextOrd();
+                            }
+
+                            ++doc;
+                        }
+                    }
+                    else
+                    {
+                        // Many hits: count in segment ord space first, then remap
+                        // every non-zero segment ord exactly once.
+                        int[] segCounts = new int[numSegOrds];
+                        int doc = 0;
+                        while (doc < maxDoc && (doc = matchingDocs.bits.NextSetBit(doc)) != -1)
+                        {
+                            segValues.SetDocument(doc);
+                            int term = (int)segValues.NextOrd();
+                            while (term != SortedSetDocValues.NO_MORE_ORDS)
+                            {
+                                segCounts[term]++;
+                                term = (int)segValues.NextOrd();
+                            }
+
+                            ++doc;
+                        }
+
+                        for (int ord = 0; ord < numSegOrds; ord++)
+                        {
+                            int count = segCounts[ord];
+                            if (count != 0)
+                            {
+                                counts[(int)ordinalMap.GetGlobalOrd(segOrd, ord)] += count;
+                            }
+                        }
+                    }
+                }
+                else
+                {
+                    // Top-level values are not a multi-segment view: segment ords
+                    // are used directly as indexes into the counts array.
+                    int doc = 0;
+                    while (doc < maxDoc && (doc = matchingDocs.bits.NextSetBit(doc)) != -1)
+                    {
+                        segValues.SetDocument(doc);
+                        int term = (int)segValues.NextOrd();
+                        while (term != SortedSetDocValues.NO_MORE_ORDS)
+                        {
+                            counts[term]++;
+                            term = (int)segValues.NextOrd();
+                        }
+
+                        ++doc;
+                    }
+                }
+            }
+
+            /// <summary>
+            /// Intentionally empty: this aggregator performs no rollup.
+            /// </summary>
+            public void RollupValues(FacetRequest fr, int ordinal, int[] children, int[] siblings, FacetArrays facetArrays)
+            {
+            }
+
+            /// <summary>
+            /// Always <c>false</c>; counting does not use document scores.
+            /// </summary>
+            public bool RequiresDocScores
+            {
+                get
+                {
+                    return false;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Priority queue keeping the top-N nodes. A node is "less than" another
+        /// when its value is smaller, or on equal values when its ordinal is larger.
+        /// </summary>
+        class TopCountPQ : Lucene.Net.Util.PriorityQueue<FacetResultNode>
+        {
+            public TopCountPQ(int topN)
+                : base(topN, false)
+            {
+            }
+
+            public override bool LessThan(FacetResultNode a, FacetResultNode b)
+            {
+                if (a.value < b.value)
+                {
+                    return true;
+                }
+                else if (a.value > b.value)
+                {
+                    return false;
+                }
+                else
+                {
+                    return a.ordinal > b.ordinal;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Resolves the label of a global ordinal by splitting its stored value
+        /// on the facet delimiter character.
+        /// </summary>
+        /// <remarks>
+        /// Splits on <c>state.separator</c>, the raw delimiter char.
+        /// <c>state.separatorRegex</c> is a regex-escaped pattern, and
+        /// <see cref="string.Split(char[])"/> matches literally, so passing the
+        /// escaped pattern would never split when the delimiter is a regex
+        /// metacharacter.
+        /// </remarks>
+        private CategoryPath LookupLabel(int ord, BytesRef scratch)
+        {
+            dv.LookupOrd(ord, scratch);
+            return new CategoryPath(scratch.Utf8ToString().Split(state.separator));
+        }
+
+        /// <summary>
+        /// Builds the root (dimension) node of a result. Its value is
+        /// <paramref name="dimCount"/>, or 0 when the dimension's ordinal policy
+        /// is <c>ALL_BUT_DIMENSION</c>.
+        /// </summary>
+        private FacetResultNode CreateRootNode(FacetRequest request, string dim, int dimCount)
+        {
+            CategoryListParams.OrdinalPolicy op = searchParams.indexingParams.GetCategoryListParams(request.categoryPath).GetOrdinalPolicy(dim);
+            if (op == CategoryListParams.OrdinalPolicy.ALL_BUT_DIMENSION)
+            {
+                dimCount = 0;
+            }
+
+            FacetResultNode rootNode = new FacetResultNode(-1, dimCount);
+            rootNode.label = new CategoryPath(new string[] { dim });
+            return rootNode;
+        }
+
+        /// <summary>
+        /// Aggregates counts over all matching documents and builds one
+        /// <see cref="FacetResult"/> per facet request.
+        /// </summary>
+        /// <param name="matchingDocs">Per-segment matching documents.</param>
+        /// <returns>The results, in the order of the facet requests.</returns>
+        public override List<FacetResult> Accumulate(List<FacetsCollector.MatchingDocs> matchingDocs)
+        {
+            IFacetsAggregator aggregator = Aggregator;
+            foreach (CategoryListParams clp in GetCategoryLists())
+            {
+                foreach (FacetsCollector.MatchingDocs md in matchingDocs)
+                {
+                    aggregator.Aggregate(md, clp, facetArrays);
+                }
+            }
+
+            List<FacetResult> results = new List<FacetResult>();
+            int[] counts = facetArrays.GetIntArray();
+            BytesRef scratch = new BytesRef();
+            foreach (FacetRequest request in searchParams.facetRequests)
+            {
+                string dim = request.categoryPath.components[0];
+                SortedSetDocValuesReaderState.OrdRange ordRange = state.GetOrdRange(dim);
+                if (request.numResults >= ordRange.end - ordRange.start + 1)
+                {
+                    // Every value of the dimension fits in the requested size:
+                    // collect all non-zero counts and sort them, no queue needed.
+                    List<FacetResultNode> nodes = new List<FacetResultNode>();
+                    int dimCount = 0;
+                    for (int ord = ordRange.start; ord <= ordRange.end; ord++)
+                    {
+                        if (counts[ord] != 0)
+                        {
+                            dimCount += counts[ord];
+                            FacetResultNode node = new FacetResultNode(ord, counts[ord]);
+                            node.label = LookupLabel(ord, scratch);
+                            nodes.Add(node);
+                        }
+                    }
+
+                    nodes.Sort(new AnonymousComparator());
+                    FacetResultNode rootNode = CreateRootNode(request, dim, dimCount);
+                    rootNode.subResults = nodes;
+                    results.Add(new FacetResult(request, rootNode, nodes.Count));
+                    continue;
+                }
+
+                // More values than requested: keep only the top numResults in a queue.
+                TopCountPQ q = new TopCountPQ(request.numResults);
+                int bottomCount = 0;
+                int dimCount2 = 0;
+                int childCount = 0;
+                FacetResultNode reuse = null;
+                for (int ord = ordRange.start; ord <= ordRange.end; ord++)
+                {
+                    if (counts[ord] > 0)
+                    {
+                        childCount++;
+                        if (counts[ord] > bottomCount)
+                        {
+                            // NOTE(review): only counts offered to the queue are
+                            // summed into the dimension total, so counts at or
+                            // below the current bottom are left out; kept as in
+                            // the original code.
+                            dimCount2 += counts[ord];
+                            if (reuse == null)
+                            {
+                                reuse = new FacetResultNode(ord, counts[ord]);
+                            }
+                            else
+                            {
+                                reuse.ordinal = ord;
+                                reuse.value = counts[ord];
+                            }
+
+                            // The returned node is recycled for the next candidate.
+                            reuse = q.InsertWithOverflow(reuse);
+                            if (q.Size == request.numResults)
+                            {
+                                bottomCount = (int)q.Top().value;
+                            }
+                        }
+                    }
+                }
+
+                FacetResultNode rootNode2 = CreateRootNode(request, dim, dimCount2);
+
+                // Fill the array from the back so the node popped last ends up first.
+                FacetResultNode[] childNodes = new FacetResultNode[q.Size];
+                for (int i = childNodes.Length - 1; i >= 0; i--)
+                {
+                    childNodes[i] = q.Pop();
+                    childNodes[i].label = LookupLabel(childNodes[i].ordinal, scratch);
+                }
+
+                rootNode2.subResults = childNodes;
+                results.Add(new FacetResult(request, rootNode2, childCount));
+            }
+
+            return results;
+        }
+
+        /// <summary>
+        /// Orders nodes by descending value, breaking ties by ascending ordinal.
+        /// Uses <c>CompareTo</c> rather than a subtraction truncated to
+        /// <c>int</c>, so the sign of the result never depends on a narrowing cast.
+        /// </summary>
+        private sealed class AnonymousComparator : IComparer<FacetResultNode>
+        {
+            public int Compare(FacetResultNode o1, FacetResultNode o2)
+            {
+                int byValue = o2.value.CompareTo(o1.value);
+                if (byValue != 0)
+                {
+                    return byValue;
+                }
+
+                return o1.ordinal.CompareTo(o2.ordinal);
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4a21b967/src/contrib/Facet/SortedSet/SortedSetDocValuesFacetFields.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Facet/SortedSet/SortedSetDocValuesFacetFields.cs 
b/src/contrib/Facet/SortedSet/SortedSetDocValuesFacetFields.cs
new file mode 100644
index 0000000..bc7f0c9
--- /dev/null
+++ b/src/contrib/Facet/SortedSet/SortedSetDocValuesFacetFields.cs
@@ -0,0 +1,57 @@
+using Lucene.Net.Documents;
+using Lucene.Net.Facet.Index;
+using Lucene.Net.Facet.Params;
+using Lucene.Net.Facet.Taxonomy;
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Facet.SortedSet
+{
+    /// <summary>
+    /// A <see cref="FacetFields"/> variant that, for every category, adds a
+    /// <see cref="SortedSetDocValuesField"/> next to the drill-down field.
+    /// Only two-component categories (dimension + label) are accepted, and
+    /// partitioned indexing parameters are rejected.
+    /// </summary>
+    public class SortedSetDocValuesFacetFields : FacetFields
+    {
+        /// <summary>
+        /// Creates an instance using <see cref="FacetIndexingParams.DEFAULT"/>.
+        /// </summary>
+        public SortedSetDocValuesFacetFields()
+            : this(FacetIndexingParams.DEFAULT)
+        {
+        }
+
+        /// <summary>
+        /// Creates an instance for the given indexing parameters.
+        /// </summary>
+        /// <param name="fip">Indexing parameters; the partition size must be <see cref="int.MaxValue"/>.</param>
+        /// <exception cref="ArgumentException">The partition size is anything other than <see cref="int.MaxValue"/>.</exception>
+        public SortedSetDocValuesFacetFields(FacetIndexingParams fip)
+            : base(null, fip)
+        {
+            bool usesPartitions = fip.PartitionSize != int.MaxValue;
+            if (usesPartitions)
+            {
+                throw new ArgumentException(@"partitions are not supported");
+            }
+        }
+
+        /// <summary>
+        /// Adds to <paramref name="doc"/>, per category list, one doc values
+        /// field for each category followed by a single drill-down field.
+        /// </summary>
+        /// <param name="doc">The document that receives the fields.</param>
+        /// <param name="categories">The categories to index; must not be null.</param>
+        /// <exception cref="ArgumentException">
+        /// <paramref name="categories"/> is null, or a category does not have
+        /// exactly two components.
+        /// </exception>
+        public override void AddFields(Document doc, IEnumerable<CategoryPath> categories)
+        {
+            if (categories == null)
+            {
+                throw new ArgumentException(@"categories should not be null");
+            }
+
+            IDictionary<CategoryListParams, IEnumerable<CategoryPath>> byCategoryList = CreateCategoryListMapping(categories);
+            foreach (KeyValuePair<CategoryListParams, IEnumerable<CategoryPath>> entry in byCategoryList)
+            {
+                CategoryListParams listParams = entry.Key;
+                IEnumerable<CategoryPath> paths = entry.Value;
+
+                // The doc values field name is the category list field plus a fixed extension.
+                string docValuesField = listParams.field + SortedSetDocValuesReaderState.FACET_FIELD_EXTENSION;
+                foreach (CategoryPath path in paths)
+                {
+                    if (path.length != 2)
+                    {
+                        throw new ArgumentException(@"only flat facets (dimension + label) are currently supported; got " + path);
+                    }
+
+                    // Stored value is the path rendered with the facet delimiter char.
+                    BytesRef encoded = new BytesRef(path.ToString(indexingParams.FacetDelimChar));
+                    doc.Add(new SortedSetDocValuesField(docValuesField, encoded));
+                }
+
+                // One drill-down field per category list, added after its doc values fields.
+                DrillDownStream stream = GetDrillDownStream(paths);
+                doc.Add(new Field(listParams.field, stream, DrillDownFieldType()));
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4a21b967/src/contrib/Facet/SortedSet/SortedSetDocValuesReaderState.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Facet/SortedSet/SortedSetDocValuesReaderState.cs 
b/src/contrib/Facet/SortedSet/SortedSetDocValuesReaderState.cs
new file mode 100644
index 0000000..2e79a67
--- /dev/null
+++ b/src/contrib/Facet/SortedSet/SortedSetDocValuesReaderState.cs
@@ -0,0 +1,125 @@
+using Lucene.Net.Facet.Params;
+using Lucene.Net.Index;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Facet.SortedSet
+{
+    /// <summary>
+    /// Holds, for one reader, the top-level <see cref="SortedSetDocValues"/> of
+    /// the facet field together with the inclusive ordinal range of every
+    /// dimension, computed once at construction time.
+    /// </summary>
+    public sealed class SortedSetDocValuesReaderState
+    {
+        private readonly string field;
+        private readonly AtomicReader topReader;
+        private readonly int valueCount;
+
+        // The raw facet delimiter; use this one with string.Split.
+        internal readonly char separator;
+
+        // Regex-escaped form of the delimiter. This is a regular-expression
+        // pattern: it is only equal to the delimiter when the delimiter is not a
+        // regex metacharacter, so it must not be used for literal splitting.
+        // Kept so existing callers of the field continue to compile.
+        internal readonly string separatorRegex;
+
+        /// <summary>Suffix appended to the category list field to name the doc values field.</summary>
+        public static readonly string FACET_FIELD_EXTENSION = @"_sorted_doc_values";
+
+        /// <summary>
+        /// Inclusive range of ordinals belonging to one dimension.
+        /// </summary>
+        internal sealed class OrdRange
+        {
+            public readonly int start;
+            public readonly int end;
+            public OrdRange(int start, int end)
+            {
+                this.start = start;
+                this.end = end;
+            }
+        }
+
+        private readonly IDictionary<String, OrdRange> prefixToOrdRange = new HashMap<String, OrdRange>();
+
+        /// <summary>
+        /// Creates the state using <see cref="FacetIndexingParams.DEFAULT"/>.
+        /// </summary>
+        public SortedSetDocValuesReaderState(IndexReader reader)
+            : this(FacetIndexingParams.DEFAULT, reader)
+        {
+        }
+
+        /// <summary>
+        /// Creates the state and scans every value of the facet field to build
+        /// the dimension-to-ordinal-range map.
+        /// </summary>
+        /// <param name="fip">Indexing parameters supplying the field name and the delimiter char.</param>
+        /// <param name="reader">
+        /// The reader to inspect; anything that is not an <see cref="AtomicReader"/>
+        /// is cast to <see cref="CompositeReader"/> and wrapped in a
+        /// <see cref="SlowCompositeReaderWrapper"/>.
+        /// </param>
+        /// <exception cref="ArgumentException">
+        /// The field has no sorted-set doc values, has more than
+        /// <see cref="int.MaxValue"/> values, or contains a value that does not
+        /// split into exactly two components.
+        /// </exception>
+        public SortedSetDocValuesReaderState(FacetIndexingParams fip, IndexReader reader)
+        {
+            this.field = fip.GetCategoryListParams(null).field + FACET_FIELD_EXTENSION;
+            this.separator = fip.FacetDelimChar;
+            this.separatorRegex = Regex.Escape(separator.ToString());
+            if (reader is AtomicReader)
+            {
+                topReader = (AtomicReader)reader;
+            }
+            else
+            {
+                topReader = new SlowCompositeReaderWrapper((CompositeReader)reader);
+            }
+
+            SortedSetDocValues dv = topReader.GetSortedSetDocValues(field);
+            if (dv == null)
+            {
+                throw new ArgumentException("field \"" + field + "\" was not indexed with SortedSetDocValues");
+            }
+
+            if (dv.ValueCount > int.MaxValue)
+            {
+                throw new ArgumentException(@"can only handle valueCount < Integer.MAX_VALUE; got " + dv.ValueCount);
+            }
+
+            valueCount = (int)dv.ValueCount;
+            string lastDim = null;
+            int startOrd = -1;
+            BytesRef spare = new BytesRef();
+
+            // Walk the ords in order; a new range starts whenever the dimension
+            // (first component) changes. This relies on values of one dimension
+            // having consecutive ords - presumably guaranteed by the sorted
+            // ordering of the doc values; confirm against SortedSetDocValues.
+            for (int ord = 0; ord < valueCount; ord++)
+            {
+                dv.LookupOrd(ord, spare);
+
+                // Split on the raw delimiter char: string.Split matches literally,
+                // so the regex-escaped pattern would never match a delimiter that
+                // is a regex metacharacter.
+                String[] components = spare.Utf8ToString().Split(separator);
+                if (components.Length != 2)
+                {
+                    throw new ArgumentException(@"this class can only handle 2 level hierarchy (dim/value); got: " + spare.Utf8ToString());
+                }
+
+                if (!components[0].Equals(lastDim))
+                {
+                    if (lastDim != null)
+                    {
+                        // Close the previous dimension's range at the preceding ord.
+                        prefixToOrdRange[lastDim] = new OrdRange(startOrd, ord - 1);
+                    }
+
+                    startOrd = ord;
+                    lastDim = components[0];
+                }
+            }
+
+            // Close the range of the final dimension, if any value was seen.
+            if (lastDim != null)
+            {
+                prefixToOrdRange[lastDim] = new OrdRange(startOrd, valueCount - 1);
+            }
+        }
+
+        /// <summary>
+        /// Fetches the top-level doc values of the facet field from the reader
+        /// on every access.
+        /// </summary>
+        internal SortedSetDocValues DocValues
+        {
+            get
+            {
+                return topReader.GetSortedSetDocValues(field);
+            }
+        }
+
+        /// <summary>
+        /// Returns the ordinal range recorded for <paramref name="dim"/>.
+        /// </summary>
+        /// <remarks>
+        /// NOTE(review): callers test the result for <c>null</c>, so this
+        /// presumably relies on the HashMap indexer returning <c>null</c> for an
+        /// unknown key instead of throwing - confirm against
+        /// Lucene.Net.Support.HashMap.
+        /// </remarks>
+        internal OrdRange GetOrdRange(string dim)
+        {
+            return prefixToOrdRange[dim];
+        }
+
+        /// <summary>Name of the doc values field this state was built for.</summary>
+        internal string Field
+        {
+            get
+            {
+                return field;
+            }
+        }
+
+        /// <summary>Number of distinct values found in the doc values field at construction time.</summary>
+        internal int Size
+        {
+            get
+            {
+                return valueCount;
+            }
+        }
+    }
+}

Reply via email to