// Origin: src/contrib/Sandbox/Queries/SlowCollatedStringComparer.cs (lucenenet commit b43f529b, new file).
// NOTE(review): the file is named "...Comparer.cs" while the type is "...Comparator"; the type name is kept
// unchanged so existing callers still compile.
namespace Lucene.Net.Sandbox.Queries
{
    // Directives are scoped to the namespace so this unit stays valid wherever it sits in a larger file.
    using Lucene.Net.Index;
    using Lucene.Net.Search;
    using Lucene.Net.Util;
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;

    /// <summary>
    /// Field comparator that orders documents by the string value of one field, using a
    /// caller-supplied <see cref="StringComparer"/> for the actual string comparison.
    /// A missing value is represented as <c>null</c> and sorts before every non-null value.
    /// </summary>
    public sealed class SlowCollatedStringComparator : FieldComparator<string>
    {
        /// <summary>One decoded field value per hit slot; <c>null</c> means "no value".</summary>
        private readonly string[] values;

        /// <summary>Per-segment term source, replaced on every <see cref="SetNextReader"/> call.</summary>
        private BinaryDocValues currentDocTerms;

        private readonly string field;
        private readonly StringComparer collator;

        /// <summary>Value of the slot most recently passed to <see cref="SetBottom"/>.</summary>
        private string bottom;

        /// <summary>Scratch buffer reused for every document read.</summary>
        private readonly BytesRef tempBR = new BytesRef();

        /// <summary>Creates a comparator with room for <paramref name="numHits"/> slots.</summary>
        /// <param name="numHits">Number of slots to allocate.</param>
        /// <param name="field">Name of the field whose terms are read from the field cache.</param>
        /// <param name="collator">Comparer applied to two non-null values.</param>
        public SlowCollatedStringComparator(int numHits, string field, StringComparer collator)
        {
            values = new string[numHits];
            this.field = field;
            this.collator = collator;
        }

        /// <summary>Compares the values stored in two slots.</summary>
        public override int Compare(int slot1, int slot2)
        {
            return CompareValues(values[slot1], values[slot2]);
        }

        /// <summary>Compares the current bottom value against the value of <paramref name="doc"/>.</summary>
        public override int CompareBottom(int doc)
        {
            // The document is read first, exactly as before, then ordered against the bottom value.
            string docValue = ReadDocValue(doc);
            return CompareValues(bottom, docValue);
        }

        /// <summary>Stores the value of <paramref name="doc"/> (or <c>null</c>) in <paramref name="slot"/>.</summary>
        public override void Copy(int slot, int doc)
        {
            values[slot] = ReadDocValue(doc);
        }

        /// <summary>
        /// Switches to the terms of the given segment, obtained from <c>FieldCache.DEFAULT</c>.
        /// </summary>
        /// <returns>This instance.</returns>
        public override FieldComparator SetNextReader(AtomicReaderContext context)
        {
            currentDocTerms = FieldCache.DEFAULT.GetTerms(context.AtomicReader, field);
            return this;
        }

        /// <summary>Remembers the value held by slot <paramref name="bottom"/> for <see cref="CompareBottom"/>.</summary>
        public override void SetBottom(int bottom)
        {
            this.bottom = values[bottom];
        }

        /// <summary>Returns the value stored in <paramref name="slot"/>; may be <c>null</c>.</summary>
        public override object Value(int slot)
        {
            return values[slot];
        }

        /// <summary>
        /// Orders two values: <c>null</c> equals <c>null</c>, <c>null</c> precedes any non-null value,
        /// and two non-null values are ordered by the collator.
        /// </summary>
        public override int CompareValues(string first, string second)
        {
            if (first == null)
            {
                return second == null ? 0 : -1;
            }

            if (second == null)
            {
                return 1;
            }

            return collator.Compare(first, second);
        }

        /// <summary>Compares the value of <paramref name="doc"/> against <paramref name="value"/>.</summary>
        public override int CompareDocToValue(int doc, string value)
        {
            return CompareValues(ReadDocValue(doc), value);
        }

        /// <summary>
        /// Loads the term bytes of <paramref name="doc"/> into the shared scratch buffer and decodes them.
        /// Returns <c>null</c> when the buffer's byte array is the <c>BinaryDocValues.MISSING</c> instance
        /// (presumably the marker for a document without a value — confirm against BinaryDocValues).
        /// </summary>
        private string ReadDocValue(int doc)
        {
            currentDocTerms.Get(doc, tempBR);
            return tempBR.bytes == BinaryDocValues.MISSING ? null : tempBR.Utf8ToString();
        }
    }
}
// Origin (lucenenet commit b43f529b, all new files under src/contrib/Sandbox/Queries/):
//   SlowCollatedTermRangeFilter.cs, SlowCollatedTermRangeQuery.cs, SlowCollatedTermRangeTermsEnum.cs,
//   SlowFuzzyQuery.cs, SlowFuzzyTermsEnum.cs
namespace Lucene.Net.Sandbox.Queries
{
    // Directives are scoped to the namespace so this unit stays valid wherever it sits in a larger file.
    using Lucene.Net.Index;
    using Lucene.Net.Search;
    using Lucene.Net.Support;
    using Lucene.Net.Util;
    using Lucene.Net.Util.Automaton;
    using System;
    using System.Collections.Generic;
    using System.Globalization;
    using System.Linq;
    using System.Text;

    /// <summary>
    /// Filter form of <see cref="SlowCollatedTermRangeQuery"/>: wraps a freshly built range query
    /// and exposes that query's settings.
    /// </summary>
    public class SlowCollatedTermRangeFilter : MultiTermQueryWrapperFilter<SlowCollatedTermRangeQuery>
    {
        /// <summary>Builds the wrapped range query from the given arguments.</summary>
        /// <param name="fieldName">Field the range applies to.</param>
        /// <param name="lowerTerm">Lower bound, or <c>null</c> for an open lower end.</param>
        /// <param name="upperTerm">Upper bound, or <c>null</c> for an open upper end.</param>
        /// <param name="includeLower">Whether the lower bound itself matches.</param>
        /// <param name="includeUpper">Whether the upper bound itself matches.</param>
        /// <param name="collator">Comparer used to decide range membership.</param>
        public SlowCollatedTermRangeFilter(string fieldName, string lowerTerm, string upperTerm, bool includeLower, bool includeUpper, StringComparer collator)
            : base(new SlowCollatedTermRangeQuery(fieldName, lowerTerm, upperTerm, includeLower, includeUpper, collator))
        {
        }

        /// <summary>Returns the wrapped query's lower bound.</summary>
        public virtual string GetLowerTerm()
        {
            return query.GetLowerTerm();
        }

        /// <summary>Returns the wrapped query's upper bound.</summary>
        public virtual string GetUpperTerm()
        {
            return query.GetUpperTerm();
        }

        /// <summary>Returns whether the wrapped query includes its lower bound.</summary>
        public virtual bool IncludesLower()
        {
            return query.IncludesLower();
        }

        /// <summary>Returns whether the wrapped query includes its upper bound.</summary>
        public virtual bool IncludesUpper()
        {
            return query.IncludesUpper();
        }

        /// <summary>Returns the wrapped query's comparer.</summary>
        public virtual StringComparer GetCollator()
        {
            return query.GetCollator();
        }
    }

    /// <summary>
    /// Multi-term query matching every term of a field that lies between two bounds, where
    /// "between" is decided by a caller-supplied <see cref="StringComparer"/> rather than by term byte order.
    /// </summary>
    public class SlowCollatedTermRangeQuery : MultiTermQuery
    {
        private readonly string lowerTerm;
        private readonly string upperTerm;
        private readonly bool includeLower;
        private readonly bool includeUpper;
        private readonly StringComparer collator;

        /// <summary>Creates a range query over <paramref name="field"/>.</summary>
        /// <param name="field">Field the range applies to.</param>
        /// <param name="lowerTerm">Lower bound, or <c>null</c> for an open lower end.</param>
        /// <param name="upperTerm">Upper bound, or <c>null</c> for an open upper end.</param>
        /// <param name="includeLower">Whether the lower bound itself matches.</param>
        /// <param name="includeUpper">Whether the upper bound itself matches.</param>
        /// <param name="collator">Comparer used to decide range membership.</param>
        public SlowCollatedTermRangeQuery(string field, string lowerTerm, string upperTerm, bool includeLower, bool includeUpper, StringComparer collator)
            : base(field)
        {
            this.lowerTerm = lowerTerm;
            this.upperTerm = upperTerm;
            this.includeLower = includeLower;
            this.includeUpper = includeUpper;
            this.collator = collator;
        }

        /// <summary>Returns the lower bound (may be <c>null</c>).</summary>
        public virtual string GetLowerTerm()
        {
            return lowerTerm;
        }

        /// <summary>Returns the upper bound (may be <c>null</c>).</summary>
        public virtual string GetUpperTerm()
        {
            return upperTerm;
        }

        /// <summary>Returns whether the lower bound itself matches.</summary>
        public virtual bool IncludesLower()
        {
            return includeLower;
        }

        /// <summary>Returns whether the upper bound itself matches.</summary>
        public virtual bool IncludesUpper()
        {
            return includeUpper;
        }

        /// <summary>Returns the comparer used to decide range membership.</summary>
        public virtual StringComparer GetCollator()
        {
            return collator;
        }

        /// <summary>
        /// Chooses the term enumeration for this query: empty when the lower bound sorts after the upper
        /// bound, the unfiltered iterator when both bounds are open, otherwise a collating range filter.
        /// </summary>
        protected override TermsEnum GetTermsEnum(Terms terms, AttributeSource atts)
        {
            bool bothBoundsSet = lowerTerm != null && upperTerm != null;
            if (bothBoundsSet && collator.Compare(lowerTerm, upperTerm) > 0)
            {
                return TermsEnum.EMPTY;
            }

            TermsEnum tenum = terms.Iterator(null);
            if (lowerTerm == null && upperTerm == null)
            {
                return tenum;
            }

            return new SlowCollatedTermRangeTermsEnum(tenum, lowerTerm, upperTerm, includeLower, includeUpper, collator);
        }

        /// <summary>Name of the field this query applies to (delegates to the base implementation).</summary>
        public override string Field
        {
            get
            {
                return base.Field;
            }
        }

        /// <summary>
        /// Renders the range as <c>[lower TO upper]</c> (square bracket = inclusive, curly = exclusive,
        /// <c>*</c> = open end), prefixed with <c>field:</c> when it differs from <paramref name="field"/>,
        /// followed by the boost text from <c>ToStringUtils.Boost</c>.
        /// </summary>
        public override string ToString(string field)
        {
            StringBuilder buffer = new StringBuilder();
            if (!Field.Equals(field))
            {
                buffer.Append(Field);
                buffer.Append(":");
            }

            buffer.Append(includeLower ? '[' : '{');
            buffer.Append(lowerTerm ?? "*");
            buffer.Append(" TO ");
            buffer.Append(upperTerm ?? "*");
            buffer.Append(includeUpper ? ']' : '}');
            buffer.Append(ToStringUtils.Boost(Boost));
            return buffer.ToString();
        }

        /// <summary>Hash over the base hash, the comparer, both inclusion flags and both bounds.</summary>
        public override int GetHashCode()
        {
            // Wrap-around is intended; 'unchecked' keeps this from throwing OverflowException when the
            // assembly is compiled with arithmetic overflow checking enabled.
            unchecked
            {
                const int prime = 31;
                int result = base.GetHashCode();
                result = prime * result + ((collator == null) ? 0 : collator.GetHashCode());
                result = prime * result + (includeLower ? 1231 : 1237);
                result = prime * result + (includeUpper ? 1231 : 1237);
                result = prime * result + ((lowerTerm == null) ? 0 : lowerTerm.GetHashCode());
                result = prime * result + ((upperTerm == null) ? 0 : upperTerm.GetHashCode());
                return result;
            }
        }

        /// <summary>
        /// Two queries are equal when the base class considers them equal, they have the same runtime
        /// type, and comparer, inclusion flags and both bounds match.
        /// </summary>
        public override bool Equals(object obj)
        {
            if (ReferenceEquals(this, obj))
            {
                return true;
            }

            if (!base.Equals(obj))
            {
                return false;
            }

            if (GetType() != obj.GetType())
            {
                return false;
            }

            SlowCollatedTermRangeQuery other = (SlowCollatedTermRangeQuery)obj;

            // object.Equals / string.Equals are the null-safe forms of the instance calls used before.
            return object.Equals(collator, other.collator)
                && includeLower == other.includeLower
                && includeUpper == other.includeUpper
                && string.Equals(lowerTerm, other.lowerTerm)
                && string.Equals(upperTerm, other.upperTerm);
        }
    }

    /// <summary>
    /// Filters a <see cref="TermsEnum"/> down to the terms whose decoded text lies inside a range,
    /// with membership decided by a <see cref="StringComparer"/>. Every term is tested because the
    /// seek position is the empty term and <see cref="Accept"/> never returns <c>END</c>.
    /// </summary>
    public class SlowCollatedTermRangeTermsEnum : FilteredTermsEnum
    {
        private readonly StringComparer collator;
        private readonly string upperTermText;
        private readonly string lowerTermText;
        private readonly bool includeLower;
        private readonly bool includeUpper;

        /// <summary>Creates the filtering enumeration.</summary>
        /// <param name="tenum">Enumeration to filter.</param>
        /// <param name="lowerTermText">Lower bound; <c>null</c> is replaced by the empty string, inclusive.</param>
        /// <param name="upperTermText">Upper bound; <c>null</c> means no upper limit.</param>
        /// <param name="includeLower">Whether the lower bound itself matches.</param>
        /// <param name="includeUpper">Whether the upper bound itself matches.</param>
        /// <param name="collator">Comparer used to decide range membership.</param>
        public SlowCollatedTermRangeTermsEnum(TermsEnum tenum, string lowerTermText, string upperTermText, bool includeLower, bool includeUpper, StringComparer collator)
            : base(tenum)
        {
            this.collator = collator;
            this.upperTermText = upperTermText;
            this.lowerTermText = lowerTermText;
            this.includeLower = includeLower;
            this.includeUpper = includeUpper;

            // Normalise an open lower end to "from the empty string, inclusive".
            if (this.lowerTermText == null)
            {
                this.lowerTermText = "";
                this.includeLower = true;
            }

            // Start from the empty term: collation order need not follow byte order, so no later seek
            // position is assumed to be safe.
            BytesRef startBytesRef = new BytesRef("");
            InitialSeekTerm = startBytesRef;
        }

        /// <summary>
        /// Accepts <paramref name="term"/> when its text is above the lower bound and, if an upper bound
        /// exists, below it (each bound inclusive or exclusive as configured); otherwise rejects it.
        /// </summary>
        protected override AcceptStatus Accept(BytesRef term)
        {
            // Decode once; the previous form decoded the UTF-8 bytes again for each comparison.
            string text = term.Utf8ToString();

            int lowerCmp = collator.Compare(text, lowerTermText);
            bool aboveLower = includeLower ? lowerCmp >= 0 : lowerCmp > 0;
            if (!aboveLower)
            {
                return AcceptStatus.NO;
            }

            if (upperTermText != null)
            {
                int upperCmp = collator.Compare(text, upperTermText);
                bool belowUpper = includeUpper ? upperCmp <= 0 : upperCmp < 0;
                if (!belowUpper)
                {
                    return AcceptStatus.NO;
                }
            }

            return AcceptStatus.YES;
        }
    }

    /// <summary>
    /// Fuzzy multi-term query that enumerates terms through <see cref="SlowFuzzyTermsEnum"/>.
    /// </summary>
    public class SlowFuzzyQuery : MultiTermQuery
    {
        public static readonly float defaultMinSimilarity = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
        public static readonly int defaultPrefixLength = 0;
        public static readonly int defaultMaxExpansions = 50;

        private float minimumSimilarity;
        private int prefixLength;

        /// <summary>False when the term is too short for the threshold; see the constructor.</summary>
        private bool termLongEnough = false;

        protected Term term;

        /// <summary>Creates a fuzzy query for <paramref name="term"/>.</summary>
        /// <param name="term">Term to match approximately.</param>
        /// <param name="minimumSimilarity">
        /// Non-negative threshold; values of 1 or more must be whole numbers (the error message calls
        /// them edit distances).
        /// </param>
        /// <param name="prefixLength">Non-negative prefix length passed on to the terms enumeration.</param>
        /// <param name="maxExpansions">Non-negative size for the top-terms scoring rewrite.</param>
        /// <exception cref="ArgumentException">If any numeric argument violates the rules above.</exception>
        public SlowFuzzyQuery(Term term, float minimumSimilarity, int prefixLength, int maxExpansions)
            : base(term.Field)
        {
            this.term = term;

            if (minimumSimilarity >= 1.0f && minimumSimilarity != (int)minimumSimilarity)
            {
                throw new ArgumentException("fractional edit distances are not allowed");
            }

            if (minimumSimilarity < 0.0f)
            {
                throw new ArgumentException("minimumSimilarity < 0");
            }

            if (prefixLength < 0)
            {
                throw new ArgumentException("prefixLength < 0");
            }

            if (maxExpansions < 0)
            {
                throw new ArgumentException("maxExpansions < 0");
            }

            SetRewriteMethod(new TopTermsScoringBooleanQueryRewrite(maxExpansions));

            // A non-empty term is "long enough" when the threshold is 1 or more, or when its length
            // exceeds 1 / (1 - minimumSimilarity). Otherwise only the exact term is enumerated.
            int len = term.Text.Length;
            if (len > 0 && (minimumSimilarity >= 1f || len > 1.0f / (1.0f - minimumSimilarity)))
            {
                this.termLongEnough = true;
            }

            this.minimumSimilarity = minimumSimilarity;
            this.prefixLength = prefixLength;
        }

        /// <summary>Uses <see cref="defaultMaxExpansions"/>.</summary>
        public SlowFuzzyQuery(Term term, float minimumSimilarity, int prefixLength)
            : this(term, minimumSimilarity, prefixLength, defaultMaxExpansions)
        {
        }

        /// <summary>Uses <see cref="defaultPrefixLength"/> and <see cref="defaultMaxExpansions"/>.</summary>
        public SlowFuzzyQuery(Term term, float minimumSimilarity)
            : this(term, minimumSimilarity, defaultPrefixLength, defaultMaxExpansions)
        {
        }

        /// <summary>Uses all three defaults.</summary>
        public SlowFuzzyQuery(Term term)
            : this(term, defaultMinSimilarity, defaultPrefixLength, defaultMaxExpansions)
        {
        }

        /// <summary>Returns the threshold given at construction.</summary>
        public virtual float GetMinSimilarity()
        {
            return minimumSimilarity;
        }

        /// <summary>Returns the prefix length given at construction.</summary>
        public virtual int GetPrefixLength()
        {
            return prefixLength;
        }

        /// <summary>
        /// Returns an exact single-term enumeration when the term is not long enough for the threshold,
        /// otherwise a <see cref="SlowFuzzyTermsEnum"/>.
        /// </summary>
        protected override TermsEnum GetTermsEnum(Terms terms, AttributeSource atts)
        {
            if (!termLongEnough)
            {
                return new SingleTermsEnum(terms.Iterator(null), term.Bytes);
            }

            return new SlowFuzzyTermsEnum(terms, atts, GetTerm(), minimumSimilarity, prefixLength);
        }

        /// <summary>Returns the term given at construction.</summary>
        public virtual Term GetTerm()
        {
            return term;
        }

        /// <summary>
        /// Renders the query as <c>text~similarity</c>, prefixed with <c>field:</c> when the term's field
        /// differs from <paramref name="field"/>, followed by the boost text from <c>ToStringUtils.Boost</c>.
        /// </summary>
        public override string ToString(string field)
        {
            StringBuilder buffer = new StringBuilder();
            if (!term.Field.Equals(field))
            {
                buffer.Append(term.Field);
                buffer.Append(":");
            }

            buffer.Append(term.Text);
            buffer.Append('~');

            // StringBuilder.Append(float) formats with the current culture, which yields e.g. "0,5"
            // under a comma-decimal locale. Format invariantly so the text is the same on every machine.
            buffer.Append(minimumSimilarity.ToString(CultureInfo.InvariantCulture));
            buffer.Append(ToStringUtils.Boost(Boost));
            return buffer.ToString();
        }

        /// <summary>Hash over the base hash, the threshold's bit pattern, the prefix length and the term.</summary>
        public override int GetHashCode()
        {
            // Wrap-around is intended; 'unchecked' keeps this from throwing OverflowException when the
            // assembly is compiled with arithmetic overflow checking enabled.
            unchecked
            {
                const int prime = 31;
                int result = base.GetHashCode();
                result = prime * result + Number.FloatToIntBits(minimumSimilarity);
                result = prime * result + prefixLength;
                result = prime * result + ((term == null) ? 0 : term.GetHashCode());
                return result;
            }
        }

        /// <summary>
        /// Two queries are equal when the base class considers them equal, they have the same runtime
        /// type, and threshold (compared by bit pattern), prefix length and term match.
        /// </summary>
        public override bool Equals(object obj)
        {
            if (ReferenceEquals(this, obj))
            {
                return true;
            }

            if (!base.Equals(obj))
            {
                return false;
            }

            if (GetType() != obj.GetType())
            {
                return false;
            }

            SlowFuzzyQuery other = (SlowFuzzyQuery)obj;
            return Number.FloatToIntBits(minimumSimilarity) == Number.FloatToIntBits(other.minimumSimilarity)
                && prefixLength == other.prefixLength
                && object.Equals(term, other.term);
        }
    }

    /// <summary>
    /// <see cref="FuzzyTermsEnum"/> variant that falls back to a linear, term-by-term edit-distance
    /// scan when no automaton-based enumeration is available for the requested distance.
    /// </summary>
    public sealed class SlowFuzzyTermsEnum : FuzzyTermsEnum
    {
        /// <summary>Passes all arguments to the base class, with <c>false</c> as its final argument.</summary>
        public SlowFuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, float minSimilarity, int prefixLength)
            : base(terms, atts, term, minSimilarity, prefixLength, false)
        {
        }

        /// <summary>
        /// Installs the automaton enumeration for <paramref name="maxEdits"/> when one exists; otherwise,
        /// only during initialisation, installs the linear fallback.
        /// </summary>
        protected override void MaxEditDistanceChanged(BytesRef lastTerm, int maxEdits, bool init)
        {
            TermsEnum newEnum = GetAutomatonEnum(maxEdits, lastTerm);
            if (newEnum != null)
            {
                SetEnum(newEnum);
            }
            else if (init)
            {
                SetEnum(new LinearFuzzyTermsEnum(this));
            }
        }

        /// <summary>
        /// Scans every term that starts with the query prefix and scores it against the remainder of the
        /// query term with a two-row Levenshtein computation.
        /// </summary>
        private class LinearFuzzyTermsEnum : FilteredTermsEnum
        {
            // Two rows of the edit-distance table: 'p' is the previous row, 'd' the one being filled.
            // They are swapped after each target character, so neither can be readonly.
            private int[] d;
            private int[] p;

            /// <summary>Code points of the query term after the prefix.</summary>
            private readonly int[] text;

            private readonly IBoostAttribute boostAtt;
            private readonly BytesRef prefixBytesRef;
            private readonly IntsRef utf32 = new IntsRef(20);
            private readonly SlowFuzzyTermsEnum parent;

            /// <summary>
            /// Splits the parent's term into prefix and remainder, sizes the distance rows to the
            /// remainder, and seeks to the prefix.
            /// </summary>
            public LinearFuzzyTermsEnum(SlowFuzzyTermsEnum parent)
                : base(parent.terms.Iterator(null))
            {
                this.parent = parent;

                boostAtt = Attributes.AddAttribute<IBoostAttribute>();

                this.text = new int[parent.termLength - parent.realPrefixLength];
                Array.Copy(parent.termText, parent.realPrefixLength, text, 0, text.Length);
                string prefix = UnicodeUtil.NewString(parent.termText, 0, parent.realPrefixLength);
                prefixBytesRef = new BytesRef(prefix);
                this.d = new int[this.text.Length + 1];
                this.p = new int[this.text.Length + 1];
                InitialSeekTerm = prefixBytesRef;
            }

            /// <summary>
            /// Ends the scan at the first term without the prefix. Otherwise accepts the term, and sets
            /// its boost, only when its similarity is strictly above the parent's minimum.
            /// </summary>
            protected override AcceptStatus Accept(BytesRef term)
            {
                if (!StringHelper.StartsWith(term, prefixBytesRef))
                {
                    return AcceptStatus.END;
                }

                UnicodeUtil.UTF8toUTF32(term, utf32);
                float similarity = Similarity(utf32.ints, parent.realPrefixLength, utf32.length - parent.realPrefixLength);
                if (similarity > parent.minSimilarity)
                {
                    boostAtt.Boost = (similarity - parent.minSimilarity) * parent.scale_factor;
                    return AcceptStatus.YES;
                }

                return AcceptStatus.NO;
            }

            /// <summary>
            /// Similarity between the query remainder and
            /// <c>target[offset .. offset + length)</c>: one minus the edit distance divided by
            /// (prefix length + shorter of the two lengths).
            /// </summary>
            /// <returns>
            /// The similarity; <see cref="float.NegativeInfinity"/> when the distance provably exceeds the
            /// allowed maximum. The value can be negative when the distance exceeds the divisor.
            /// </returns>
            private float Similarity(int[] target, int offset, int length)
            {
                int m = length;
                int n = text.Length;

                // One side is empty: the distance is simply the other side's length.
                if (n == 0)
                {
                    return parent.realPrefixLength == 0 ? 0.0f : 1.0f - ((float)m / parent.realPrefixLength);
                }

                if (m == 0)
                {
                    return parent.realPrefixLength == 0 ? 0.0f : 1.0f - ((float)n / parent.realPrefixLength);
                }

                int maxDistance = CalculateMaxDistance(m);

                // The length difference alone is a lower bound on the edit distance.
                if (maxDistance < Math.Abs(m - n))
                {
                    return float.NegativeInfinity;
                }

                // First row: turning the empty string into the first i characters costs i.
                for (int i = 0; i <= n; ++i)
                {
                    p[i] = i;
                }

                for (int j = 1; j <= m; ++j)
                {
                    int bestPossibleEditDistance = m;
                    int t_j = target[offset + j - 1];
                    d[0] = j;

                    for (int i = 1; i <= n; ++i)
                    {
                        // Minimum of left + 1, above + 1, and the diagonal plus 0 on a match or 1 otherwise.
                        if (t_j != text[i - 1])
                        {
                            d[i] = Math.Min(Math.Min(d[i - 1], p[i]), p[i - 1]) + 1;
                        }
                        else
                        {
                            d[i] = Math.Min(Math.Min(d[i - 1] + 1, p[i] + 1), p[i - 1]);
                        }

                        bestPossibleEditDistance = Math.Min(bestPossibleEditDistance, d[i]);
                    }

                    // Once past maxDistance rows, give up if even the best cell in this row is too far.
                    if (j > maxDistance && bestPossibleEditDistance > maxDistance)
                    {
                        return float.NegativeInfinity;
                    }

                    // Swap rows: the row just filled becomes the previous row.
                    int[] swap = p;
                    p = d;
                    d = swap;
                }

                // After the final swap 'p' holds the last computed row.
                return 1.0f - ((float)p[n] / (float)(parent.realPrefixLength + Math.Min(n, m)));
            }

            /// <summary>
            /// Largest edit distance worth computing for a target remainder of length <paramref name="m"/>:
            /// the parent's <c>maxEdits</c> in raw mode, otherwise the smaller of <c>maxEdits</c> and the
            /// distance implied by the minimum similarity.
            /// </summary>
            private int CalculateMaxDistance(int m)
            {
                if (parent.raw)
                {
                    return parent.maxEdits;
                }

                int comparedLength = Math.Min(text.Length, m) + parent.realPrefixLength;
                return Math.Min(parent.maxEdits, (int)((1 - parent.minSimilarity) * comparedLength));
            }
        }
    }
}
// http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b43f529b/src/core/Search/FuzzyTermsEnum.cs ---------------------------------------------------------------------- diff --git a/src/core/Search/FuzzyTermsEnum.cs b/src/core/Search/FuzzyTermsEnum.cs index dd680af..e77fe9a 100644 --- a/src/core/Search/FuzzyTermsEnum.cs +++ b/src/core/Search/FuzzyTermsEnum.cs @@ -34,7 +34,7 @@ namespace Lucene.Net.Search /// <p/>Term enumerations are always ordered by Term.compareTo(). Each term in /// the enumeration is greater than all that precede it.
/// </summary> - public sealed class FuzzyTermsEnum : TermsEnum + public class FuzzyTermsEnum : TermsEnum { private TermsEnum actualEnum; private IBoostAttribute actualBoostAtt; @@ -185,7 +185,7 @@ namespace Lucene.Net.Search } } - protected void MaxEditDistanceChanged(BytesRef lastTerm, int maxEdits, bool init) + protected virtual void MaxEditDistanceChanged(BytesRef lastTerm, int maxEdits, bool init) { TermsEnum newEnum = GetAutomatonEnum(maxEdits, lastTerm); // instead of assert, we do a hard check in case someone uses our enum directly
