using Icu.Collation;
using Lucene.Net.Collation.TokenAttributes;
using Lucene.Net.Support;
using Lucene.Net.Util;
using System.Reflection;

namespace Lucene.Net.Collation
{
    /// <summary>
    /// Converts each token into its <see cref="System.Globalization.SortKey"/>, and
    /// then encodes the bytes as an index term.
    /// </summary>
    /// <remarks>
    /// <strong>WARNING:</strong> Make sure you use exactly the same <see cref="Collator"/> at
    /// index and query time -- <see cref="System.Globalization.SortKey"/>s are only comparable
    /// when produced by the same <see cref="Collator"/>. <see cref="RuleBasedCollator"/>s are
    /// independently versioned, so it is safe to search against stored
    /// <see cref="System.Globalization.SortKey"/>s if the following are exactly the same
    /// (best practice is to store this information with the index and check that they remain
    /// the same at query time):
    /// <list type="number">
    /// <item><description>Collator version - see <see cref="Collator"/> Version</description></item>
    /// <item><description>The collation strength used - see <see cref="Collator.Strength"/></description></item>
    /// </list>
    /// <para/>
    /// <see cref="System.Globalization.SortKey"/>s generated by ICU Collators are not compatible
    /// with those generated by java.text.Collators. Specifically, if you use
    /// <see cref="ICUCollationAttributeFactory"/> to generate index terms, do not use
    /// CollationAttributeFactory on the query side, or vice versa.
    /// <para/>
    /// <see cref="ICUCollationAttributeFactory"/> is significantly faster and generates
    /// significantly shorter keys than CollationAttributeFactory. See
    /// <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
    /// >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
    /// generation timing and key length comparisons between ICU4J and
    /// java.text.Collator over several languages.
    /// </remarks>
    [ExceptionToClassNameConvention]
    public class ICUCollationAttributeFactory : AttributeSource.AttributeFactory
    {
        private readonly Collator collator;
        private readonly AttributeSource.AttributeFactory @delegate;

        /// <summary>
        /// Create an <see cref="ICUCollationAttributeFactory"/>, using
        /// <see cref="AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY"/> as the
        /// factory for all other attributes.
        /// </summary>
        /// <param name="collator"><see cref="System.Globalization.SortKey"/> generator</param>
        public ICUCollationAttributeFactory(Collator collator)
            : this(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, collator)
        {
        }

        /// <summary>
        /// Create an <see cref="ICUCollationAttributeFactory"/>, using the supplied attribute
        /// factory as the factory for all other attributes.
        /// </summary>
        /// <param name="delegate">Attribute factory</param>
        /// <param name="collator"><see cref="System.Globalization.SortKey"/> generator</param>
        public ICUCollationAttributeFactory(AttributeSource.AttributeFactory @delegate, Collator collator)
        {
            this.@delegate = @delegate;
            this.collator = collator;
        }

        public override Util.Attribute CreateAttributeInstance<T>()
        {
            // Only the term attribute is collation-aware; every other attribute
            // type is produced by the wrapped factory.
            if (typeof(T).GetTypeInfo().IsAssignableFrom(typeof(ICUCollatedTermAttribute)))
            {
                return new ICUCollatedTermAttribute(collator);
            }
            return @delegate.CreateAttributeInstance<T>();
        }
    }
}
// ---- src/Lucene.Net.Analysis.ICU/Collation/ICUCollationDocValuesField.cs ----
using Icu.Collation;
using Lucene.Net.Documents;
using Lucene.Net.Support;
using Lucene.Net.Util;
using System.Globalization;

namespace Lucene.Net.Collation
{
    /// <summary>
    /// Indexes sort keys as a single-valued <see cref="SortedDocValuesField"/>.
    /// </summary>
    /// <remarks>
    /// This is more efficient than <see cref="ICUCollationKeyAnalyzer"/> if the field
    /// only has one value: no uninversion is necessary to sort on the field,
    /// locale-sensitive range queries can still work via <see cref="Search.FieldCacheRangeFilter"/>,
    /// and the underlying data structures built at index-time are likely more efficient
    /// and use less memory than FieldCache.
    /// </remarks>
    [ExceptionToClassNameConvention]
    public sealed class ICUCollationDocValuesField : Field
    {
        private readonly string name;
        private readonly Collator collator;
        private readonly BytesRef bytes = new BytesRef();
        private SortKey key;

        /// <summary>
        /// Create a new <see cref="ICUCollationDocValuesField"/>.
        /// <para/>
        /// NOTE: you should not create a new one for each document, instead
        /// just make one and reuse it during your indexing process, setting
        /// the value via <see cref="SetStringValue(string)"/>.
        /// </summary>
        /// <param name="name">Field name.</param>
        /// <param name="collator">Collator for generating collation keys.</param>
        // TODO: can we make this trap-free? maybe just synchronize on the collator
        // instead?
        public ICUCollationDocValuesField(string name, Collator collator)
            : base(name, SortedDocValuesField.TYPE)
        {
            this.name = name;
            // Clone the collator so this field owns a private instance:
            // see http://userguide.icu-project.org/collation/architecture
            this.collator = (Collator)collator.Clone();
            m_fieldsData = bytes; // so wrong setters cannot be called
        }

        public override string Name
        {
            get { return name; }
        }

        public override void SetStringValue(string value)
        {
            // Point the reusable BytesRef directly at the freshly generated key bytes.
            key = collator.GetSortKey(value);
            bytes.Bytes = key.KeyData;
            bytes.Offset = 0;
            bytes.Length = key.KeyData.Length;
        }
    }
}

// ---- src/Lucene.Net.Analysis.ICU/Collation/ICUCollationKeyAnalyzer.cs ----
using Icu.Collation;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Support;
using Lucene.Net.Util;
using System;
using System.IO;

namespace Lucene.Net.Collation
{
    /// <summary>
    /// Configures <see cref="KeywordTokenizer"/> with <see cref="ICUCollationAttributeFactory"/>.
    /// </summary>
    /// <remarks>
    /// Converts the token into its <see cref="System.Globalization.SortKey"/>, and
    /// then encodes the <see cref="System.Globalization.SortKey"/> either directly or with
    /// <see cref="IndexableBinaryStringTools"/> (see <a href="#version">below</a>), to allow it to
    /// be stored as an index term.
    /// <para/>
    /// <strong>WARNING:</strong> Make sure you use exactly the same <see cref="Collator"/> at
    /// index and query time -- CollationKeys are only comparable when produced by
    /// the same <see cref="Collator"/>. <see cref="RuleBasedCollator"/>s are
    /// independently versioned, so it is safe to search against stored
    /// <see cref="System.Globalization.SortKey"/>s if the following are exactly the same (best
    /// practice is to store this information with the index and check that they remain the
    /// same at query time):
    /// <list type="number">
    /// <item><description>Collator version - see <see cref="Collator"/> Version</description></item>
    /// <item><description>The collation strength used - see <see cref="Collator.Strength"/></description></item>
    /// </list>
    /// <para/>
    /// <see cref="System.Globalization.SortKey"/>s generated by ICU Collators are not compatible
    /// with those generated by java.text.Collators. Specifically, if you use
    /// <see cref="ICUCollationKeyAnalyzer"/> to generate index terms, do not use
    /// CollationKeyAnalyzer on the query side, or vice versa.
    /// <para/>
    /// ICUCollationKeyAnalyzer is significantly faster and generates significantly
    /// shorter keys than CollationKeyAnalyzer. See
    /// <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
    /// >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
    /// generation timing and key length comparisons between ICU4J and
    /// java.text.Collator over several languages.
    /// <para/>
    /// <a name="version"/>
    /// You must specify the required <see cref="LuceneVersion"/>
    /// compatibility when creating <see cref="ICUCollationKeyAnalyzer"/>:
    /// <list type="bullet">
    /// <item><description>As of 4.0, <see cref="System.Globalization.SortKey"/>s are directly
    /// encoded as bytes. Previous versions will encode the bytes with
    /// <see cref="IndexableBinaryStringTools"/>.</description></item>
    /// </list>
    /// </remarks>
    [ExceptionToClassNameConvention]
    public sealed class ICUCollationKeyAnalyzer : Analyzer
    {
        private readonly Collator collator;
        private readonly ICUCollationAttributeFactory factory;
        private readonly LuceneVersion matchVersion;

        /// <summary>
        /// Create a new <see cref="ICUCollationKeyAnalyzer"/>, using the specified <paramref name="collator"/>.
        /// </summary>
        /// <param name="matchVersion">See <see cref="ICUCollationKeyAnalyzer"/>.</param>
        /// <param name="collator"><see cref="System.Globalization.SortKey"/> generator.</param>
        public ICUCollationKeyAnalyzer(LuceneVersion matchVersion, Collator collator)
        {
            this.matchVersion = matchVersion;
            this.collator = collator;
            this.factory = new ICUCollationAttributeFactory(collator);
        }

        [Obsolete("Use ICUCollationKeyAnalyzer.ICUCollationKeyAnalyzer(LuceneVersion, Collator) and specify a version instead. This ctor will be removed in Lucene 5.0")]
        public ICUCollationKeyAnalyzer(Collator collator)
            : this(LuceneVersion.LUCENE_31, collator)
        {
        }

        protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
#pragma warning disable 612, 618
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_40))
#pragma warning restore 612, 618
            {
                // 4.0+: sort keys are encoded directly as bytes by the attribute factory.
                KeywordTokenizer tokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
                return new TokenStreamComponents(tokenizer, tokenizer);
            }
            else
            {
                // Pre-4.0: keys are encoded with IndexableBinaryStringTools by the
                // (deprecated) ICUCollationKeyFilter for index compatibility.
                KeywordTokenizer tokenizer = new KeywordTokenizer(reader);
                return new TokenStreamComponents(tokenizer,
#pragma warning disable 612, 618
                    new ICUCollationKeyFilter(tokenizer, collator));
#pragma warning restore 612, 618
            }
        }
    }
}

// ---- src/Lucene.Net.Analysis.ICU/Collation/ICUCollationKeyFilter.cs ----
using Icu.Collation;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Support;
using Lucene.Net.Util;
using System;
using System.Globalization;

namespace Lucene.Net.Collation
{
    /// <summary>
    /// Converts each token into its <see cref="SortKey"/>, and
    /// then encodes the <see cref="SortKey"/> with <see cref="IndexableBinaryStringTools"/>, to
    /// allow it to be stored as an index term.
    /// </summary>
    /// <remarks>
    /// <strong>WARNING:</strong> Make sure you use exactly the same <see cref="Collator"/> at
    /// index and query time -- CollationKeys are only comparable when produced by
    /// the same <see cref="Collator"/>. <see cref="RuleBasedCollator"/>s are
    /// independently versioned, so it is safe to search against stored
    /// <see cref="System.Globalization.SortKey"/>s if the following are exactly the same (best
    /// practice is to store this information with the index and check that they remain the
    /// same at query time):
    /// <list type="number">
    /// <item><description>Collator version - see <see cref="Collator"/> Version</description></item>
    /// <item><description>The collation strength used - see <see cref="Collator.Strength"/></description></item>
    /// </list>
    /// <para/>
    /// <see cref="System.Globalization.SortKey"/>s generated by ICU Collators are not compatible
    /// with those generated by java.text.Collators. Specifically, if you use
    /// <see cref="ICUCollationKeyAnalyzer"/> to generate index terms, do not use
    /// CollationKeyAnalyzer on the query side, or vice versa.
    /// </remarks>
    [Obsolete("Use ICUCollationAttributeFactory instead, which encodes terms directly as bytes. This filter will be removed in Lucene 5.0")]
    [ExceptionToClassNameConvention]
    public sealed class ICUCollationKeyFilter : TokenFilter
    {
        private Collator collator = null;
        private SortKey reusableKey;
        private readonly ICharTermAttribute termAtt;

        /// <summary>
        /// Creates a new <see cref="ICUCollationKeyFilter"/>.
        /// </summary>
        /// <param name="input">Source token stream.</param>
        /// <param name="collator"><see cref="SortKey"/> generator.</param>
        public ICUCollationKeyFilter(TokenStream input, Collator collator)
            : base(input)
        {
            // clone the collator: see http://userguide.icu-project.org/collation/architecture
            this.collator = (Collator)collator.Clone();
            this.termAtt = AddAttribute<ICharTermAttribute>();
        }

        public override bool IncrementToken()
        {
            if (!m_input.IncrementToken())
            {
                return false;
            }
            char[] termBuffer = termAtt.Buffer;
            string termText = new string(termBuffer, 0, termAtt.Length);
            reusableKey = collator.GetSortKey(termText);
            int encodedLength = IndexableBinaryStringTools.GetEncodedLength(
                reusableKey.KeyData, 0, reusableKey.KeyData.Length);
            if (encodedLength > termBuffer.Length)
            {
                termAtt.ResizeBuffer(encodedLength);
            }
            termAtt.SetLength(encodedLength);
            // Re-read Buffer here: ResizeBuffer may have reallocated the backing array.
            IndexableBinaryStringTools.Encode(reusableKey.KeyData, 0, reusableKey.KeyData.Length,
                termAtt.Buffer, 0, encodedLength);
            return true;
        }
    }
}

// ---- src/Lucene.Net.Analysis.ICU/Collation/ICUCollationKeyFilterFactory.cs ----
using Icu.Collation;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Support;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;

namespace Lucene.Net.Collation
{
    /// <summary>
    /// Factory for <see cref="ICUCollationKeyFilter"/>.
    /// </summary>
    /// <remarks>
    /// This factory can be created in two ways:
    /// <list type="bullet">
    /// <item><description>Based upon a system collator associated with a Locale.</description></item>
    /// <item><description>Based upon a tailored ruleset.</description></item>
    /// </list>
    /// <para/>
    /// Using a System collator:
    /// <list type="bullet">
    /// <item><description>locale: RFC 3066 locale ID (mandatory)</description></item>
    /// <item><description>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)</description></item>
    /// <item><description>decomposition: 'no', or 'canonical' (optional)</description></item>
    /// </list>
    /// <para/>
    /// Using a Tailored ruleset:
    /// <list type="bullet">
    /// <item><description>custom: UTF-8 text file containing rules supported by RuleBasedCollator (mandatory)</description></item>
    /// <item><description>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)</description></item>
    /// <item><description>decomposition: 'no' or 'canonical' (optional)</description></item>
    /// </list>
    /// <para/>
    /// Expert options:
    /// <list type="bullet">
    /// <item><description>alternate: 'shifted' or 'non-ignorable'. Can be used to ignore punctuation/whitespace.</description></item>
    /// <item><description>caseLevel: 'true' or 'false'. Useful with strength=primary to ignore accents but not case.</description></item>
    /// <item><description>caseFirst: 'lower' or 'upper'. Useful to control which is sorted first when case is not ignored.</description></item>
    /// <item><description>numeric: 'true' or 'false'. Digits are sorted according to numeric value, e.g. foobar-9 sorts before foobar-10</description></item>
    /// </list>
    /// </remarks>
    /// <seealso cref="Collator"/>
    /// <seealso cref="RuleBasedCollator"/>
    /// LUCENENET NOTE: variableTop is not supported by icu.net
    [Obsolete("Use ICUCollationKeyAnalyzer instead.")]
    [ExceptionToClassNameConvention]
    public class ICUCollationKeyFilterFactory : TokenFilterFactory, IMultiTermAwareComponent, IResourceLoaderAware
    {
        private Collator collator;
        private readonly string custom;
        private readonly string localeID;
        private readonly string strength;
        private readonly string decomposition;

        private readonly string alternate;
        private readonly string caseLevel;
        private readonly string caseFirst;
        private readonly string numeric;
        //private readonly string variableTop;

        public ICUCollationKeyFilterFactory(IDictionary<string, string> args)
            : base(args)
        {
            custom = Get(args, "custom");
            localeID = Get(args, "locale");
            strength = Get(args, "strength");
            decomposition = Get(args, "decomposition");

            alternate = Get(args, "alternate");
            caseLevel = Get(args, "caseLevel");
            caseFirst = Get(args, "caseFirst");
            numeric = Get(args, "numeric");

            // LUCENENET TODO: variableTop is not supported by icu.net. Besides this,
            // it is deprecated as of ICU 53 and has been superseded by maxVariable,
            // but that feature is also not supported by icu.net at the time of this writing.
            //variableTop = Get(args, "variableTop");

            if (custom == null && localeID == null)
                throw new ArgumentException("Either custom or locale is required.");

            if (custom != null && localeID != null)
                throw new ArgumentException("Cannot specify both locale and custom. "
                    + "To tailor rules for a built-in language, see the javadocs for RuleBasedCollator. "
                    + "Then save the entire customized ruleset to a file, and use with the custom parameter");

            if (args.Count != 0)
            {
                throw new ArgumentException("Unknown parameters: " + args);
            }
        }

        public virtual void Inform(IResourceLoader loader)
        {
            if (localeID != null)
            {
                // create from a system collator, based on Locale.
                collator = CreateFromLocale(localeID);
            }
            else
            {
                // create from a custom ruleset
                collator = CreateFromRules(custom, loader);
            }

            // set the strength flag, otherwise it will be the default.
            if (strength != null)
            {
                if (strength.Equals("primary", StringComparison.OrdinalIgnoreCase))
                    collator.Strength = CollationStrength.Primary;
                else if (strength.Equals("secondary", StringComparison.OrdinalIgnoreCase))
                    collator.Strength = CollationStrength.Secondary;
                else if (strength.Equals("tertiary", StringComparison.OrdinalIgnoreCase))
                    collator.Strength = CollationStrength.Tertiary;
                else if (strength.Equals("quaternary", StringComparison.OrdinalIgnoreCase))
                    collator.Strength = CollationStrength.Quaternary;
                else if (strength.Equals("identical", StringComparison.OrdinalIgnoreCase))
                    collator.Strength = CollationStrength.Identical;
                else
                    throw new ArgumentException("Invalid strength: " + strength);
            }

            // set the decomposition flag, otherwise it will be the default.
            if (decomposition != null)
            {
                if (decomposition.Equals("no", StringComparison.OrdinalIgnoreCase))
                    collator.NormalizationMode = NormalizationMode.Off; // (Collator.NO_DECOMPOSITION);
                else if (decomposition.Equals("canonical", StringComparison.OrdinalIgnoreCase))
                    collator.NormalizationMode = NormalizationMode.On; //.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
                else
                    throw new ArgumentException("Invalid decomposition: " + decomposition);
            }

            // expert options: concrete subclasses are always a RuleBasedCollator
            RuleBasedCollator rbc = (RuleBasedCollator)collator;
            if (alternate != null)
            {
                if (alternate.Equals("shifted", StringComparison.OrdinalIgnoreCase))
                {
                    rbc.AlternateHandling = AlternateHandling.Shifted; // .setAlternateHandlingShifted(true);
                }
                else if (alternate.Equals("non-ignorable", StringComparison.OrdinalIgnoreCase))
                {
                    rbc.AlternateHandling = AlternateHandling.NonIgnorable; //.setAlternateHandlingShifted(false);
                }
                else
                {
                    throw new ArgumentException("Invalid alternate: " + alternate);
                }
            }
            if (caseLevel != null)
            {
                rbc.CaseLevel = bool.Parse(caseLevel) ? CaseLevel.On : CaseLevel.Off; // setCaseLevel(Boolean.parseBoolean(caseLevel));
            }
            if (caseFirst != null)
            {
                if (caseFirst.Equals("lower", StringComparison.OrdinalIgnoreCase))
                {
                    rbc.CaseFirst = CaseFirst.LowerFirst; //.setLowerCaseFirst(true);
                }
                else if (caseFirst.Equals("upper", StringComparison.OrdinalIgnoreCase))
                {
                    rbc.CaseFirst = CaseFirst.UpperFirst; //.setUpperCaseFirst(true);
                }
                else
                {
                    throw new ArgumentException("Invalid caseFirst: " + caseFirst);
                }
            }
            if (numeric != null)
            {
                rbc.NumericCollation = bool.Parse(numeric) ? NumericCollation.On : NumericCollation.Off; //.setNumericCollation(Boolean.parseBoolean(numeric));
            }

            // LUCENENET TODO: variableTop is not supported by icu.net. Besides this,
            // it is deprecated as of ICU 53 and has been superseded by maxVariable,
            // but that feature is also not supported by icu.net at the time of this writing.
            //if (variableTop != null)
            //{
            //    rbc.setVariableTop(variableTop);
            //}
        }

        public override TokenStream Create(TokenStream input)
        {
            return new ICUCollationKeyFilter(input, collator);
        }

        /// <summary>
        /// Create a locale from <paramref name="localeID"/>.
        /// Then return the appropriate collator for the locale.
        /// </summary>
        /// <param name="localeID"></param>
        /// <returns>The appropriate collator for the locale.</returns>
        private Collator CreateFromLocale(string localeID)
        {
            return Collator.Create(localeID);
        }

        /// <summary>
        /// Read custom rules from a file, and create a <see cref="RuleBasedCollator"/>.
        /// The file cannot support comments, as # might be in the rules!
        /// </summary>
        private Collator CreateFromRules(string fileName, IResourceLoader loader)
        {
            Stream input = null;
            try
            {
                input = loader.OpenResource(fileName);
                string rules = ToUTF8String(input);
                return new RuleBasedCollator(rules);
            }
            catch (Exception e)
            {
                // io error or invalid rules
                throw new Exception(e.ToString(), e);
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(input);
            }
        }

        public virtual AbstractAnalysisFactory GetMultiTermComponent()
        {
            return this;
        }

        /// <summary>
        /// Decodes the entire <paramref name="input"/> stream as UTF-8 text.
        /// The caller remains responsible for disposing <paramref name="input"/>.
        /// </summary>
        private string ToUTF8String(Stream input)
        {
            StringBuilder sb = new StringBuilder();
            char[] buffer = new char[1024];
            // LUCENENET: dispose the decoding reader (the original leaked it);
            // disposing it is safe because the caller disposes the stream anyway.
            using (TextReader r = IOUtils.GetDecodingReader(input, Encoding.UTF8))
            {
                int len;
                while ((len = r.Read(buffer, 0, buffer.Length)) > 0)
                {
                    sb.Append(buffer, 0, len);
                }
            }
            return sb.ToString();
        }
    }
}
a/src/Lucene.Net.Analysis.ICU/Collation/TokenAttributes/ICUCollatedTermAttributeImpl.cs b/src/Lucene.Net.Analysis.ICU/Collation/TokenAttributes/ICUCollatedTermAttributeImpl.cs new file mode 100644 index 0000000..ac1187e --- /dev/null +++ b/src/Lucene.Net.Analysis.ICU/Collation/TokenAttributes/ICUCollatedTermAttributeImpl.cs @@ -0,0 +1,39 @@ +using Icu.Collation; +using Lucene.Net.Analysis.TokenAttributes; +using Lucene.Net.Support; +using Lucene.Net.Util; +using System.Globalization; + +namespace Lucene.Net.Collation.TokenAttributes +{ + /// <summary> + /// Extension of <see cref="CharTermAttribute"/> that encodes the term + /// text as a binary Unicode collation key instead of as UTF-8 bytes. + /// </summary> + [ExceptionToClassNameConvention] + public class ICUCollatedTermAttribute : CharTermAttribute + { + private readonly Collator collator; + //private readonly RawCollationKey key = new RawCollationKey(); + private SortKey key; + + /// <summary> + /// Create a new ICUCollatedTermAttribute + /// </summary> + /// <param name="collator"><see cref="SortKey"/> generator.</param> + public ICUCollatedTermAttribute(Collator collator) + { + // clone the collator: see http://userguide.icu-project.org/collation/architecture + this.collator = (Collator)collator.Clone(); + } + + public override void FillBytesRef() + { + BytesRef bytes = this.BytesRef; + key = collator.GetSortKey(ToString()); + bytes.Bytes = key.KeyData; + bytes.Offset = 0; + bytes.Length = key.KeyData.Length; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.ICU/Lucene.Net.ICU.csproj ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.ICU/Lucene.Net.ICU.csproj b/src/Lucene.Net.ICU/Lucene.Net.ICU.csproj index b1510b9..079f5c1 100644 --- a/src/Lucene.Net.ICU/Lucene.Net.ICU.csproj +++ b/src/Lucene.Net.ICU/Lucene.Net.ICU.csproj @@ -80,6 +80,24 @@ <Compile 
Include="..\Lucene.Net.Analysis.Common\Analysis\Util\SegmentingTokenizerBase.cs"> <Link>Analysis\Util\SegmentingTokenizerBase.cs</Link> </Compile> + <Compile Include="..\Lucene.Net.Analysis.ICU\Collation\ICUCollationAttributeFactory.cs"> + <Link>Collation\ICUCollationAttributeFactory.cs</Link> + </Compile> + <Compile Include="..\Lucene.Net.Analysis.ICU\Collation\ICUCollationDocValuesField.cs"> + <Link>Collation\ICUCollationDocValuesField.cs</Link> + </Compile> + <Compile Include="..\Lucene.Net.Analysis.ICU\Collation\ICUCollationKeyAnalyzer.cs"> + <Link>Collation\ICUCollationKeyAnalyzer.cs</Link> + </Compile> + <Compile Include="..\Lucene.Net.Analysis.ICU\Collation\ICUCollationKeyFilter.cs"> + <Link>Collation\ICUCollationKeyFilter.cs</Link> + </Compile> + <Compile Include="..\Lucene.Net.Analysis.ICU\Collation\ICUCollationKeyFilterFactory.cs"> + <Link>Collation\ICUCollationKeyFilterFactory.cs</Link> + </Compile> + <Compile Include="..\Lucene.Net.Analysis.ICU\Collation\TokenAttributes\ICUCollatedTermAttributeImpl.cs"> + <Link>Collation\TokenAttributes\ICUCollatedTermAttributeImpl.cs</Link> + </Compile> <Compile Include="..\Lucene.Net.Highlighter\PostingsHighlight\DefaultPassageFormatter.cs"> <Link>Search\PostingsHighlight\DefaultPassageFormatter.cs</Link> </Compile> http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.TestFramework/Analysis/CollationTestBase.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.TestFramework/Analysis/CollationTestBase.cs b/src/Lucene.Net.TestFramework/Analysis/CollationTestBase.cs index c079a6f..e8c6cdf 100644 --- a/src/Lucene.Net.TestFramework/Analysis/CollationTestBase.cs +++ b/src/Lucene.Net.TestFramework/Analysis/CollationTestBase.cs @@ -1,10 +1,9 @@ -#if FEATURE_COLLATION using Icu.Collation; using Lucene.Net.Analysis.TokenAttributes; using Lucene.Net.Documents; using Lucene.Net.Index; using Lucene.Net.Search; -using Lucene.Net.Support; +using 
Lucene.Net.Support.Threading; using Lucene.Net.Util; using NUnit.Framework; using System; @@ -55,7 +54,7 @@ namespace Lucene.Net.Analysis /// <returns> The encoded collation key for the original String </returns> /// @deprecated only for testing deprecated filters [Obsolete("only for testing deprecated filters")] - protected internal virtual string EncodeCollationKey(sbyte[] keyBits) + protected internal virtual string EncodeCollationKey(byte[] keyBits) { // Ensure that the backing char[] array is large enough to hold the encoded // Binary String @@ -268,7 +267,7 @@ namespace Lucene.Net.Analysis } finally { - IOUtils.CloseWhileHandlingException(priorException, ts); + IOUtils.DisposeWhileHandlingException(priorException, ts); } } @@ -328,7 +327,7 @@ namespace Lucene.Net.Analysis } finally { - IOUtils.CloseWhileHandlingException(priorException, ts); + IOUtils.DisposeWhileHandlingException(priorException, ts); } } } @@ -339,5 +338,4 @@ namespace Lucene.Net.Analysis } } } -} -#endif \ No newline at end of file +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.TestFramework/project.json ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.TestFramework/project.json b/src/Lucene.Net.TestFramework/project.json index 45e8d8a..ef35176 100644 --- a/src/Lucene.Net.TestFramework/project.json +++ b/src/Lucene.Net.TestFramework/project.json @@ -27,6 +27,7 @@ } }, "dependencies": { + "icu.net": "54.1.1-alpha", "Lucene.Net.Analysis.Common": "4.8.0", "Lucene.Net.Codecs": "4.8.0", "NUnit": "3.5.0" http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Tests.Analysis.ICU/Analysis/Icu/Segmentation/TestCharArrayIterator.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.ICU/Analysis/Icu/Segmentation/TestCharArrayIterator.cs 
b/src/Lucene.Net.Tests.Analysis.ICU/Analysis/Icu/Segmentation/TestCharArrayIterator.cs new file mode 100644 index 0000000..cccd20a --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.ICU/Analysis/Icu/Segmentation/TestCharArrayIterator.cs @@ -0,0 +1,110 @@ +using Lucene.Net.Support; +using Lucene.Net.Util; +using NUnit.Framework; +using System; + +namespace Lucene.Net.Analysis.Icu.Segmentation +{ + public class TestCharArrayIterator : LuceneTestCase + { + [Test] + public void TestBasicUsage() + { + CharArrayIterator ci = new CharArrayIterator(); + ci.SetText("testing".toCharArray(), 0, "testing".Length); + assertEquals(0, ci.BeginIndex); + assertEquals(7, ci.EndIndex); + assertEquals(0, ci.Index); + assertEquals('t', ci.Current); + assertEquals('e', ci.Next()); + assertEquals('g', ci.Last()); + assertEquals('n', ci.Previous()); + assertEquals('t', ci.First()); + assertEquals(CharacterIterator.DONE, ci.Previous()); + } + + [Test] + public void TestFirst() + { + CharArrayIterator ci = new CharArrayIterator(); + ci.SetText("testing".toCharArray(), 0, "testing".Length); + ci.Next(); + // Sets the position to getBeginIndex() and returns the character at that position. + assertEquals('t', ci.First()); + assertEquals(ci.BeginIndex, ci.Index); + // or DONE if the text is empty + ci.SetText(new char[] { }, 0, 0); + assertEquals(CharacterIterator.DONE, ci.First()); + } + + [Test] + public void TestLast() + { + CharArrayIterator ci = new CharArrayIterator(); + ci.SetText("testing".toCharArray(), 0, "testing".Length); + // Sets the position to getEndIndex()-1 (getEndIndex() if the text is empty) + // and returns the character at that position. 
+ assertEquals('g', ci.Last()); + assertEquals(ci.Index, ci.EndIndex - 1); + // or DONE if the text is empty + ci.SetText(new char[] { }, 0, 0); + assertEquals(CharacterIterator.DONE, ci.Last()); + assertEquals(ci.EndIndex, ci.Index); + } + + [Test] + public void TestCurrent() + { + CharArrayIterator ci = new CharArrayIterator(); + // Gets the character at the current position (as returned by getIndex()). + ci.SetText("testing".toCharArray(), 0, "testing".Length); + assertEquals('t', ci.Current); + ci.Last(); + ci.Next(); + // or DONE if the current position is off the end of the text. + assertEquals(CharacterIterator.DONE, ci.Current); + } + + [Test] + public void TestNext() + { + CharArrayIterator ci = new CharArrayIterator(); + ci.SetText("te".toCharArray(), 0, 2); + // Increments the iterator's index by one and returns the character at the new index. + assertEquals('e', ci.Next()); + assertEquals(1, ci.Index); + // or DONE if the new position is off the end of the text range. + assertEquals(CharacterIterator.DONE, ci.Next()); + assertEquals(ci.EndIndex, ci.Index); + } + + [Test] + public void TestSetIndex() + { + CharArrayIterator ci = new CharArrayIterator(); + ci.SetText("test".toCharArray(), 0, "test".Length); + try + { + ci.SetIndex(5); + fail(); + } + catch (Exception e) + { + assertTrue(e is ArgumentException); + } + } + + [Test] + public void TestClone() + { + char[] text = "testing".toCharArray(); + CharArrayIterator ci = new CharArrayIterator(); + ci.SetText(text, 0, text.Length); + ci.Next(); + CharArrayIterator ci2 = (CharArrayIterator)ci.Clone(); + assertEquals(ci.Index, ci2.Index); + assertEquals(ci.Next(), ci2.Next()); + assertEquals(ci.Last(), ci2.Last()); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Tests.Analysis.ICU/Analysis/Icu/TestICUNormalizer2Filter.cs ---------------------------------------------------------------------- diff --git 
a/src/Lucene.Net.Tests.Analysis.ICU/Analysis/Icu/TestICUNormalizer2Filter.cs b/src/Lucene.Net.Tests.Analysis.ICU/Analysis/Icu/TestICUNormalizer2Filter.cs new file mode 100644 index 0000000..da7cf0f --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.ICU/Analysis/Icu/TestICUNormalizer2Filter.cs @@ -0,0 +1,92 @@ +// LUCENENET TODO: Port issues - missing Normalizer2 dependency from icu.net + +//using Lucene.Net.Analysis.Core; +//using Lucene.Net.Support; +//using NUnit.Framework; +//using System; + +//namespace Lucene.Net.Analysis.ICU +//{ +// /// <summary> +// /// Tests the ICUNormalizer2Filter +// /// </summary> +// public class TestICUNormalizer2Filter : BaseTokenStreamTestCase +// { +// private readonly Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => +// { +// Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); +// return new TokenStreamComponents(tokenizer, new ICUNormalizer2Filter(tokenizer)); +// }); + +// [Test] +// public void TestDefaults() +// { +// // case folding +// AssertAnalyzesTo(a, "This is a test", new String[] { "this", "is", "a", "test" }); + +// // case folding +// AssertAnalyzesTo(a, "RuÃ", new String[] { "russ" }); + +// // case folding +// AssertAnalyzesTo(a, "ÎÎΪÎΣ", new String[] { "μάÏοÏ" }); +// AssertAnalyzesTo(a, "ÎάÏοÏ", new String[] { "μάÏοÏ" }); + +// // supplementary case folding +// AssertAnalyzesTo(a, "ð", new String[] { "ð¾" }); + +// // normalization +// AssertAnalyzesTo(a, "ﴳﴺﰧ", new String[] { "Ø·Ù Ø·Ù Ø·Ù " }); + +// // removal of default ignorables +// AssertAnalyzesTo(a, "à¤à¥âष", new String[] { "à¤à¥à¤·" }); +// } + +// [Test] +// public void TestAlternate() +// { +// // Analyzer a = new Analyzer() +// //{ +// // @Override +// // public TokenStreamComponents createComponents(String fieldName, Reader reader) +// //{ +// // Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); +// // return new TokenStreamComponents(tokenizer, new 
ICUNormalizer2Filter( +// // tokenizer, +// // /* specify nfc with decompose to get nfd */ +// // Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE))); +// //} +// // }; + +// Analyzer a = Analysis.Analyzer.NewAnonymous(createComponents: (fieldName, reader) => +// { +// Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); +// return new TokenStreamComponents(tokenizer, new ICUNormalizer2Filter( +// tokenizer, +// /* specify nfc with decompose to get nfd */ +// //Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE))); +// new Normalizer2(global::Icu.Normalizer.UNormalizationMode.UNORM_NFD))); // LUCENENET NOTE: "nfc" + "DECOMPOSE" = "UNORM_NFD" +// }); + +// // decompose EAcute into E + combining Acute +// AssertAnalyzesTo(a, "\u00E9", new String[] { "\u0065\u0301" }); +// } + +// /** blast some random strings through the analyzer */ +// [Test] +// public void TestRandomStrings() +// { +// CheckRandomData(Random(), a, 1000 * RANDOM_MULTIPLIER); +// } + +// [Test] +// public void TestEmptyTerm() +// { +// Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => +// { +// Tokenizer tokenizer = new KeywordTokenizer(reader); +// return new TokenStreamComponents(tokenizer, new ICUNormalizer2Filter(tokenizer)); +// }); +// CheckOneTerm(a, "", ""); +// } +// } +//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Tests.Analysis.ICU/Analysis/Icu/TestICUNormalizer2FilterFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.ICU/Analysis/Icu/TestICUNormalizer2FilterFactory.cs b/src/Lucene.Net.Tests.Analysis.ICU/Analysis/Icu/TestICUNormalizer2FilterFactory.cs new file mode 100644 index 0000000..8ee65a1 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.ICU/Analysis/Icu/TestICUNormalizer2FilterFactory.cs @@ -0,0 +1,45 @@ +// LUCENENET TODO: Port issues - missing Normalizer2 dependency from icu.net + +//using 
NUnit.Framework; +//using System; +//using System.Collections.Generic; +//using System.IO; + +//namespace Lucene.Net.Analysis.ICU +//{ +// /// <summary> +// /// basic tests for <see cref="ICUNormalizer2FilterFactory"/> +// /// </summary> +// public class TestICUNormalizer2FilterFactory : BaseTokenStreamTestCase +// { +// /** Test nfkc_cf defaults */ +// [Test] +// public void TestDefaults() +// { +// TextReader reader = new StringReader("This is a ï¼´ï½ ï½ï½"); +// ICUNormalizer2FilterFactory factory = new ICUNormalizer2FilterFactory(new Dictionary<String, String>()); +// TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); +// stream = factory.Create(stream); +// AssertTokenStreamContents(stream, new String[] { "this", "is", "a", "test" }); +// } + +// /** Test that bogus arguments result in exception */ +// [Test] +// public void TestBogusArguments() +// { +// try +// { +// new ICUNormalizer2FilterFactory(new Dictionary<String, String>() { +// { "bogusArg", "bogusValue" } +// }); +// fail(); +// } +// catch (ArgumentException expected) +// { +// assertTrue(expected.Message.Contains("Unknown parameters")); +// } +// } + +// // TODO: add tests for different forms +// } +//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationDocValuesField.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationDocValuesField.cs b/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationDocValuesField.cs new file mode 100644 index 0000000..ecfbdf6 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationDocValuesField.cs @@ -0,0 +1,121 @@ +using Icu.Collation; +using Lucene.Net.Documents; +using Lucene.Net.Index; +using Lucene.Net.Search; +using Lucene.Net.Store; +using Lucene.Net.Util; +using NUnit.Framework; +using System; +using System.Globalization; + +namespace 
Lucene.Net.Collation +{ + /// <summary> + /// trivial test of ICUCollationDocValuesField + /// </summary> + [SuppressCodecs("Lucene3x")] + public class TestICUCollationDocValuesField : LuceneTestCase + { + [Test] + public void TestBasic() + { + Directory dir = NewDirectory(); + RandomIndexWriter iw = new RandomIndexWriter(Random(), dir, Similarity, TimeZone); + Document doc = new Document(); + Field field = NewField("field", "", StringField.TYPE_STORED); + ICUCollationDocValuesField collationField = new ICUCollationDocValuesField("collated", Collator.Create(new CultureInfo("en"))); + doc.Add(field); + doc.Add(collationField); + + field.SetStringValue("ABC"); + collationField.SetStringValue("ABC"); + iw.AddDocument(doc); + + field.SetStringValue("abc"); + collationField.SetStringValue("abc"); + iw.AddDocument(doc); + + IndexReader ir = iw.Reader; + iw.Dispose(); + + IndexSearcher @is = NewSearcher(ir); + + SortField sortField = new SortField("collated", SortFieldType.STRING); + + TopDocs td = @is.Search(new MatchAllDocsQuery(), 5, new Sort(sortField)); + assertEquals("abc", ir.Document(td.ScoreDocs[0].Doc).Get("field")); + assertEquals("ABC", ir.Document(td.ScoreDocs[1].Doc).Get("field")); + ir.Dispose(); + dir.Dispose(); + } + + [Test] + public void TestRanges() + { + Directory dir = NewDirectory(); + RandomIndexWriter iw = new RandomIndexWriter(Random(), dir, Similarity, TimeZone); + Document doc = new Document(); + Field field = NewField("field", "", StringField.TYPE_STORED); + Collator collator = Collator.Create(CultureInfo.CurrentCulture, Collator.Fallback.FallbackAllowed); // uses -Dtests.locale + if (Random().nextBoolean()) + { + collator.Strength = CollationStrength.Primary; + } + ICUCollationDocValuesField collationField = new ICUCollationDocValuesField("collated", collator); + doc.Add(field); + doc.Add(collationField); + + int numDocs = AtLeast(500); + for (int i = 0; i < numDocs; i++) + { + String value = TestUtil.RandomSimpleString(Random()); + 
field.SetStringValue(value); + collationField.SetStringValue(value); + iw.AddDocument(doc); + } + + IndexReader ir = iw.Reader; + iw.Dispose(); + IndexSearcher @is = NewSearcher(ir); + + int numChecks = AtLeast(100); + for (int i = 0; i < numChecks; i++) + { + String start = TestUtil.RandomSimpleString(Random()); + String end = TestUtil.RandomSimpleString(Random()); + BytesRef lowerVal = new BytesRef(collator.GetSortKey(start).KeyData); + BytesRef upperVal = new BytesRef(collator.GetSortKey(end).KeyData); + Query query = new ConstantScoreQuery(FieldCacheRangeFilter.NewBytesRefRange("collated", lowerVal, upperVal, true, true)); + DoTestRanges(@is, start, end, query, collator); + } + + ir.Dispose(); + dir.Dispose(); + } + + private void DoTestRanges(IndexSearcher @is, String startPoint, String endPoint, Query query, Collator collator) + { + QueryUtils.Check(query); + + // positive test + TopDocs docs = @is.Search(query, @is.IndexReader.MaxDoc); + foreach (ScoreDoc doc in docs.ScoreDocs) + { + String value = @is.Doc(doc.Doc).Get("field"); + assertTrue(collator.Compare(value, startPoint) >= 0); + assertTrue(collator.Compare(value, endPoint) <= 0); + } + + // negative test + BooleanQuery bq = new BooleanQuery(); + bq.Add(new MatchAllDocsQuery(), Occur.SHOULD); + bq.Add(query, Occur.MUST_NOT); + docs = @is.Search(bq, @is.IndexReader.MaxDoc); + foreach (ScoreDoc doc in docs.ScoreDocs) + { + String value = @is.Doc(doc.Doc).Get("field"); + assertTrue(collator.Compare(value, startPoint) < 0 || collator.Compare(value, endPoint) > 0); + } + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationKeyAnalyzer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationKeyAnalyzer.cs b/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationKeyAnalyzer.cs new file mode 100644 index 0000000..55b0b3b --- /dev/null +++ 
b/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationKeyAnalyzer.cs @@ -0,0 +1,98 @@ +using Icu.Collation; +using Lucene.Net.Analysis; +using Lucene.Net.Util; +using NUnit.Framework; +using System.Globalization; + +namespace Lucene.Net.Collation +{ + [SuppressCodecs("Lucene3x")] + public class TestICUCollationKeyAnalyzer : CollationTestBase + { + private Collator collator = Collator.Create(new CultureInfo("fa")); + private Analyzer analyzer; + + private BytesRef firstRangeBeginning; + private BytesRef firstRangeEnd; + private BytesRef secondRangeBeginning; + private BytesRef secondRangeEnd; + + public override void SetUp() + { + base.SetUp(); + + this.analyzer = new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, collator); + this.firstRangeBeginning = new BytesRef + (collator.GetSortKey(FirstRangeBeginningOriginal).KeyData); + this.firstRangeEnd = new BytesRef + (collator.GetSortKey(FirstRangeEndOriginal).KeyData); + this.secondRangeBeginning = new BytesRef + (collator.GetSortKey(SecondRangeBeginningOriginal).KeyData); + this.secondRangeEnd = new BytesRef + (collator.GetSortKey(SecondRangeEndOriginal).KeyData); + } + + [Test] + public void TestFarsiRangeFilterCollating() + { + TestFarsiRangeFilterCollating(analyzer, firstRangeBeginning, firstRangeEnd, + secondRangeBeginning, secondRangeEnd); + } + + [Test] + public void TestFarsiRangeQueryCollating() + { + TestFarsiRangeQueryCollating(analyzer, firstRangeBeginning, firstRangeEnd, + secondRangeBeginning, secondRangeEnd); + } + + [Test] + public void TestFarsiTermRangeQuery() + { + TestFarsiTermRangeQuery + (analyzer, firstRangeBeginning, firstRangeEnd, + secondRangeBeginning, secondRangeEnd); + } + + // Test using various international locales with accented characters (which + // sort differently depending on locale) + // + // Copied (and slightly modified) from + // org.apache.lucene.search.TestSort.testInternationalSort() + // + [Test] + public void TestCollationKeySort() + { + Analyzer usAnalyzer = new 
ICUCollationKeyAnalyzer + (TEST_VERSION_CURRENT, Collator.Create(new CultureInfo("en-us"), Collator.Fallback.FallbackAllowed)); + + Analyzer franceAnalyzer = new ICUCollationKeyAnalyzer + (TEST_VERSION_CURRENT, Collator.Create(new CultureInfo("fr"))); + + Analyzer swedenAnalyzer = new ICUCollationKeyAnalyzer + (TEST_VERSION_CURRENT, Collator.Create(new CultureInfo("sv-se"), Collator.Fallback.FallbackAllowed)); + + Analyzer denmarkAnalyzer = new ICUCollationKeyAnalyzer + (TEST_VERSION_CURRENT, Collator.Create(new CultureInfo("da-dk"), Collator.Fallback.FallbackAllowed)); + + // The ICU Collator and java.text.Collator implementations differ in their + // orderings - "BFJHD" is the ordering for the ICU Collator for Locale.ROOT. + TestCollationKeySort + (usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer, + "BFJHD", "ECAGI", "BJDFH", "BJDHF"); + } + + [Test] + public void TestThreadSafe() + { + int iters = 20 * RANDOM_MULTIPLIER; + for (int i = 0; i < iters; i++) + { + CultureInfo locale = new CultureInfo("de"); + Collator collator = Collator.Create(locale); + collator.Strength = CollationStrength.Identical; + AssertThreadSafe(new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, collator)); + } + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationKeyFilter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationKeyFilter.cs b/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationKeyFilter.cs new file mode 100644 index 0000000..a8a8cba --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationKeyFilter.cs @@ -0,0 +1,101 @@ +using Icu.Collation; +using Lucene.Net.Analysis; +using Lucene.Net.Analysis.Core; +using Lucene.Net.Util; +using NUnit.Framework; +using System; +using System.Globalization; +using System.IO; + +namespace Lucene.Net.Collation +{ + [Obsolete("remove 
this when ICUCollationKeyFilter is removed")] + public class TestICUCollationKeyFilter : CollationTestBase + { + private Collator collator = Collator.Create(new CultureInfo("fa")); + private Analyzer analyzer; + + private BytesRef firstRangeBeginning; + private BytesRef firstRangeEnd; + private BytesRef secondRangeBeginning; + private BytesRef secondRangeEnd; + + + public override void SetUp() + { + base.SetUp(); + + this.analyzer = new TestAnalyzer(collator); + this.firstRangeBeginning = new BytesRef(EncodeCollationKey + (collator.GetSortKey(FirstRangeBeginningOriginal).KeyData)); + this.firstRangeEnd = new BytesRef(EncodeCollationKey + (collator.GetSortKey(FirstRangeEndOriginal).KeyData)); + this.secondRangeBeginning = new BytesRef(EncodeCollationKey + (collator.GetSortKey(SecondRangeBeginningOriginal).KeyData)); + this.secondRangeEnd = new BytesRef(EncodeCollationKey + (collator.GetSortKey(SecondRangeEndOriginal).KeyData)); + } + + public sealed class TestAnalyzer : Analyzer + { + private Collator _collator; + + internal TestAnalyzer(Collator collator) + { + _collator = collator; + } + + protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) + { + Tokenizer result = new KeywordTokenizer(reader); + return new TokenStreamComponents(result, new ICUCollationKeyFilter(result, _collator)); + } + } + + [Test] + public void TestFarsiRangeFilterCollating() + { + TestFarsiRangeFilterCollating(analyzer, firstRangeBeginning, firstRangeEnd, + secondRangeBeginning, secondRangeEnd); + } + + [Test] + public void TestFarsiRangeQueryCollating() + { + TestFarsiRangeQueryCollating(analyzer, firstRangeBeginning, firstRangeEnd, + secondRangeBeginning, secondRangeEnd); + } + + [Test] + public void TestFarsiTermRangeQuery() + { + TestFarsiTermRangeQuery + (analyzer, firstRangeBeginning, firstRangeEnd, + secondRangeBeginning, secondRangeEnd); + } + + // Test using various international locales with accented characters (which + // sort 
differently depending on locale) + // + // Copied (and slightly modified) from + // org.apache.lucene.search.TestSort.testInternationalSort() + // + [Test] + public void TestCollationKeySort() + { + Analyzer usAnalyzer = new TestAnalyzer(Collator.Create(new CultureInfo("en-us"), Collator.Fallback.FallbackAllowed)); + Analyzer franceAnalyzer + = new TestAnalyzer(Collator.Create(new CultureInfo("fr"))); + Analyzer swedenAnalyzer + = new TestAnalyzer(Collator.Create(new CultureInfo("sv-se"), Collator.Fallback.FallbackAllowed)); + Analyzer denmarkAnalyzer + = new TestAnalyzer(Collator.Create(new CultureInfo("da-dk"), Collator.Fallback.FallbackAllowed)); + + // The ICU Collator and java.text.Collator implementations differ in their + // orderings - "BFJHD" is the ordering for the ICU Collator for Locale.US. + TestCollationKeySort + (usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer, + "BFJHD", "ECAGI", "BJDFH", "BJDHF"); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationKeyFilterFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationKeyFilterFactory.cs b/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationKeyFilterFactory.cs new file mode 100644 index 0000000..80aa910 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.ICU/Collation/TestICUCollationKeyFilterFactory.cs @@ -0,0 +1,331 @@ +using Icu.Collation; +using Lucene.Net.Analysis; +using Lucene.Net.Analysis.Core; +using Lucene.Net.Analysis.TokenAttributes; +using Lucene.Net.Analysis.Util; +using Lucene.Net.Support; +using NUnit.Framework; +using System; +using System.Collections.Generic; +using System.IO; +using System.Reflection; +using System.Text; + +namespace Lucene.Net.Collation +{ + [Obsolete] + public class TestICUCollationKeyFilterFactory : BaseTokenStreamTestCase + { + /// <summary> + /// Turkish has some funny 
casing. + /// This test shows how you can solve this kind of thing easily with collation. + /// Instead of using LowerCaseFilter, use a turkish collator with primary strength. + /// Then things will sort and match correctly. + /// </summary> + [Test] + public void TestBasicUsage() + { + String turkishUpperCase = "I WİLL USE TURKİSH CASING"; + String turkishLowerCase = "ı will use turkish casıng"; + TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey", + "locale", "tr", + "strength", "primary"); + TokenStream tsUpper = factory.Create( + new KeywordTokenizer(new StringReader(turkishUpperCase))); + TokenStream tsLower = factory.Create( + new KeywordTokenizer(new StringReader(turkishLowerCase))); + assertCollatesToSame(tsUpper, tsLower); + } + + /* + * Test usage of the decomposition option for unicode normalization. + */ + [Test] + public void TestNormalization() + { + String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING"; + String turkishLowerCase = "ı will use turkish casıng"; + TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey", + "locale", "tr", + "strength", "primary", + "decomposition", "canonical"); + TokenStream tsUpper = factory.Create( + new KeywordTokenizer(new StringReader(turkishUpperCase))); + TokenStream tsLower = factory.Create( + new KeywordTokenizer(new StringReader(turkishLowerCase))); + assertCollatesToSame(tsUpper, tsLower); + } + + /* + * Test secondary strength, for english case is not significant. 
+ */ + [Test] + public void TestSecondaryStrength() + { + String upperCase = "TESTING"; + String lowerCase = "testing"; + TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey", + "locale", "en", + "strength", "secondary", + "decomposition", "no"); + TokenStream tsUpper = factory.Create( + new KeywordTokenizer(new StringReader(upperCase))); + TokenStream tsLower = factory.Create( + new KeywordTokenizer(new StringReader(lowerCase))); + assertCollatesToSame(tsUpper, tsLower); + } + + /* + * Setting alternate=shifted to shift whitespace, punctuation and symbols + * to quaternary level + */ + [Test] + public void TestIgnorePunctuation() + { + String withPunctuation = "foo-bar"; + String withoutPunctuation = "foo bar"; + TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey", + "locale", "en", + "strength", "primary", + "alternate", "shifted"); + TokenStream tsPunctuation = factory.Create( + new KeywordTokenizer(new StringReader(withPunctuation))); + TokenStream tsWithoutPunctuation = factory.Create( + new KeywordTokenizer(new StringReader(withoutPunctuation))); + assertCollatesToSame(tsPunctuation, tsWithoutPunctuation); + } + + // LUCENENET TODO: variableTop is not supported by icu.net. Besides this, + // it is deprecated as of ICU 53 and has been superceded by maxVariable, + // but that feature is also not supported by icu.net at the time of this writing. 
+ + ///* + // * Setting alternate=shifted and variableTop to shift whitespace, but not + // * punctuation or symbols, to quaternary level + // */ + //[Test] + //public void TestIgnoreWhitespace() + //{ + // String withSpace = "foo bar"; + // String withoutSpace = "foobar"; + // String withPunctuation = "foo-bar"; + // TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey", + // "locale", "en", + // "strength", "primary", + // "alternate", "shifted", + // "variableTop", " "); + // TokenStream tsWithSpace = factory.Create( + // new KeywordTokenizer(new StringReader(withSpace))); + // TokenStream tsWithoutSpace = factory.Create( + // new KeywordTokenizer(new StringReader(withoutSpace))); + // assertCollatesToSame(tsWithSpace, tsWithoutSpace); + // // now assert that punctuation still matters: foo-bar < foo bar + // tsWithSpace = factory.Create( + // new KeywordTokenizer(new StringReader(withSpace))); + // TokenStream tsWithPunctuation = factory.Create( + // new KeywordTokenizer(new StringReader(withPunctuation))); + // assertCollation(tsWithPunctuation, tsWithSpace, -1); + //} + + /* + * Setting numeric to encode digits with numeric value, so that + * foobar-9 sorts before foobar-10 + */ + [Test] + public void TestNumerics() + { + String nine = "foobar-9"; + String ten = "foobar-10"; + TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey", + "locale", "en", + "numeric", "true"); + TokenStream tsNine = factory.Create( + new KeywordTokenizer(new StringReader(nine))); + TokenStream tsTen = factory.Create( + new KeywordTokenizer(new StringReader(ten))); + assertCollation(tsNine, tsTen, -1); + } + + /* + * Setting caseLevel=true to create an additional case level between + * secondary and tertiary + */ + [Test] + public void TestIgnoreAccentsButNotCase() + { + String withAccents = "résumé"; + String withoutAccents = "resume"; + String withAccentsUpperCase = "Résumé"; + String withoutAccentsUpperCase = "Resume"; + TokenFilterFactory factory = 
tokenFilterFactory("ICUCollationKey", + "locale", "en", + "strength", "primary", + "caseLevel", "true"); + TokenStream tsWithAccents = factory.Create( + new KeywordTokenizer(new StringReader(withAccents))); + TokenStream tsWithoutAccents = factory.Create( + new KeywordTokenizer(new StringReader(withoutAccents))); + assertCollatesToSame(tsWithAccents, tsWithoutAccents); + + TokenStream tsWithAccentsUpperCase = factory.Create( + new KeywordTokenizer(new StringReader(withAccentsUpperCase))); + TokenStream tsWithoutAccentsUpperCase = factory.Create( + new KeywordTokenizer(new StringReader(withoutAccentsUpperCase))); + assertCollatesToSame(tsWithAccentsUpperCase, tsWithoutAccentsUpperCase); + + // now assert that case still matters: resume < Resume + TokenStream tsLower = factory.Create( + new KeywordTokenizer(new StringReader(withoutAccents))); + TokenStream tsUpper = factory.Create( + new KeywordTokenizer(new StringReader(withoutAccentsUpperCase))); + assertCollation(tsLower, tsUpper, -1); + } + + /* + * Setting caseFirst=upper to cause uppercase strings to sort + * before lowercase ones. + */ + [Test] + public void TestUpperCaseFirst() + { + String lower = "resume"; + String upper = "Resume"; + TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey", + "locale", "en", + "strength", "tertiary", + "caseFirst", "upper"); + TokenStream tsLower = factory.Create( + new KeywordTokenizer(new StringReader(lower))); + TokenStream tsUpper = factory.Create( + new KeywordTokenizer(new StringReader(upper))); + assertCollation(tsUpper, tsLower, -1); + } + + /* + * For german, you might want oe to sort and match with o umlaut. + * This is not the default, but you can make a customized ruleset to do this. + * + * The default is DIN 5007-1, this shows how to tailor a collator to get DIN 5007-2 behavior. 
+ * http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383 + */ + [Test] + public void TestCustomRules() + { + String DIN5007_2_tailorings = + "& ae , a\u0308 & AE , A\u0308" + + "& oe , o\u0308 & OE , O\u0308" + + "& ue , u\u0308 & UE , u\u0308"; + + string baseRules = RuleBasedCollator.GetCollationRules(new Icu.Locale("de-DE"), UColRuleOption.UCOL_TAILORING_ONLY); + //RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseRules + DIN5007_2_tailorings); + + string tailoredRules = baseRules + DIN5007_2_tailorings; + // + // at this point, you would save these tailoredRules to a file, + // and use the custom parameter. + // + String germanUmlaut = "Töne"; + String germanOE = "Toene"; + IDictionary<String, String> args = new Dictionary<String, String>(); + args.Put("custom", "rules.txt"); + args.Put("strength", "primary"); + ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory(args); + factory.Inform(new StringMockResourceLoader(tailoredRules)); + TokenStream tsUmlaut = factory.Create( + new KeywordTokenizer(new StringReader(germanUmlaut))); + TokenStream tsOE = factory.Create( + new KeywordTokenizer(new StringReader(germanOE))); + + assertCollatesToSame(tsUmlaut, tsOE); + } + + private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) + { + assertCollation(stream1, stream2, 0); + } + + private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) + { + ICharTermAttribute term1 = stream1 + .AddAttribute<ICharTermAttribute>(); + ICharTermAttribute term2 = stream2 + .AddAttribute<ICharTermAttribute>(); + stream1.Reset(); + stream2.Reset(); + assertTrue(stream1.IncrementToken()); + assertTrue(stream2.IncrementToken()); + assertEquals(Number.Signum(comparison), Number.Signum(term1.toString().CompareToOrdinal(term2.toString()))); + assertFalse(stream1.IncrementToken()); + assertFalse(stream2.IncrementToken()); + stream1.End(); + stream2.End(); + stream1.Dispose(); + stream2.Dispose(); + } + + 
private class StringMockResourceLoader : IResourceLoader + { + String text; + + internal StringMockResourceLoader(String text) + { + this.text = text; + } + + public T NewInstance<T>(String cname) + { + return default(T); + } + + public Type FindType(String cname) + { + return null; + } + + public Stream OpenResource(String resource) + { + return new MemoryStream(Encoding.UTF8.GetBytes(text)); + } + } + + private TokenFilterFactory tokenFilterFactory(String name, params String[] keysAndValues) + { + Type clazz = TokenFilterFactory.LookupClass(name); + if (keysAndValues.Length % 2 == 1) + { + throw new ArgumentException("invalid keysAndValues map"); + } + IDictionary<String, String> args = new Dictionary<String, String>(); + for (int i = 0; i < keysAndValues.Length; i += 2) + { + String prev = args.Put(keysAndValues[i], keysAndValues[i + 1]); + assertNull("duplicate values for key: " + keysAndValues[i], prev); + } + String previous = args.Put("luceneMatchVersion", TEST_VERSION_CURRENT.toString()); + assertNull("duplicate values for key: luceneMatchVersion", previous); + TokenFilterFactory factory = null; + try + { + //factory = clazz.getConstructor(Map.class).newInstance(args); + factory = (TokenFilterFactory)Activator.CreateInstance(clazz, args); + } + catch (TargetInvocationException e) + { + // to simplify tests that check for illegal parameters + if (e.InnerException is ArgumentException) + { + throw (ArgumentException)e.InnerException; + } + else + { + throw e; + } + } + if (factory is IResourceLoaderAware) + { + ((IResourceLoaderAware)factory).Inform(new ClasspathResourceLoader(GetType())); + } + return factory; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Tests.ICU/Lucene.Net.Tests.ICU.csproj ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.ICU/Lucene.Net.Tests.ICU.csproj b/src/Lucene.Net.Tests.ICU/Lucene.Net.Tests.ICU.csproj index 84d660a..8f5e312 100644 --- 
a/src/Lucene.Net.Tests.ICU/Lucene.Net.Tests.ICU.csproj +++ b/src/Lucene.Net.Tests.ICU/Lucene.Net.Tests.ICU.csproj @@ -74,6 +74,18 @@ <Compile Include="..\Lucene.Net.Tests.Analysis.Common\Analysis\Util\TestSegmentingTokenizerBase.cs"> <Link>Analysis\Util\TestSegmentingTokenizerBase.cs</Link> </Compile> + <Compile Include="..\Lucene.Net.Tests.Analysis.ICU\Collation\TestICUCollationDocValuesField.cs"> + <Link>Collation\TestICUCollationDocValuesField.cs</Link> + </Compile> + <Compile Include="..\Lucene.Net.Tests.Analysis.ICU\Collation\TestICUCollationKeyAnalyzer.cs"> + <Link>Collation\TestICUCollationKeyAnalyzer.cs</Link> + </Compile> + <Compile Include="..\Lucene.Net.Tests.Analysis.ICU\Collation\TestICUCollationKeyFilter.cs"> + <Link>Collation\TestICUCollationKeyFilter.cs</Link> + </Compile> + <Compile Include="..\Lucene.Net.Tests.Analysis.ICU\Collation\TestICUCollationKeyFilterFactory.cs"> + <Link>Collation\TestICUCollationKeyFilterFactory.cs</Link> + </Compile> <Compile Include="..\Lucene.Net.Tests.Highlighter\PostingsHighlight\TestMultiTermHighlighting.cs"> <Link>Search\PostingsHighlight\TestMultiTermHighlighting.cs</Link> </Compile> http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net.Tests.ICU/project.json ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.ICU/project.json b/src/Lucene.Net.Tests.ICU/project.json index 4dc5c7a..1c3c0cd 100644 --- a/src/Lucene.Net.Tests.ICU/project.json +++ b/src/Lucene.Net.Tests.ICU/project.json @@ -26,6 +26,7 @@ }, "dependencies": { "dotnet-test-nunit-teamcity": "3.4.0-beta-3", + "icu.net": "54.1.1-alpha", "Lucene.Net": "4.8.0", "Lucene.Net.Analysis.Common": "4.8.0", "Lucene.Net.Highlighter": "4.8.0", http://git-wip-us.apache.org/repos/asf/lucenenet/blob/a4989ea1/src/Lucene.Net/Support/StringBuilderExtensions.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net/Support/StringBuilderExtensions.cs 
b/src/Lucene.Net/Support/StringBuilderExtensions.cs index 5200e02..ae68811 100644 --- a/src/Lucene.Net/Support/StringBuilderExtensions.cs +++ b/src/Lucene.Net/Support/StringBuilderExtensions.cs @@ -110,6 +110,29 @@ namespace Lucene.Net.Support }

        /// <summary>
        /// Returns the character (Unicode code point) at the specified index.
        /// The index refers to char values (Unicode code units) and ranges from 0 to Length - 1.
        /// <para/>
        /// If the char value specified at the given index is in the high-surrogate range,
        /// the following index is less than the length of this sequence, and the char value
        /// at the following index is in the low-surrogate range, then the
        /// supplementary code point corresponding to this surrogate pair is returned.
        /// Otherwise, the char value at the given index is returned.
        /// </summary>
        /// <param name="text">this <see cref="StringBuilder"/></param>
        /// <param name="index">the index to the char values</param>
        /// <returns>the code point value of the character at the index</returns>
        /// <exception cref="IndexOutOfRangeException">if the index argument is negative or not less than the length of this sequence.</exception>
        public static int CodePointAt(this StringBuilder text, int index)
        {
            if ((index < 0) || (index >= text.Length))
            {
                throw new IndexOutOfRangeException();
            }
            // PERF BUGFIX: was Character.CodePointAt(text.ToString(), index), which
            // copies the entire builder (O(Length) time and allocation) just to read
            // at most two chars. Read them directly via the indexer instead; the
            // semantics (Java Character.codePointAt) are preserved: an unpaired
            // surrogate is returned as its own code unit value.
            char high = text[index];
            if (char.IsHighSurrogate(high) && index + 1 < text.Length)
            {
                char low = text[index + 1];
                if (char.IsLowSurrogate(low))
                {
                    return char.ConvertToUtf32(high, low);
                }
            }
            return high;
        }

        /// <summary>
        /// Copies the array from the <see cref="StringBuilder"/> into a new array
        /// and returns it.
        /// </summary>
