http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Queries/Contrib.Queries.csproj ---------------------------------------------------------------------- diff --git a/src/contrib/Queries/Contrib.Queries.csproj b/src/contrib/Queries/Contrib.Queries.csproj deleted file mode 100644 index 00cea7f..0000000 --- a/src/contrib/Queries/Contrib.Queries.csproj +++ /dev/null @@ -1,168 +0,0 @@ -<?xml version="1.0" encoding="utf-8"?> -<!-- - - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. 
- ---> -<Project ToolsVersion="4.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> - <PropertyGroup> - <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration> - <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform> - <ProductVersion>9.0.21022</ProductVersion> - <SchemaVersion>2.0</SchemaVersion> - <ProjectGuid>{481CF6E3-52AF-4621-9DEB-022122079AF6}</ProjectGuid> - <AppDesignerFolder>Properties</AppDesignerFolder> - <RootNamespace>Lucene.Net.Search</RootNamespace> - <AssemblyName>Lucene.Net.Contrib.Queries</AssemblyName> - <FileAlignment>512</FileAlignment> - <FileUpgradeFlags> - </FileUpgradeFlags> - <OldToolsVersion>3.5</OldToolsVersion> - <UpgradeBackupLocation /> - <PublishUrl>publish\</PublishUrl> - <Install>true</Install> - <InstallFrom>Disk</InstallFrom> - <UpdateEnabled>false</UpdateEnabled> - <UpdateMode>Foreground</UpdateMode> - <UpdateInterval>7</UpdateInterval> - <UpdateIntervalUnits>Days</UpdateIntervalUnits> - <UpdatePeriodically>false</UpdatePeriodically> - <UpdateRequired>false</UpdateRequired> - <MapFileExtensions>true</MapFileExtensions> - <ApplicationRevision>0</ApplicationRevision> - <ApplicationVersion>1.0.0.%2a</ApplicationVersion> - <IsWebBootstrapper>false</IsWebBootstrapper> - <UseApplicationTrust>false</UseApplicationTrust> - <BootstrapperEnabled>true</BootstrapperEnabled> - </PropertyGroup> - <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "> - <TargetFrameworkVersion>v4.0</TargetFrameworkVersion> - <Framework>$(TargetFrameworkVersion.Replace("v", "NET").Replace(".", ""))</Framework> - <DebugSymbols>true</DebugSymbols> - <DebugType>full</DebugType> - <Optimize>false</Optimize> - <OutputPath>..\..\..\build\bin\contrib\Queries\$(Configuration.Replace("35", ""))\$(Framework)\</OutputPath> - <DefineConstants>DEBUG;TRACE;$(Framework)</DefineConstants> - <ErrorReport>prompt</ErrorReport> - <WarningLevel>4</WarningLevel> - <NoWarn>618</NoWarn> - 
<OutputType>Library</OutputType> - </PropertyGroup> - <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug35|AnyCPU' "> - <TargetFrameworkVersion>v3.5</TargetFrameworkVersion> - <Framework>$(TargetFrameworkVersion.Replace("v", "NET").Replace(".", ""))</Framework> - <DebugSymbols>true</DebugSymbols> - <DebugType>full</DebugType> - <Optimize>false</Optimize> - <OutputPath>..\..\..\build\bin\contrib\Queries\$(Configuration.Replace("35", ""))\$(Framework)\</OutputPath> - <DefineConstants>DEBUG;TRACE;$(Framework)</DefineConstants> - <ErrorReport>prompt</ErrorReport> - <WarningLevel>4</WarningLevel> - <NoWarn>618</NoWarn> - <OutputType>Library</OutputType> - </PropertyGroup> - <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' "> - <TargetFrameworkVersion>v4.0</TargetFrameworkVersion> - <Framework>$(TargetFrameworkVersion.Replace("v", "NET").Replace(".", ""))</Framework> - <DebugType>pdbonly</DebugType> - <Optimize>true</Optimize> - <OutputPath>..\..\..\build\bin\contrib\Queries\$(Configuration.Replace("35", ""))\$(Framework)\</OutputPath> - <DefineConstants>TRACE;$(Framework)</DefineConstants> - <ErrorReport>prompt</ErrorReport> - <WarningLevel>4</WarningLevel> - <DocumentationFile>..\..\..\build\bin\contrib\Queries\$(Configuration.Replace("35", ""))\$(Framework)\Lucene.Net.Contrib.Queries.XML</DocumentationFile> - <NoWarn>618</NoWarn> - <DebugSymbols>true</DebugSymbols> - <OutputType>Library</OutputType> - </PropertyGroup> - <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release35|AnyCPU' "> - <TargetFrameworkVersion>v3.5</TargetFrameworkVersion> - <Framework>$(TargetFrameworkVersion.Replace("v", "NET").Replace(".", ""))</Framework> - <DebugType>pdbonly</DebugType> - <Optimize>true</Optimize> - <OutputPath>..\..\..\build\bin\contrib\Queries\$(Configuration.Replace("35", ""))\$(Framework)\</OutputPath> - <DefineConstants>TRACE;$(Framework)</DefineConstants> - <ErrorReport>prompt</ErrorReport> - 
<WarningLevel>4</WarningLevel> - <DocumentationFile>..\..\..\build\bin\contrib\Queries\$(Configuration.Replace("35", ""))\$(Framework)\Lucene.Net.Contrib.Queries.XML</DocumentationFile> - <NoWarn>618</NoWarn> - <DebugSymbols>true</DebugSymbols> - <OutputType>Library</OutputType> - </PropertyGroup> - <PropertyGroup> - <SignAssembly>true</SignAssembly> - </PropertyGroup> - <PropertyGroup> - <AssemblyOriginatorKeyFile>Lucene.Net.snk</AssemblyOriginatorKeyFile> - </PropertyGroup> - <ItemGroup> - <Reference Include="System" /> - <Reference Condition="'$(Framework)' == 'NET35'" Include="System.Core" /> - </ItemGroup> - <ItemGroup> - <Compile Include="BooleanFilter.cs" /> - <Compile Include="BoostingQuery.cs" /> - <Compile Include="DuplicateFilter.cs" /> - <Compile Include="FilterClause.cs" /> - <Compile Include="FuzzyLikeThisQuery.cs" /> - <Compile Include="Similar\MoreLikeThis.cs" /> - <Compile Include="Properties\AssemblyInfo.cs" /> - <Compile Include="Similar\MoreLikeThisQuery.cs" /> - <Compile Include="Similar\SimilarityQueries.cs" /> - <Compile Include="TermsFilter.cs" /> - </ItemGroup> - <ItemGroup> - <ProjectReference Include="..\..\core\Lucene.Net.csproj"> - <Project>{5D4AD9BE-1FFB-41AB-9943-25737971BF57}</Project> - <Name>Lucene.Net</Name> - </ProjectReference> - </ItemGroup> - <ItemGroup> - <BootstrapperPackage Include=".NETFramework,Version=v4.0"> - <Visible>False</Visible> - <ProductName>Microsoft .NET Framework 4 %28x86 and x64%29</ProductName> - <Install>true</Install> - </BootstrapperPackage> - <BootstrapperPackage Include="Microsoft.Net.Client.3.5"> - <Visible>False</Visible> - <ProductName>.NET Framework 3.5 SP1 Client Profile</ProductName> - <Install>false</Install> - </BootstrapperPackage> - <BootstrapperPackage Include="Microsoft.Net.Framework.3.5.SP1"> - <Visible>False</Visible> - <ProductName>.NET Framework 3.5 SP1</ProductName> - <Install>false</Install> - </BootstrapperPackage> - <BootstrapperPackage Include="Microsoft.Windows.Installer.3.1"> - 
<Visible>False</Visible> - <ProductName>Windows Installer 3.1</ProductName> - <Install>true</Install> - </BootstrapperPackage> - </ItemGroup> - <ItemGroup> - <None Include="Lucene.Net.snk" /> - </ItemGroup> - <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" /> - <!-- To modify your build process, add your task inside one of the targets below and uncomment it. - Other similar extension points exist, see Microsoft.Common.targets. - <Target Name="BeforeBuild"> - </Target> - <Target Name="AfterBuild"> - </Target> - --> -</Project> \ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Queries/DuplicateFilter.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Queries/DuplicateFilter.cs b/src/contrib/Queries/DuplicateFilter.cs deleted file mode 100644 index d5f4745..0000000 --- a/src/contrib/Queries/DuplicateFilter.cs +++ /dev/null @@ -1,202 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; - -using Lucene.Net.Search; -using Lucene.Net.Index; -using Lucene.Net.Util; - -namespace Lucene.Net.Search -{ - public class DuplicateFilter : Filter - { - String fieldName; - - /* - * KeepMode determines which document id to consider as the master, all others being - * identified as duplicates. Selecting the "first occurrence" can potentially save on IO. - */ - int keepMode = KM_USE_FIRST_OCCURRENCE; - public static int KM_USE_FIRST_OCCURRENCE = 1; - public static int KM_USE_LAST_OCCURRENCE = 2; - - /* - * "Full" processing mode starts by setting all bits to false and only setting bits - * for documents that contain the given field and are identified as none-duplicates. 
- - * "Fast" processing sets all bits to true then unsets all duplicate docs found for the - * given field. This approach avoids the need to read TermDocs for terms that are seen - * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially - * faster approach , the downside is that bitsets produced will include bits set for - * documents that do not actually contain the field given. - * - */ - int processingMode = PM_FULL_VALIDATION; - public static int PM_FULL_VALIDATION = 1; - public static int PM_FAST_INVALIDATION = 2; - - - - public DuplicateFilter(String fieldName) : this(fieldName, KM_USE_LAST_OCCURRENCE, PM_FULL_VALIDATION) - { - } - - - public DuplicateFilter(String fieldName, int keepMode, int processingMode) - { - this.fieldName = fieldName; - this.keepMode = keepMode; - this.processingMode = processingMode; - } - - public override DocIdSet GetDocIdSet(IndexReader reader) - { - if (processingMode == PM_FAST_INVALIDATION) - { - return FastBits(reader); - } - else - { - return CorrectBits(reader); - } - } - - private OpenBitSet CorrectBits(IndexReader reader) - { - OpenBitSet bits = new OpenBitSet(reader.MaxDoc); //assume all are INvalid - Term startTerm = new Term(fieldName); - TermEnum te = reader.Terms(startTerm); - if (te != null) - { - Term currTerm = te.Term; - while ((currTerm != null) && (currTerm.Field == startTerm.Field)) //term fieldnames are interned - { - int lastDoc = -1; - //set non duplicates - TermDocs td = reader.TermDocs(currTerm); - if (td.Next()) - { - if (keepMode == KM_USE_FIRST_OCCURRENCE) - { - bits.Set(td.Doc); - } - else - { - do - { - lastDoc = td.Doc; - } while (td.Next()); - bits.Set(lastDoc); - } - } - if (!te.Next()) - { - break; - } - currTerm = te.Term; - } - } - return bits; - } - - private OpenBitSet FastBits(IndexReader reader) - { - OpenBitSet bits = new OpenBitSet(reader.MaxDoc); - bits.Set(0, reader.MaxDoc); //assume all are valid - Term startTerm = new Term(fieldName); - TermEnum te = 
reader.Terms(startTerm); - if (te != null) - { - Term currTerm = te.Term; - - while ((currTerm != null) && (currTerm.Field == startTerm.Field)) //term fieldnames are interned - { - if (te.DocFreq() > 1) - { - int lastDoc = -1; - //unset potential duplicates - TermDocs td = reader.TermDocs(currTerm); - td.Next(); - if (keepMode == KM_USE_FIRST_OCCURRENCE) - { - td.Next(); - } - do - { - lastDoc = td.Doc; - bits.Clear(lastDoc); - } while (td.Next()); - if (keepMode == KM_USE_LAST_OCCURRENCE) - { - //restore the last bit - bits.Set(lastDoc); - } - } - if (!te.Next()) - { - break; - } - currTerm = te.Term; - } - } - return bits; - } - - public string FieldName - { - get { return fieldName; } - set { this.fieldName = value; } - } - - public int KeepMode - { - get { return keepMode; } - set { this.keepMode = value; } - } - - public override bool Equals(Object obj) - { - if (this == obj) - return true; - if ((obj == null) || (obj.GetType()!= this.GetType())) - return false; - DuplicateFilter other = (DuplicateFilter)obj; - return keepMode == other.keepMode && - processingMode == other.processingMode && - (fieldName == other.fieldName || (fieldName != null && fieldName.Equals(other.fieldName))); - } - - public override int GetHashCode() - { - int hash = 217; - hash = 31 * hash + keepMode; - hash = 31 * hash + processingMode; - hash = 31 * hash + fieldName.GetHashCode(); - return hash; - } - - public int ProcessingMode - { - get { return processingMode; } - set { this.processingMode = value; } - } - } -} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Queries/FilterClause.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Queries/FilterClause.cs b/src/contrib/Queries/FilterClause.cs deleted file mode 100644 index f292914..0000000 --- a/src/contrib/Queries/FilterClause.cs +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license 
agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; - -namespace Lucene.Net.Search -{ - /* - * A Filter that wrapped with an indication of how that filter - * is used when composed with another filter. - * (Follows the boolean logic in BooleanClause for composition - * of queries.) 
- */ - [Serializable] - public class FilterClause - { - Occur occur; - Filter filter; - - /* - * Create a new FilterClause - * @param filter A Filter object containing a BitSet - * @param occur A parameter implementation indicating SHOULD, MUST or MUST NOT - */ - public FilterClause(Filter filter, Occur occur) - { - this.occur = occur; - this.filter = filter; - } - - /* - * Returns this FilterClause's filter - * @return A Filter object - */ - - public Filter Filter - { - get { return filter; } - } - - /* - * Returns this FilterClause's occur parameter - * @return An Occur object - */ - - public Occur Occur - { - get { return occur; } - } - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Queries/FuzzyLikeThisQuery.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Queries/FuzzyLikeThisQuery.cs b/src/contrib/Queries/FuzzyLikeThisQuery.cs deleted file mode 100644 index 09f1c8c..0000000 --- a/src/contrib/Queries/FuzzyLikeThisQuery.cs +++ /dev/null @@ -1,422 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -using System; -using System.Collections; -using System.Collections.Generic; -using System.Linq; -using System.Text; - -using Lucene.Net.Search; -using Lucene.Net.Index; -using Lucene.Net.Analysis; -using Lucene.Net.Analysis.Tokenattributes; -using Lucene.Net.Support; -using Lucene.Net.Util; - -namespace Lucene.Net.Search -{ - /// <summary> - /// Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms. - /// In effect this mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration - /// of fuzzy scoring factors. - /// This generally produces good results for queries where users may provide details in a number of - /// fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and - /// a fast query. - /// - /// For each source term the fuzzy variants are held in a BooleanQuery with no coord factor (because - /// we are not looking for matches on multiple variants in any one doc). Additionally, a specialized - /// TermQuery is used for variants and does not use that variant term's IDF because this would favour rarer - /// terms eg misspellings. Instead, all variants use the same IDF ranking (the one for the source query - /// term) and this is factored into the variant's boost. If the source query term does not exist in the - /// index the average IDF of the variants is used. - /// </summary> - public class FuzzyLikeThisQuery : Query - { - static Similarity sim = new DefaultSimilarity(); - Query rewrittenQuery = null; - EquatableList<FieldVals> fieldVals = new EquatableList<FieldVals>(); - Analyzer analyzer; - - ScoreTermQueue q; - int MAX_VARIANTS_PER_TERM = 50; - bool ignoreTF = false; - private int maxNumTerms; - - public override int GetHashCode() - { - int prime = 31; - int result = 1; - result = prime * result + ((analyzer == null) ? 0 : analyzer.GetHashCode()); - result = prime * result - + ((fieldVals == null) ? 
0 : fieldVals.GetHashCode()); - result = prime * result + (ignoreTF ? 1231 : 1237); - result = prime * result + maxNumTerms; - return result; - } - - public override bool Equals(Object obj) - { - if (this == obj) - return true; - if (obj == null) - return false; - if (GetType() != obj.GetType()) - return false; - FuzzyLikeThisQuery other = (FuzzyLikeThisQuery)obj; - if (analyzer == null) - { - if (other.analyzer != null) - return false; - } - else if (!analyzer.Equals(other.analyzer)) - return false; - if (fieldVals == null) - { - if (other.fieldVals != null) - return false; - } - else if (!fieldVals.Equals(other.fieldVals)) - return false; - if (ignoreTF != other.ignoreTF) - return false; - if (maxNumTerms != other.maxNumTerms) - return false; - return true; - } - - - /* - * - * <param name="maxNumTerms">The total number of terms clauses that will appear once rewritten as a BooleanQuery</param> - * <param name="analyzer"></param> - */ - public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer) - { - q = new ScoreTermQueue(maxNumTerms); - this.analyzer = analyzer; - this.maxNumTerms = maxNumTerms; - } - - class FieldVals - { - internal String queryString; - internal String fieldName; - internal float minSimilarity; - internal int prefixLength; - public FieldVals(String name, float similarity, int length, String queryString) - { - fieldName = name; - minSimilarity = similarity; - prefixLength = length; - this.queryString = queryString; - } - - public override int GetHashCode() - { - int prime = 31; - int result = 1; - result = prime * result - + ((fieldName == null) ? 0 : fieldName.GetHashCode()); - result = prime * result + BitConverter.ToInt32(BitConverter.GetBytes(minSimilarity),0); - result = prime * result + prefixLength; - result = prime * result - + ((queryString == null) ? 
0 : queryString.GetHashCode()); - return result; - } - - public override bool Equals(Object obj) - { - if (this == obj) - return true; - if (obj == null) - return false; - if (GetType() != obj.GetType()) - return false; - FieldVals other = (FieldVals)obj; - if (fieldName == null) - { - if (other.fieldName != null) - return false; - } - else if (!fieldName.Equals(other.fieldName)) - return false; - if (BitConverter.ToInt32(BitConverter.GetBytes(minSimilarity), 0) != BitConverter.ToInt32(BitConverter.GetBytes(other.minSimilarity), 0)) - //if (Float.floatToIntBits(minSimilarity) != Float.floatToIntBits(other.minSimilarity)) - return false; - if (prefixLength != other.prefixLength) - return false; - if (queryString == null) - { - if (other.queryString != null) - return false; - } - else if (!queryString.Equals(other.queryString)) - return false; - return true; - } - - - - } - - /* - * <summary>Adds user input for "fuzzification" </summary> - * <param name="queryString">The string which will be parsed by the analyzer and for which fuzzy variants will be parsed</param> - * <param name="fieldName"></param> - * <param name="minSimilarity">The minimum similarity of the term variants (see FuzzyTermEnum)</param> - * <param name="prefixLength">Length of required common prefix on variant terms (see FuzzyTermEnum)</param> - */ - public void AddTerms(String queryString, String fieldName, float minSimilarity, int prefixLength) - { - fieldVals.Add(new FieldVals(fieldName, minSimilarity, prefixLength, queryString)); - } - - - private void AddTerms(IndexReader reader, FieldVals f) - { - if (f.queryString == null) return; - TokenStream ts = analyzer.TokenStream(f.fieldName, new System.IO.StringReader(f.queryString)); - ITermAttribute termAtt = ts.AddAttribute<ITermAttribute>(); - - int corpusNumDocs = reader.NumDocs(); - Term internSavingTemplateTerm = new Term(f.fieldName); //optimization to avoid constructing new Term() objects - HashSet<string> processedTerms = new 
HashSet<string>(); - while (ts.IncrementToken()) - { - String term = termAtt.Term; - if (!processedTerms.Contains(term)) - { - processedTerms.Add(term); - ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term - float minScore = 0; - Term startTerm = internSavingTemplateTerm.CreateTerm(term); - FuzzyTermEnum fe = new FuzzyTermEnum(reader, startTerm, f.minSimilarity, f.prefixLength); - TermEnum origEnum = reader.Terms(startTerm); - int df = 0; - if (startTerm.Equals(origEnum.Term)) - { - df = origEnum.DocFreq(); //store the df so all variants use same idf - } - int numVariants = 0; - int totalVariantDocFreqs = 0; - do - { - Term possibleMatch = fe.Term; - if (possibleMatch != null) - { - numVariants++; - totalVariantDocFreqs += fe.DocFreq(); - float score = fe.Difference(); - if (variantsQ.Size() < MAX_VARIANTS_PER_TERM || score > minScore) - { - ScoreTerm st = new ScoreTerm(possibleMatch, score, startTerm); - variantsQ.InsertWithOverflow(st); - minScore = variantsQ.Top().Score; // maintain minScore - } - } - } - while (fe.Next()); - if (numVariants > 0) - { - int avgDf = totalVariantDocFreqs / numVariants; - if (df == 0)//no direct match we can use as df for all variants - { - df = avgDf; //use avg df of all variants - } - - // take the top variants (scored by edit distance) and reset the score - // to include an IDF factor then add to the global queue for ranking - // overall top query terms - int size = variantsQ.Size(); - for (int i = 0; i < size; i++) - { - ScoreTerm st = variantsQ.Pop(); - st.Score = (st.Score * st.Score) * sim.Idf(df, corpusNumDocs); - q.InsertWithOverflow(st); - } - } - } - } - } - - public override Query Rewrite(IndexReader reader) - { - if (rewrittenQuery != null) - { - return rewrittenQuery; - } - //load up the list of possible terms - foreach (FieldVals f in fieldVals) - { - AddTerms(reader, f); - } - //clear the list of fields - fieldVals.Clear(); - - BooleanQuery bq = new 
BooleanQuery(); - - - //create BooleanQueries to hold the variants for each token/field pair and ensure it - // has no coord factor - //Step 1: sort the termqueries by term/field - HashMap<Term, List<ScoreTerm>> variantQueries = new HashMap<Term, List<ScoreTerm>>(); - int size = q.Size(); - for (int i = 0; i < size; i++) - { - ScoreTerm st = q.Pop(); - var l = variantQueries[st.fuzziedSourceTerm]; - if (l == null) - { - l = new List<ScoreTerm>(); - variantQueries.Add(st.fuzziedSourceTerm, l); - } - l.Add(st); - } - //Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries - foreach(var variants in variantQueries.Values) - { - if (variants.Count == 1) - { - //optimize where only one selected variant - ScoreTerm st = variants[0]; - TermQuery tq = new FuzzyTermQuery(st.Term, ignoreTF); - tq.Boost = st.Score; // set the boost to a mix of IDF and score - bq.Add(tq, Occur.SHOULD); - } - else - { - BooleanQuery termVariants = new BooleanQuery(true); //disable coord and IDF for these term variants - foreach(ScoreTerm st in variants) - { - TermQuery tq = new FuzzyTermQuery(st.Term, ignoreTF); // found a match - tq.Boost = st.Score; // set the boost using the ScoreTerm's score - termVariants.Add(tq, Occur.SHOULD); // add to query - } - bq.Add(termVariants, Occur.SHOULD); // add to query - } - } - //TODO possible alternative step 3 - organize above booleans into a new layer of field-based - // booleans with a minimum-should-match of NumFields-1? 
- bq.Boost = Boost; - this.rewrittenQuery = bq; - return bq; - } - - //Holds info for a fuzzy term variant - initially score is set to edit distance (for ranking best - // term variants) then is reset with IDF for use in ranking against all other - // terms/fields - private class ScoreTerm - { - public Term Term { get; set; } - public float Score { get; set; } - - internal Term fuzziedSourceTerm; - - public ScoreTerm(Term term, float score, Term fuzziedSourceTerm) - { - this.Term = term; - this.Score = score; - this.fuzziedSourceTerm = fuzziedSourceTerm; - } - } - - private class ScoreTermQueue : PriorityQueue<ScoreTerm> - { - public ScoreTermQueue(int size) - { - Initialize(size); - } - - /* (non-Javadoc) - * <see cref="org.apache.lucene.util.PriorityQueue.lessThan(java.lang.Object, java.lang.Object)"/> - */ - public override bool LessThan(ScoreTerm termA, ScoreTerm termB) - { - if (termA.Score == termB.Score) - return termA.Term.CompareTo(termB.Term) > 0; - else - return termA.Score < termB.Score; - } - - } - - //overrides basic TermQuery to negate effects of IDF (idf is factored into boost of containing BooleanQuery) - private class FuzzyTermQuery : TermQuery - { - bool ignoreTF; - - public FuzzyTermQuery(Term t, bool ignoreTF): base(t) - { - this.ignoreTF = ignoreTF; - } - - public override Similarity GetSimilarity(Searcher searcher) - { - Similarity result = base.GetSimilarity(searcher); - result = new AnonymousSimilarityDelegator(this,result); - return result; - } - - class AnonymousSimilarityDelegator : SimilarityDelegator - { - FuzzyTermQuery parent = null; - public AnonymousSimilarityDelegator(FuzzyTermQuery parent,Similarity result) : base(result) - { - this.parent = parent; - } - - public override float Tf(float freq) - { - if (parent.ignoreTF) - { - return 1; //ignore tf - } - return base.Tf(freq); - } - - public override float Idf(int docFreq, int numDocs) - { - //IDF is already factored into individual term boosts - return 1; - } - } - } - - - /* 
(non-Javadoc) - * <see cref="org.apache.lucene.search.Query.toString(java.lang.String)"/> - */ - public override String ToString(String field) - { - return null; - } - - - public bool IsIgnoreTF() - { - return ignoreTF; - } - - - public void SetIgnoreTF(bool ignoreTF) - { - this.ignoreTF = ignoreTF; - } - - } -} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Queries/Properties/AssemblyInfo.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Queries/Properties/AssemblyInfo.cs b/src/contrib/Queries/Properties/AssemblyInfo.cs deleted file mode 100644 index 0938b45..0000000 --- a/src/contrib/Queries/Properties/AssemblyInfo.cs +++ /dev/null @@ -1,60 +0,0 @@ -/* - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - * -*/ - -using System.Reflection; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; -using System.Security; - -// General Information about an assembly is controlled through the following -// set of attributes. Change these attribute values to modify the information -// associated with an assembly. 
-[assembly: AssemblyTitle("Lucene.Net.Contrib.Queries")] -[assembly: AssemblyDescription("")] -[assembly: AssemblyConfiguration("")] -[assembly: AssemblyCompany("The Apache Software Foundation")] -[assembly: AssemblyProduct("Lucene.Net.Contrib.Queries")] -[assembly: AssemblyCopyright("Copyright 2006 - 2011 The Apache Software Foundation")] -[assembly: AssemblyTrademark("Copyright 2006 - 2011 The Apache Software Foundation")] -[assembly: AssemblyCulture("")] - -// Setting ComVisible to false makes the types in this assembly not visible -// to COM components. If you need to access a type in this assembly from -// COM, set the ComVisible attribute to true on that type. -[assembly: ComVisible(false)] - -// The following GUID is for the ID of the typelib if this project is exposed to COM -[assembly: Guid("6107399b-3ded-4abc-ab60-9e41754258e1")] - -// Version information for an assembly consists of the following four values: -// -// Major Version -// Minor Version -// Build Number -// Revision -// -// You can specify all the values or you can default the Build and Revision Numbers -// by using the '*' as shown below: -// [assembly: AssemblyVersion("1.0.*")] -[assembly: AssemblyVersion("3.0.3")] -[assembly: AssemblyFileVersion("3.0.3")] - -[assembly: AllowPartiallyTrustedCallers] http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Queries/README.txt ---------------------------------------------------------------------- diff --git a/src/contrib/Queries/README.txt b/src/contrib/Queries/README.txt deleted file mode 100644 index 0db8236..0000000 --- a/src/contrib/Queries/README.txt +++ /dev/null @@ -1,22 +0,0 @@ -This module contains a number of filter and query objects that add to core lucene. - -==== The "MoreLikeThis" class from the "similarity" module has been copied into here. -If people are generally happy with this move then the similarity module can be deleted, or at least a -"Moved to queries module..." note left in its place. 
- -==== FuzzyLikeThis - mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration -of fuzzy scoring factors. This generally produces good results for queries where users may provide details in a number of -fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching. The query is fast because, like -MoreLikeThis, it optimizes the query to only the most distinguishing terms. - -==== BoostingQuery - effectively demotes search results that match a given query. -Unlike the "NOT" clause, this still selects documents that contain undesirable terms, -but reduces the overall score of docs containing these terms. - -==== TermsFilter - Unlike a RangeFilter this can be used for filtering on multiple terms that are not necessarily in -a sequence. An example might be a collection of primary keys from a database query result or perhaps -a choice of "category" labels picked by the end user. - - -Mark Harwood -25/02/2006 http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Queries/Similar/MoreLikeThis.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Queries/Similar/MoreLikeThis.cs b/src/contrib/Queries/Similar/MoreLikeThis.cs deleted file mode 100644 index 62883a8..0000000 --- a/src/contrib/Queries/Similar/MoreLikeThis.cs +++ /dev/null @@ -1,945 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -using System; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using Lucene.Net.Index; -using Lucene.Net.Store; -using Lucene.Net.Support; -using Lucene.Net.Util; -using IndexReader = Lucene.Net.Index.IndexReader; -using Term = Lucene.Net.Index.Term; -using BooleanClause = Lucene.Net.Search.BooleanClause; -using DefaultSimilarity = Lucene.Net.Search.DefaultSimilarity; -using TermQuery = Lucene.Net.Search.TermQuery; -using BooleanQuery = Lucene.Net.Search.BooleanQuery; -using IndexSearcher = Lucene.Net.Search.IndexSearcher; -using Query = Lucene.Net.Search.Query; -using Analyzer = Lucene.Net.Analysis.Analyzer; -using TokenStream = Lucene.Net.Analysis.TokenStream; -using StandardAnalyzer = Lucene.Net.Analysis.Standard.StandardAnalyzer; -using Document = Lucene.Net.Documents.Document; -using Lucene.Net.Analysis.Tokenattributes; - -namespace Lucene.Net.Search.Similar -{ - /// <summary> Generate "more like this" similarity queries. - /// Based on this mail: - /// <pre> - /// Lucene does let you access the document frequency of terms, with IndexReader.DocFreq(). - /// Term frequencies can be computed by re-tokenizing the text, which, for a single document, - /// is usually fast enough. But looking up the DocFreq() of every term in the document is - /// probably too slow. - /// - /// You can use some heuristics to prune the set of terms, to avoid calling DocFreq() too much, - /// or at all. Since you're trying to maximize a tf*idf score, you're probably most interested - /// in terms with a high tf. 
Choosing a tf threshold even as low as two or three will radically - /// reduce the number of terms under consideration. Another heuristic is that terms with a - /// high idf (i.e., a low df) tend to be longer. So you could threshold the terms by the - /// number of characters, not selecting anything less than, e.g., six or seven characters. - /// With these sorts of heuristics you can usually find small set of, e.g., ten or fewer terms - /// that do a pretty good job of characterizing a document. - /// - /// It all depends on what you're trying to do. If you're trying to eek out that last percent - /// of precision and recall regardless of computational difficulty so that you can win a TREC - /// competition, then the techniques I mention above are useless. But if you're trying to - /// provide a "more like this" button on a search results page that does a decent job and has - /// good performance, such techniques might be useful. - /// - /// An efficient, effective "more-like-this" query generator would be a great contribution, if - /// anyone's interested. I'd imagine that it would take a Reader or a String (the document's - /// text), analyzer Analyzer, and return a set of representative terms using heuristics like those - /// above. The frequency and length thresholds could be parameters, etc. - /// - /// Doug - /// </pre> - /// - /// - /// <p/> - /// <h3>Initial Usage</h3> - /// - /// This class has lots of options to try to make it efficient and flexible. - /// See the body of <see cref="Main"/> below in the source for real code, or - /// if you want pseudo code, the simpliest possible usage is as follows. The bold - /// fragment is specific to this class. - /// - /// <pre> - /// - /// IndexReader ir = ... - /// IndexSearcher is = ... - /// <b> - /// MoreLikeThis mlt = new MoreLikeThis(ir); - /// Reader target = ... 
</b><em>// orig source of doc you want to find similarities to</em><b> - /// Query query = mlt.Like( target); - /// </b> - /// Hits hits = is.Search(query); - /// <em>// now the usual iteration thru 'hits' - the only thing to watch for is to make sure - /// you ignore the doc if it matches your 'target' document, as it should be similar to itself </em> - /// - /// </pre> - /// - /// Thus you: - /// <ol> - /// <li> do your normal, Lucene setup for searching,</li> - /// <li> create a MoreLikeThis,</li> - /// <li> get the text of the doc you want to find similaries to</li> - /// <li> then call one of the Like() calls to generate a similarity query</li> - /// <li> call the searcher to find the similar docs</li> - /// </ol> - /// - /// <h3>More Advanced Usage</h3> - /// - /// You may want to use <see cref="SetFieldNames"/> so you can examine - /// multiple fields (e.g. body and title) for similarity. - /// <p/> - /// - /// Depending on the size of your index and the size and makeup of your documents you - /// may want to call the other set methods to control how the similarity queries are - /// generated: - /// <ul> - /// <li> <see cref="MinTermFreq"/> </li> - /// <li> <see cref="MinDocFreq"/> </li> - /// <li> <see cref="MaxDocFreq"/></li> - /// <li> <see cref="SetMaxDocFreqPct(int)"/></li> - /// <li> <see cref="MinWordLen"/> </li> - /// <li> <see cref="MaxWordLen"/></li> - /// <li> <see cref="MaxQueryTerms"/></li> - /// <li> <see cref="MaxNumTokensParsed"/></li> - /// <li> <see cref="SetStopWords(ISet{string})"/> </li> - /// </ul> - /// - /// <hr/> - /// <pre> - /// Changes: Mark Harwood 29/02/04 - /// Some bugfixing, some refactoring, some optimisation. 
- /// - bugfix: retrieveTerms(int docNum) was not working for indexes without a termvector -added missing code - /// - bugfix: No significant terms being created for fields with a termvector - because - /// was only counting one occurence per term/field pair in calculations(ie not including frequency info from TermVector) - /// - refactor: moved common code into isNoiseWord() - /// - optimise: when no termvector support available - used maxNumTermsParsed to limit amount of tokenization - /// </pre> - /// </summary> - public sealed class MoreLikeThis - { - - /// <summary> Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support.</summary> - /// <seealso cref="MaxNumTokensParsed"> - /// </seealso> - public const int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000; - - - /// <summary> Default analyzer to parse source doc with.</summary> - /// <seealso cref="Analyzer"> - /// </seealso> - public static readonly Analyzer DEFAULT_ANALYZER = new StandardAnalyzer(Util.Version.LUCENE_CURRENT); - - /// <summary> Ignore terms with less than this frequency in the source doc.</summary> - /// <seealso cref="MinTermFreq"> - /// </seealso> - /// <seealso cref="MinTermFreq"> - /// </seealso> - public const int DEFAULT_MIN_TERM_FREQ = 2; - - /// <summary> Ignore words which do not occur in at least this many docs.</summary> - /// <seealso cref="MinDocFreq"> - /// </seealso> - /// <seealso cref="MinDocFreq"> - /// </seealso> - public const int DEFAULT_MIN_DOC_FREQ = 5; - - /// <summary> - /// Ignore words wich occur in more than this many docs - /// </summary> - /// <seealso cref="MaxDocFreq"/> - /// <seealso cref="MaxDocFreq"/> - public const int DEFAULT_MAX_DOC_FREQ = int.MaxValue; - - /// <summary> Boost terms in query based on score.</summary> - /// <seealso cref="Boost"> - /// </seealso> - /// <seealso cref="Boost"> - /// </seealso> - public const bool DEFAULT_BOOST = false; - - /// <summary> Default field names. 
Null is used to specify that the field names should be looked - /// up at runtime from the provided reader. - /// </summary> - public static readonly System.String[] DEFAULT_FIELD_NAMES = new System.String[] { "contents" }; - - /// <summary> Ignore words less than this length or if 0 then this has no effect.</summary> - /// <seealso cref="MinWordLen"> - /// </seealso> - /// <seealso cref="MinWordLen"> - /// </seealso> - public const int DEFAULT_MIN_WORD_LENGTH = 0; - - /// <summary> Ignore words greater than this length or if 0 then this has no effect.</summary> - /// <seealso cref="MaxWordLen"> - /// </seealso> - /// <seealso cref="MaxWordLen"> - /// </seealso> - public const int DEFAULT_MAX_WORD_LENGTH = 0; - - /// <summary> Default set of stopwords. - /// If null means to allow stop words. - /// - /// </summary> - /// <seealso cref="SetStopWords"> - /// </seealso> - /// <seealso cref="GetStopWords"> - /// </seealso> - public static readonly ISet<string> DEFAULT_STOP_WORDS = null; - - /// <summary> Current set of stop words.</summary> - private ISet<string> stopWords = DEFAULT_STOP_WORDS; - - /// <summary> Return a Query with no more than this many terms. - /// - /// </summary> - /// <seealso cref="BooleanQuery.MaxClauseCount"> - /// </seealso> - /// <seealso cref="MaxQueryTerms"> - /// </seealso> - /// <seealso cref="MaxQueryTerms"> - /// </seealso> - public const int DEFAULT_MAX_QUERY_TERMS = 25; - - /// <summary> Analyzer that will be used to parse the doc.</summary> - private Analyzer analyzer = DEFAULT_ANALYZER; - - /// <summary> Ignore words less freqent that this.</summary> - private int minTermFreq = DEFAULT_MIN_TERM_FREQ; - - /// <summary> Ignore words which do not occur in at least this many docs.</summary> - private int minDocFreq = DEFAULT_MIN_DOC_FREQ; - - /// <summary> - /// Ignore words which occur in more than this many docs. 
- /// </summary> - private int maxDocfreq = DEFAULT_MAX_DOC_FREQ; - - /// <summary> Should we apply a boost to the Query based on the scores?</summary> - private bool boost = DEFAULT_BOOST; - - /// <summary> Field name we'll analyze.</summary> - private System.String[] fieldNames = DEFAULT_FIELD_NAMES; - - /// <summary> The maximum number of tokens to parse in each example doc field that is not stored with TermVector support</summary> - private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED; - - /// <summary> Ignore words if less than this len.</summary> - private int minWordLen = DEFAULT_MIN_WORD_LENGTH; - - /// <summary> Ignore words if greater than this len.</summary> - private int maxWordLen = DEFAULT_MAX_WORD_LENGTH; - - /// <summary> Don't return a query longer than this.</summary> - private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS; - - /// <summary> For idf() calculations.</summary> - private Lucene.Net.Search.Similarity similarity = null; - - /// <summary> IndexReader to use</summary> - private IndexReader ir; - - /// <summary> Boost factor to use when boosting the terms </summary> - private float boostFactor = 1; - - /// <summary> - /// Gets or sets the boost factor used when boosting terms - /// </summary> - public float BoostFactor - { - get { return boostFactor; } - set { this.boostFactor = value; } - } - - /// <summary> Constructor requiring an IndexReader.</summary> - public MoreLikeThis(IndexReader ir) : this(ir,new DefaultSimilarity()) - { - } - - public MoreLikeThis(IndexReader ir, Lucene.Net.Search.Similarity sim) - { - this.ir = ir; - this.similarity = sim; - } - - public Similarity Similarity - { - get { return similarity; } - set { this.similarity = value; } - } - - /// <summary> Gets or sets the analyzer used to parse source doc with. The default analyzer - /// is the <see cref="DEFAULT_ANALYZER"/>. 
- /// <para /> - /// An analyzer is not required for generating a query with the - /// <see cref="Like(int)"/> method, all other 'like' methods require an analyzer. - /// </summary> - /// <value> the analyzer that will be used to parse source doc with. </value> - /// <seealso cref="DEFAULT_ANALYZER"> - /// </seealso> - public Analyzer Analyzer - { - get { return analyzer; } - set { this.analyzer = value; } - } - - /// <summary> - /// Gets or sets the frequency below which terms will be ignored in the source doc. The default - /// frequency is the <see cref="DEFAULT_MIN_TERM_FREQ"/>. - /// </summary> - public int MinTermFreq - { - get { return minTermFreq; } - set { this.minTermFreq = value; } - } - - /// <summary> - /// Gets or sets the frequency at which words will be ignored which do not occur in at least this - /// many docs. The default frequency is <see cref="DEFAULT_MIN_DOC_FREQ"/>. - /// </summary> - public int MinDocFreq - { - get { return minDocFreq; } - set { this.minDocFreq = value; } - } - - /// <summary> - /// Gets or sets the maximum frequency in which words may still appear. - /// Words that appear in more than this many docs will be ignored. The default frequency is - /// <see cref="DEFAULT_MAX_DOC_FREQ"/> - /// </summary> - public int MaxDocFreq - { - get { return this.maxDocfreq; } - set { this.maxDocfreq = value; } - } - - /// <summary> - /// Set the maximum percentage in which words may still appear. Words that appear - /// in more than this many percent of all docs will be ignored. - /// </summary> - /// <param name="maxPercentage"> - /// the maximum percentage of documents (0-100) that a term may appear - /// in to be still considered relevant - /// </param> - public void SetMaxDocFreqPct(int maxPercentage) - { - this.maxDocfreq = maxPercentage * ir.NumDocs() / 100; - } - - /// <summary> Gets or sets a boolean indicating whether to boost terms in query based - /// on "score" or not. The default is <see cref="DEFAULT_BOOST"/>. 
- /// </summary> - public bool Boost - { - get { return boost; } - set { this.boost = value; } - } - - /// <summary> Returns the field names that will be used when generating the 'More Like This' query. - /// The default field names that will be used is <see cref="DEFAULT_FIELD_NAMES"/>. - /// - /// </summary> - /// <returns> the field names that will be used when generating the 'More Like This' query. - /// </returns> - public System.String[] GetFieldNames() - { - return fieldNames; - } - - /// <summary> Sets the field names that will be used when generating the 'More Like This' query. - /// Set this to null for the field names to be determined at runtime from the IndexReader - /// provided in the constructor. - /// - /// </summary> - /// <param name="fieldNames">the field names that will be used when generating the 'More Like This' - /// query. - /// </param> - public void SetFieldNames(System.String[] fieldNames) - { - this.fieldNames = fieldNames; - } - - /// <summary> - /// Gets or sets the minimum word length below which words will be ignored. - /// Set this to 0 for no minimum word length. The default is <see cref="DEFAULT_MIN_WORD_LENGTH"/>. - /// </summary> - public int MinWordLen - { - get { return minWordLen; } - set { this.minWordLen = value; } - } - - /// <summary> - /// Gets or sets the maximum word length above which words will be ignored. Set this to 0 for no - /// maximum word length. The default is <see cref="DEFAULT_MAX_WORD_LENGTH"/>. - /// </summary> - public int MaxWordLen - { - get { return maxWordLen; } - set { this.maxWordLen = value; } - } - - /// <summary> Set the set of stopwords. - /// Any word in this set is considered "uninteresting" and ignored. - /// Even if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore them, as - /// for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting". 
- /// - /// </summary> - /// <param name="stopWords">set of stopwords, if null it means to allow stop words - /// - /// </param> - /// <seealso cref="Lucene.Net.Analysis.StopFilter.MakeStopSet(string[])"> - /// </seealso> - /// <seealso cref="GetStopWords"> - /// </seealso> - public void SetStopWords(ISet<string> stopWords) - { - this.stopWords = stopWords; - } - - /// <summary> Get the current stop words being used.</summary> - /// <seealso cref="SetStopWords"> - /// </seealso> - public ISet<string> GetStopWords() - { - return stopWords; - } - - - /// <summary> - /// Gets or sets the maximum number of query terms that will be included in any generated query. - /// The default is <see cref="DEFAULT_MAX_QUERY_TERMS"/>. - /// </summary> - public int MaxQueryTerms - { - get { return maxQueryTerms; } - set { this.maxQueryTerms = value; } - } - - /// <summary> - /// Gets or sets the maximum number of tokens to parse in each example doc - /// field that is not stored with TermVector support - /// </summary> - /// <seealso cref="DEFAULT_MAX_NUM_TOKENS_PARSED" /> - public int MaxNumTokensParsed - { - get { return maxNumTokensParsed; } - set { maxNumTokensParsed = value; } - } - - /// <summary>Return a query that will return docs like the passed lucene document ID.</summary> - /// <param name="docNum">the documentID of the lucene doc to generate the 'More Like This" query for.</param> - /// <returns> a query that will return docs like the passed lucene document ID.</returns> - public Query Like(int docNum) - { - if (fieldNames == null) - { - // gather list of valid fields from lucene - ICollection<string> fields = ir.GetFieldNames(IndexReader.FieldOption.INDEXED); - fieldNames = fields.ToArray(); - } - - return CreateQuery(RetrieveTerms(docNum)); - } - - /// <summary> Return a query that will return docs like the passed file. - /// - /// </summary> - /// <returns> a query that will return docs like the passed file. 
- /// </returns> - public Query Like(System.IO.FileInfo f) - { - if (fieldNames == null) - { - // gather list of valid fields from lucene - ICollection<string> fields = ir.GetFieldNames(IndexReader.FieldOption.INDEXED); - fieldNames = fields.ToArray(); - } - - return Like(new System.IO.StreamReader(f.FullName, System.Text.Encoding.Default)); - } - - /// <summary> Return a query that will return docs like the passed URL. - /// - /// </summary> - /// <returns> a query that will return docs like the passed URL. - /// </returns> - public Query Like(System.Uri u) - { - return Like(new System.IO.StreamReader((System.Net.WebRequest.Create(u)).GetResponse().GetResponseStream(), System.Text.Encoding.Default)); - } - - /// <summary> Return a query that will return docs like the passed stream. - /// - /// </summary> - /// <returns> a query that will return docs like the passed stream. - /// </returns> - public Query Like(System.IO.Stream is_Renamed) - { - return Like(new System.IO.StreamReader(is_Renamed, System.Text.Encoding.Default)); - } - - /// <summary> Return a query that will return docs like the passed Reader. - /// - /// </summary> - /// <returns> a query that will return docs like the passed Reader. 
- /// </returns> - public Query Like(System.IO.TextReader r) - { - return CreateQuery(RetrieveTerms(r)); - } - - /// <summary> Create the More like query from a PriorityQueue</summary> - private Query CreateQuery(PriorityQueue<object[]> q) - { - BooleanQuery query = new BooleanQuery(); - System.Object cur; - int qterms = 0; - float bestScore = 0; - - while (((cur = q.Pop()) != null)) - { - System.Object[] ar = (System.Object[])cur; - TermQuery tq = new TermQuery(new Term((System.String)ar[1], (System.String)ar[0])); - - if (boost) - { - if (qterms == 0) - { - bestScore = (float)ar[2]; - } - float myScore = (float)ar[2]; - - tq.Boost = boostFactor * myScore / bestScore; - } - - try - { - query.Add(tq, Occur.SHOULD); - } - catch (BooleanQuery.TooManyClauses ignore) - { - break; - } - - qterms++; - if (maxQueryTerms > 0 && qterms >= maxQueryTerms) - { - break; - } - } - - return query; - } - - /// <summary> Create a PriorityQueue from a word->tf map. - /// - /// </summary> - /// <param name="words">a map of words keyed on the word(String) with Int objects as the values. - /// </param> - private PriorityQueue<object[]> CreateQueue(IDictionary<string,Int> words) - { - // have collected all words in doc and their freqs - int numDocs = ir.NumDocs(); - FreqQ res = new FreqQ(words.Count); // will order words by score - - var it = words.Keys.GetEnumerator(); - while (it.MoveNext()) - { - // for every word - System.String word = it.Current; - - int tf = words[word].x; // term freq in the source doc - if (minTermFreq > 0 && tf < minTermFreq) - { - continue; // filter out words that don't occur enough times in the source - } - - // go through all the fields and find the largest document frequency - System.String topField = fieldNames[0]; - int docFreq = 0; - for (int i = 0; i < fieldNames.Length; i++) - { - int freq = ir.DocFreq(new Term(fieldNames[i], word)); - topField = (freq > docFreq) ? fieldNames[i] : topField; - docFreq = (freq > docFreq) ? 
freq : docFreq; - } - - if (minDocFreq > 0 && docFreq < minDocFreq) - { - continue; // filter out words that don't occur in enough docs - } - - if (docFreq > maxDocfreq) - { - continue; // filter out words that occur in too many docs - } - - if (docFreq == 0) - { - continue; // index update problem? - } - - float idf = similarity.Idf(docFreq, numDocs); - float score = tf * idf; - - // only really need 1st 3 entries, other ones are for troubleshooting - res.InsertWithOverflow(new System.Object[] { word, topField, score, idf, docFreq, tf }); - } - return res; - } - - /// <summary> Describe the parameters that control how the "more like this" query is formed.</summary> - public System.String DescribeParams() - { - System.Text.StringBuilder sb = new System.Text.StringBuilder(); - sb.Append("\t" + "maxQueryTerms : " + maxQueryTerms + "\n"); - sb.Append("\t" + "minWordLen : " + minWordLen + "\n"); - sb.Append("\t" + "maxWordLen : " + maxWordLen + "\n"); - sb.Append("\t" + "fieldNames : \""); - System.String delim = ""; - for (int i = 0; i < fieldNames.Length; i++) - { - System.String fieldName = fieldNames[i]; - sb.Append(delim).Append(fieldName); - delim = ", "; - } - sb.Append("\n"); - sb.Append("\t" + "boost : " + boost + "\n"); - sb.Append("\t" + "minTermFreq : " + minTermFreq + "\n"); - sb.Append("\t" + "minDocFreq : " + minDocFreq + "\n"); - return sb.ToString(); - } - - /// <summary> Test driver. - /// Pass in "-i INDEX" and then either "-fn FILE" or "-url URL". 
- /// </summary> - [STAThread] - public static void Main(System.String[] a) - { - System.String indexName = "localhost_index"; - System.String fn = "c:/Program Files/Apache Group/Apache/htdocs/manual/vhosts/index.html.en"; - System.Uri url = null; - for (int i = 0; i < a.Length; i++) - { - if (a[i].Equals("-i")) - { - indexName = a[++i]; - } - else if (a[i].Equals("-f")) - { - fn = a[++i]; - } - else if (a[i].Equals("-url")) - { - url = new System.Uri(a[++i]); - } - } - - System.IO.StreamWriter temp_writer; - temp_writer = new System.IO.StreamWriter(System.Console.OpenStandardOutput(), System.Console.Out.Encoding); - temp_writer.AutoFlush = true; - System.IO.StreamWriter o = temp_writer; - FSDirectory dir = FSDirectory.Open(new DirectoryInfo(indexName)); - IndexReader r = IndexReader.Open(dir, true); - o.WriteLine("Open index " + indexName + " which has " + r.NumDocs() + " docs"); - - MoreLikeThis mlt = new MoreLikeThis(r); - - o.WriteLine("Query generation parameters:"); - o.WriteLine(mlt.DescribeParams()); - o.WriteLine(); - - Query query = null; - if (url != null) - { - o.WriteLine("Parsing URL: " + url); - query = mlt.Like(url); - } - else if (fn != null) - { - o.WriteLine("Parsing file: " + fn); - query = mlt.Like(new System.IO.FileInfo(fn)); - } - - o.WriteLine("q: " + query); - o.WriteLine(); - IndexSearcher searcher = new IndexSearcher(dir, true); - - TopDocs hits = searcher.Search(query, null, 25); - int len = hits.TotalHits; - o.WriteLine("found: " + len + " documents matching"); - o.WriteLine(); - ScoreDoc[] scoreDocs = hits.ScoreDocs; - for (int i = 0; i < System.Math.Min(25, len); i++) - { - Document d = searcher.Doc(scoreDocs[i].Doc); - System.String summary = d.Get("summary"); - o.WriteLine("score : " + scoreDocs[i].Score); - o.WriteLine("url : " + d.Get("url")); - o.WriteLine("\ttitle : " + d.Get("title")); - if (summary != null) - o.WriteLine("\tsummary: " + d.Get("summary")); - o.WriteLine(); - } - } - - /// <summary> Find words for a 
more-like-this query former. - /// - /// </summary> - /// <param name="docNum">the id of the lucene document from which to find terms - /// </param> - private PriorityQueue<object[]> RetrieveTerms(int docNum) - { - IDictionary<string,Int> termFreqMap = new HashMap<string,Int>(); - for (int i = 0; i < fieldNames.Length; i++) - { - System.String fieldName = fieldNames[i]; - ITermFreqVector vector = ir.GetTermFreqVector(docNum, fieldName); - - // field does not store term vector info - if (vector == null) - { - Document d = ir.Document(docNum); - System.String[] text = d.GetValues(fieldName); - if (text != null) - { - for (int j = 0; j < text.Length; j++) - { - AddTermFrequencies(new System.IO.StringReader(text[j]), termFreqMap, fieldName); - } - } - } - else - { - AddTermFrequencies(termFreqMap, vector); - } - } - - return CreateQueue(termFreqMap); - } - - /// <summary> Adds terms and frequencies found in vector into the Map termFreqMap</summary> - /// <param name="termFreqMap">a Map of terms and their frequencies - /// </param> - /// <param name="vector">List of terms and their frequencies for a doc/field - /// </param> - private void AddTermFrequencies(IDictionary<string, Int> termFreqMap, ITermFreqVector vector) - { - System.String[] terms = vector.GetTerms(); - int[] freqs = vector.GetTermFrequencies(); - for (int j = 0; j < terms.Length; j++) - { - System.String term = terms[j]; - - if (IsNoiseWord(term)) - { - continue; - } - // increment frequency - Int cnt = termFreqMap[term]; - if (cnt == null) - { - cnt = new Int(); - termFreqMap[term] = cnt; - cnt.x = freqs[j]; - } - else - { - cnt.x += freqs[j]; - } - } - } - /// <summary> Adds term frequencies found by tokenizing text from reader into the Map words</summary> - /// <param name="r">a source of text to be tokenized - /// </param> - /// <param name="termFreqMap">a Map of terms and their frequencies - /// </param> - /// <param name="fieldName">Used by analyzer for any special per-field analysis - /// </param> 
- private void AddTermFrequencies(System.IO.TextReader r, IDictionary<string,Int> termFreqMap, System.String fieldName) - { - TokenStream ts = analyzer.TokenStream(fieldName, r); - int tokenCount=0; - // for every token - ITermAttribute termAtt = ts.AddAttribute<ITermAttribute>(); - - while (ts.IncrementToken()) { - string word = termAtt.Term; - tokenCount++; - if(tokenCount>maxNumTokensParsed) - { - break; - } - if(IsNoiseWord(word)){ - continue; - } - - // increment frequency - Int cnt = termFreqMap[word]; - if (cnt == null) { - termFreqMap[word] = new Int(); - } - else { - cnt.x++; - } - } - } - - - /// <summary>determines if the passed term is likely to be of interest in "more like" comparisons - /// - /// </summary> - /// <param name="term">The word being considered - /// </param> - /// <returns> true if should be ignored, false if should be used in further analysis - /// </returns> - private bool IsNoiseWord(System.String term) - { - int len = term.Length; - if (minWordLen > 0 && len < minWordLen) - { - return true; - } - if (maxWordLen > 0 && len > maxWordLen) - { - return true; - } - if (stopWords != null && stopWords.Contains(term)) - { - return true; - } - return false; - } - - - /// <summary> Find words for a more-like-this query former. - /// The result is a priority queue of arrays with one entry for <b>every word</b> in the document. - /// Each array has 6 elements. - /// The elements are: - /// <ol> - /// <li> The word (String)</li> - /// <li> The top field that this word comes from (String)</li> - /// <li> The score for this word (Float)</li> - /// <li> The IDF value (Float)</li> - /// <li> The frequency of this word in the index (Integer)</li> - /// <li> The frequency of this word in the source document (Integer)</li> - /// </ol> - /// This is a somewhat "advanced" routine, and in general only the 1st entry in the array is of interest. - /// This method is exposed so that you can identify the "interesting words" in a document. 
- /// For an easier method to call see <see cref="RetrieveInterestingTerms(System.IO.TextReader)"/>. - /// - /// </summary> - /// <param name="r">the reader that has the content of the document - /// </param> - /// <returns> the most intresting words in the document ordered by score, with the highest scoring, or best entry, first - /// - /// </returns> - /// <seealso cref="RetrieveInterestingTerms(System.IO.TextReader)"> - /// </seealso> - public PriorityQueue<object[]> RetrieveTerms(System.IO.TextReader r) - { - IDictionary<string, Int> words = new HashMap<string,Int>(); - for (int i = 0; i < fieldNames.Length; i++) - { - System.String fieldName = fieldNames[i]; - AddTermFrequencies(r, words, fieldName); - } - return CreateQueue(words); - } - - - public System.String[] RetrieveInterestingTerms(int docNum) - { - List<object> al = new List<object>(maxQueryTerms); - PriorityQueue<object[]> pq = RetrieveTerms(docNum); - System.Object cur; - int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller... - // we just want to return the top words - while (((cur = pq.Pop()) != null) && lim-- > 0) - { - System.Object[] ar = (System.Object[])cur; - al.Add(ar[0]); // the 1st entry is the interesting word - } - //System.String[] res = new System.String[al.Count]; - //return al.toArray(res); - return al.Select(x => x.ToString()).ToArray(); - } - - /// <summary> Convenience routine to make it easy to return the most interesting words in a document. - /// More advanced users will call <see cref="RetrieveTerms(System.IO.TextReader)"/> directly. 
- /// </summary> - /// <param name="r">the source document - /// </param> - /// <returns> the most interesting words in the document - /// - /// </returns> - /// <seealso cref="RetrieveTerms(System.IO.TextReader)"> - /// </seealso> - /// <seealso cref="MaxQueryTerms"> - /// </seealso> - public System.String[] RetrieveInterestingTerms(System.IO.TextReader r) - { - List<object> al = new List<object>(maxQueryTerms); - PriorityQueue<object[]> pq = RetrieveTerms(r); - System.Object cur; - int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller... - // we just want to return the top words - while (((cur = pq.Pop()) != null) && lim-- > 0) - { - System.Object[] ar = (System.Object[])cur; - al.Add(ar[0]); // the 1st entry is the interesting word - } - //System.String[] res = new System.String[al.Count]; - // return (System.String[]) SupportClass.ICollectionSupport.ToArray(al, res); - return al.Select(x => x.ToString()).ToArray(); - } - - /// <summary> PriorityQueue that orders words by score.</summary> - private class FreqQ : PriorityQueue<object[]> - { - internal FreqQ(int s) - { - Initialize(s); - } - - override public bool LessThan(System.Object[] aa, System.Object[] bb) - { - float fa = (float)aa[2]; - float fb = (float)bb[2]; - return (float)fa > (float)fb; - } - } - - /// <summary> Use for frequencies and to avoid renewing Integers.</summary> - private class Int - { - internal int x; - - internal Int() - { - x = 1; - } - } - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Queries/Similar/MoreLikeThisQuery.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Queries/Similar/MoreLikeThisQuery.cs b/src/contrib/Queries/Similar/MoreLikeThisQuery.cs deleted file mode 100644 index ae071c0..0000000 --- a/src/contrib/Queries/Similar/MoreLikeThisQuery.cs +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Licensed to 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

using Lucene.Net.Search;
using Lucene.Net.Analysis;
using Lucene.Net.Index;

namespace Lucene.Net.Search.Similar
{
    /// <summary>
    /// A simple wrapper for MoreLikeThis for use in scenarios where a Query object
    /// is required, e.g. in custom QueryParser extensions. At Query.Rewrite() time
    /// the reader is used to construct the actual MoreLikeThis object and obtain
    /// the real Query object.
    /// </summary>
    public class MoreLikeThisQuery : Query
    {
        private String likeText;
        private String[] moreLikeFields;
        private Analyzer analyzer;
        float percentTermsToMatch = 0.3f;
        int minTermFrequency = 1;
        int maxQueryTerms = 5;
        ISet<string> stopWords = null;
        int minDocFreq = -1;


        /// <summary>Creates a new query over the given example text.</summary>
        /// <param name="likeText">the example text to find similar documents for</param>
        /// <param name="moreLikeFields">the index fields to match against</param>
        /// <param name="analyzer">the analyzer used to tokenize <paramref name="likeText"/></param>
        public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer)
        {
            this.likeText = likeText;
            this.moreLikeFields = moreLikeFields;
            this.analyzer = analyzer;
        }

        /// <summary>
        /// Builds the real MoreLikeThis query against the supplied reader, copying
        /// every configured knob onto the MoreLikeThis instance first.
        /// </summary>
        public override Query Rewrite(IndexReader reader)
        {
            MoreLikeThis moreLikeThis = new MoreLikeThis(reader);

            moreLikeThis.SetFieldNames(moreLikeFields);
            moreLikeThis.Analyzer = analyzer;
            moreLikeThis.MinTermFreq = minTermFrequency;
            // -1 means "leave MoreLikeThis at its own default"
            if (minDocFreq >= 0)
            {
                moreLikeThis.MinDocFreq = minDocFreq;
            }
            moreLikeThis.MaxQueryTerms = maxQueryTerms;
            moreLikeThis.SetStopWords(stopWords);
            BooleanQuery query = (BooleanQuery)moreLikeThis.Like(new System.IO.StringReader(likeText));
            BooleanClause[] clauses = query.GetClauses();
            // require the configured fraction of the generated terms (30% by default) to match
            query.MinimumNumberShouldMatch = (int)(clauses.Length * percentTermsToMatch);
            return query;
        }

        /* (non-Javadoc)
         * <see cref="org.apache.lucene.search.Query.toString(java.lang.String)"/>
         */
        public override String ToString(String field)
        {
            return "like:" + likeText;
        }

        /// <summary>Fraction of the generated clauses that must match (0.3 by default).</summary>
        public float PercentTermsToMatch
        {
            get { return percentTermsToMatch; }
            set { this.percentTermsToMatch = value; }
        }

        /// <summary>Analyzer used to tokenize the example text.</summary>
        public Analyzer Analyzer
        {
            get { return analyzer; }
            set { this.analyzer = value; }
        }

        /// <summary>The example text to find similar documents for.</summary>
        public string LikeText
        {
            get { return likeText; }
            set { this.likeText = value; }
        }

        /// <summary>Maximum number of query terms generated (5 by default).</summary>
        public int MaxQueryTerms
        {
            get { return maxQueryTerms; }
            set { this.maxQueryTerms = value; }
        }

        /// <summary>Minimum term frequency below which terms are ignored (1 by default).</summary>
        public int MinTermFrequency
        {
            get { return minTermFrequency; }
            set { this.minTermFrequency = value; }
        }

        public String[] GetMoreLikeFields()
        {
            return moreLikeFields;
        }

        public void SetMoreLikeFields(String[] moreLikeFields)
        {
            this.moreLikeFields = moreLikeFields;
        }

        public ISet<string> GetStopWords()
        {
            return stopWords;
        }

        public void SetStopWords(ISet<string> stopWords)
        {
            this.stopWords = stopWords;
        }

        /// <summary>Minimum document frequency; -1 (the default) keeps MoreLikeThis's own default.</summary>
        public int MinDocFreq
        {
            get { return minDocFreq; }
            set { this.minDocFreq = value; }
        }
    }
}
- /// - /// - /// </summary> - /// <seealso cref="Lucene.Net.Search.Similar.MoreLikeThis"> - /// </seealso> - public sealed class SimilarityQueries - { - /// <summary> </summary> - private SimilarityQueries() - { - } - - /// <summary> Simple similarity query generators. - /// Takes every unique word and forms a boolean query where all words are optional. - /// After you get this you'll use to to query your <see cref="IndexSearcher"/> for similar docs. - /// The only caveat is the first hit returned <b>should be</b> your source document - you'll - /// need to then ignore that. - /// - /// <p/> - /// - /// So, if you have a code fragment like this: - /// <br/> - /// <code> - /// Query q = formSimilaryQuery( "I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null); - /// </code> - /// - /// <p/> - /// - /// The query returned, in string form, will be <c>'(i use lucene to search fast searchers are good')</c>. - /// - /// <p/> - /// The philosophy behind this method is "two documents are similar if they share lots of words". - /// Note that behind the scenes, Lucenes scoring algorithm will tend to give two documents a higher similarity score if the share more uncommon words. - /// - /// <P/> - /// This method is fail-safe in that if a long 'body' is passed in and - /// <see cref="BooleanQuery.Add"/> (used internally) - /// throws - /// <see cref="BooleanQuery.TooManyClauses"/>, the - /// query as it is will be returned. - /// </summary> - /// <param name="body">the body of the document you want to find similar documents to - /// </param> - /// <param name="a">the analyzer to use to parse the body - /// </param> - /// <param name="field">the field you want to search on, probably something like "contents" or "body" - /// </param> - /// <param name="stop">optional set of stop words to ignore - /// </param> - /// <returns> a query with all unique words in 'body' - /// </returns> - /// <throws> IOException this can't happen... 
</throws> - public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, ISet<string> stop) - { - TokenStream ts = a.TokenStream(field, new System.IO.StringReader(body)); - ITermAttribute termAtt = ts.AddAttribute<ITermAttribute>(); - - BooleanQuery tmp = new BooleanQuery(); - ISet<string> already = Lucene.Net.Support.Compatibility.SetFactory.CreateHashSet<string>(); // ignore dups - while (ts.IncrementToken()) - { - String word = termAtt.Term; - // ignore opt stop words - if (stop != null && stop.Contains(word)) - continue; - // ignore dups - if (already.Contains(word)) - continue; - already.Add(word); - // add to query - TermQuery tq = new TermQuery(new Term(field, word)); - try - { - tmp.Add(tq, Occur.SHOULD); - } - catch (BooleanQuery.TooManyClauses) - { - // fail-safe, just return what we have, not the end of the world - break; - } - } - return tmp; - } - } -} \ No newline at end of file
