Repository: lucenenet Updated Branches: refs/heads/api-work a02dd137d -> a73790837
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/WordNet/Syns2Index/Contrib.WordNet.Syns2Index.csproj ---------------------------------------------------------------------- diff --git a/src/contrib/WordNet/Syns2Index/Contrib.WordNet.Syns2Index.csproj b/src/contrib/WordNet/Syns2Index/Contrib.WordNet.Syns2Index.csproj deleted file mode 100644 index 5357413..0000000 --- a/src/contrib/WordNet/Syns2Index/Contrib.WordNet.Syns2Index.csproj +++ /dev/null @@ -1,205 +0,0 @@ -<?xml version="1.0" encoding="utf-8"?> -<!-- - - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - ---> -<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003" DefaultTargets="Build"> - <PropertyGroup> - <ProjectType>Local</ProjectType> - <ProductVersion>8.0.30319</ProductVersion> - <SchemaVersion>2.0</SchemaVersion> - <ProjectGuid>{7563D4D9-AE91-42BA-A270-1D264660F6DF}</ProjectGuid> - <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration> - <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform> - <ApplicationIcon>App.ico</ApplicationIcon> - <AssemblyKeyContainerName /> - <AssemblyName>Lucene.Net.Contrib.WordNet.Syns2Index</AssemblyName> - <AssemblyOriginatorKeyFile /> - <DefaultClientScript>JScript</DefaultClientScript> - <DefaultHTMLPageLayout>Grid</DefaultHTMLPageLayout> - <DefaultTargetSchema>IE50</DefaultTargetSchema> - <DelaySign>false</DelaySign> - <RootNamespace>Syns2Index</RootNamespace> - <RunPostBuildEvent>OnBuildSuccess</RunPostBuildEvent> - <StartupObject /> - <FileUpgradeFlags></FileUpgradeFlags> - <OldToolsVersion>0.0</OldToolsVersion> - <UpgradeBackupLocation /> - <PublishUrl>publish\</PublishUrl> - <Install>true</Install> - <InstallFrom>Disk</InstallFrom> - <UpdateEnabled>false</UpdateEnabled> - <UpdateMode>Foreground</UpdateMode> - <UpdateInterval>7</UpdateInterval> - <UpdateIntervalUnits>Days</UpdateIntervalUnits> - <UpdatePeriodically>false</UpdatePeriodically> - <UpdateRequired>false</UpdateRequired> - <MapFileExtensions>true</MapFileExtensions> - <ApplicationRevision>0</ApplicationRevision> - <ApplicationVersion>1.0.0.%2a</ApplicationVersion> - <IsWebBootstrapper>false</IsWebBootstrapper> - <UseApplicationTrust>false</UseApplicationTrust> - <BootstrapperEnabled>true</BootstrapperEnabled> - </PropertyGroup> - <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "> - <TargetFrameworkVersion>v4.0</TargetFrameworkVersion> - <Framework>$(TargetFrameworkVersion.Replace("v", "NET").Replace(".", ""))</Framework> - <OutputPath>..\..\bin\contrib\WordNet\$(Configuration.Replace("35", ""))\$(Framework)\</OutputPath> - <AllowUnsafeBlocks>false</AllowUnsafeBlocks> - <BaseAddress>285212672</BaseAddress> - <CheckForOverflowUnderflow>false</CheckForOverflowUnderflow> - <ConfigurationOverrideFile /> - <DefineConstants>DEBUG;TRACE;$(Framework)</DefineConstants> - <DocumentationFile></DocumentationFile> - <DebugSymbols>true</DebugSymbols> - <FileAlignment>4096</FileAlignment> - <NoStdLib>false</NoStdLib> - <NoWarn /> - <Optimize>false</Optimize> - <RegisterForComInterop>false</RegisterForComInterop> - <RemoveIntegerChecks>false</RemoveIntegerChecks> - <TreatWarningsAsErrors>false</TreatWarningsAsErrors> - <WarningLevel>4</WarningLevel> - <DebugType>full</DebugType> - <ErrorReport>prompt</ErrorReport> - <OutputType>Library</OutputType> - </PropertyGroup> - <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug35|AnyCPU' "> - <TargetFrameworkVersion>v3.5</TargetFrameworkVersion> - <Framework>$(TargetFrameworkVersion.Replace("v", "NET").Replace(".", ""))</Framework> - <OutputPath>..\..\bin\contrib\WordNet\$(Configuration.Replace("35", ""))\$(Framework)\</OutputPath> - <AllowUnsafeBlocks>false</AllowUnsafeBlocks> - <BaseAddress>285212672</BaseAddress> - <CheckForOverflowUnderflow>false</CheckForOverflowUnderflow> - <ConfigurationOverrideFile /> - <DefineConstants>DEBUG;TRACE;$(Framework)</DefineConstants> - <DocumentationFile></DocumentationFile> - <DebugSymbols>true</DebugSymbols> - <FileAlignment>4096</FileAlignment> - <NoStdLib>false</NoStdLib> - <NoWarn /> - <Optimize>false</Optimize> - <RegisterForComInterop>false</RegisterForComInterop> - <RemoveIntegerChecks>false</RemoveIntegerChecks> - <TreatWarningsAsErrors>false</TreatWarningsAsErrors> - <WarningLevel>4</WarningLevel> - <DebugType>full</DebugType> - <ErrorReport>prompt</ErrorReport> - <OutputType>Library</OutputType> - </PropertyGroup> - <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' "> - <TargetFrameworkVersion>v4.0</TargetFrameworkVersion> - <Framework>$(TargetFrameworkVersion.Replace("v", "NET").Replace(".", ""))</Framework> - <OutputPath>..\..\bin\contrib\WordNet\</OutputPath> - <AllowUnsafeBlocks>false</AllowUnsafeBlocks> - <BaseAddress>285212672</BaseAddress> - <CheckForOverflowUnderflow>false</CheckForOverflowUnderflow> - <ConfigurationOverrideFile /> - <DefineConstants>TRACE;$(Framework)</DefineConstants> - <DocumentationFile></DocumentationFile> - <DebugSymbols>true</DebugSymbols> - <FileAlignment>4096</FileAlignment> - <NoStdLib>false</NoStdLib> - <NoWarn /> - <Optimize>true</Optimize> - <RegisterForComInterop>false</RegisterForComInterop> - <RemoveIntegerChecks>false</RemoveIntegerChecks> - <TreatWarningsAsErrors>false</TreatWarningsAsErrors> - <WarningLevel>4</WarningLevel> - <DebugType>pdbonly</DebugType> - <ErrorReport>prompt</ErrorReport> - <OutputType>Library</OutputType> - </PropertyGroup> - <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release35|AnyCPU' "> - <TargetFrameworkVersion>v3.5</TargetFrameworkVersion> - <Framework>$(TargetFrameworkVersion.Replace("v", "NET").Replace(".", ""))</Framework> - <OutputPath>..\..\bin\contrib\WordNet\</OutputPath> - <AllowUnsafeBlocks>false</AllowUnsafeBlocks> - <BaseAddress>285212672</BaseAddress> - <CheckForOverflowUnderflow>false</CheckForOverflowUnderflow> - <ConfigurationOverrideFile /> - <DefineConstants>TRACE;$(Framework)</DefineConstants> - <DocumentationFile></DocumentationFile> - <DebugSymbols>true</DebugSymbols> - <FileAlignment>4096</FileAlignment> - <NoStdLib>false</NoStdLib> - <NoWarn /> - <Optimize>true</Optimize> - <RegisterForComInterop>false</RegisterForComInterop> - <RemoveIntegerChecks>false</RemoveIntegerChecks> - <TreatWarningsAsErrors>false</TreatWarningsAsErrors> - <WarningLevel>4</WarningLevel> - <DebugType>pdbonly</DebugType> - <ErrorReport>prompt</ErrorReport> - <OutputType>Library</OutputType> - </PropertyGroup> - <ItemGroup> - <Reference Include="System"> - <Name>System</Name> - </Reference> - <Reference Condition="'$(Framework)' == 'NET35'" Include="System.Core" /> - <Reference Condition="'$(Framework)' == 'NET35'" Include="System.Core" /> - <Reference Condition="'$(Framework)' == 'NET35'" Include="System.Core" /> - <Reference Condition="'$(Framework)' == 'NET35'" Include="System.Core" /> - <Reference Condition="'$(Framework)' == 'NET35'" Include="System.Core" /> - <Reference Condition="'$(Framework)' == 'NET35'" Include="System.Core" /> - </ItemGroup> - <ItemGroup> - <Compile Include="AssemblyInfo.cs"> - <SubType>Code</SubType> - </Compile> - <Compile Include="Syns2Index.cs"> - <SubType>Code</SubType> - </Compile> - <Content Include="App.ico" /> - </ItemGroup> - <ItemGroup> - <ProjectReference Include="..\..\..\core\Lucene.Net.csproj"> - <Project>{5D4AD9BE-1FFB-41AB-9943-25737971BF57}</Project> - <Name>Lucene.Net</Name> - </ProjectReference> - </ItemGroup> - <ItemGroup> - <BootstrapperPackage Include=".NETFramework,Version=v4.0"> - <Visible>False</Visible> - <ProductName>Microsoft .NET Framework 4 %28x86 and x64%29</ProductName> - <Install>true</Install> - </BootstrapperPackage> - <BootstrapperPackage Include="Microsoft.Net.Client.3.5"> - <Visible>False</Visible> - <ProductName>.NET Framework 3.5 SP1 Client Profile</ProductName> - <Install>false</Install> - </BootstrapperPackage> - <BootstrapperPackage Include="Microsoft.Net.Framework.3.5.SP1"> - <Visible>False</Visible> - <ProductName>.NET Framework 3.5 SP1</ProductName> - <Install>false</Install> - </BootstrapperPackage> - <BootstrapperPackage Include="Microsoft.Windows.Installer.3.1"> - <Visible>False</Visible> - <ProductName>Windows Installer 3.1</ProductName> - <Install>true</Install> - </BootstrapperPackage> - </ItemGroup> - <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" /> - <PropertyGroup> - <PreBuildEvent /> - <PostBuildEvent /> - </PropertyGroup> -</Project> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/WordNet/Syns2Index/Syns2Index.cs ---------------------------------------------------------------------- diff --git a/src/contrib/WordNet/Syns2Index/Syns2Index.cs b/src/contrib/WordNet/Syns2Index/Syns2Index.cs deleted file mode 100644 index da96a8a..0000000 --- a/src/contrib/WordNet/Syns2Index/Syns2Index.cs +++ /dev/null @@ -1,292 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -using System; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using Lucene.Net.Store; -using Analyzer = Lucene.Net.Analysis.Analyzer; -using Directory = System.IO.Directory; -using StandardAnalyzer = Lucene.Net.Analysis.Standard.StandardAnalyzer; -using Document = Lucene.Net.Documents.Document; -using Field = Lucene.Net.Documents.Field; -using IndexWriter = Lucene.Net.Index.IndexWriter; - -namespace WorldNet.Net -{ - - /// <summary> Convert the prolog file wn_s.pl from the <a href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">WordNet prolog download</a> - /// into a Lucene index suitable for looking up synonyms and performing query expansion (<see cref="SynExpand.Expand"/>). - /// - /// This has been tested with WordNet 2.0. - /// - /// The index has fields named "word" (<see cref="F_WORD"/>) - /// and "syn" (<see cref="F_SYN"/>). - /// <p> - /// The source word (such as 'big') can be looked up in the - /// "word" field, and if present there will be fields named "syn" - /// for every synonym. What's tricky here is that there could be <b>multiple</b> - /// fields with the same name, in the general case for words that have multiple synonyms. - /// That's not a problem with Lucene, you just use <see cref="Document.GetValues"/> - /// </p> - /// <p> - /// While the WordNet file distinguishes groups of synonyms with - /// related meanings we don't do that here. - /// </p> - /// This can take 4 minutes to execute and build an index on a "fast" system and the index takes up almost 3 MB. - /// </summary> - /// - /// <seealso cref="http://www.cogsci.princeton.edu/~wn/"></seealso> - /// <seealso cref="http://www.cogsci.princeton.edu/~wn/man/prologdb.5WN.html"></seealso> - /// <seealso cref="http://www.hostmon.com/rfc/advanced.jsp"> </seealso> - public class Syns2Index - { - /// <summary> </summary> - private static readonly System.IO.StreamWriter o; - - /// <summary> </summary> - private static readonly System.IO.StreamWriter err; - - /// <summary> </summary> - public const System.String F_SYN = "syn"; - - /// <summary> </summary> - public const System.String F_WORD = "word"; - - /// <summary> </summary> - private static readonly Analyzer ana = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT); - - /// <summary> - /// Takes arg of prolog file name and index directory. - /// </summary> - [STAThread] - public static void Main(System.String[] args) - { - // get command line arguments - String prologFilename = null; // name of file "wn_s.pl" - String indexDir = null; - if (args.Length == 2) - { - prologFilename = args[0]; - indexDir = args[1]; - } - else - { - Usage(); - Environment.Exit(1); - } - - // ensure that the prolog file is readable - if (!(new FileInfo(prologFilename)).Exists) - { - err.WriteLine("Error: cannot read Prolog file: " + prologFilename); - Environment.Exit(1); - } - // exit if the target index directory already exists - if (Directory.Exists((new FileInfo(indexDir)).FullName)) - { - err.WriteLine("Error: index directory already exists: " + indexDir); - err.WriteLine("Please specify a name of a non-existent directory"); - Environment.Exit(1); - } - - o.WriteLine("Opening Prolog file " + prologFilename); - var fis = new FileStream(prologFilename, FileMode.Open, FileAccess.Read); - var br = new StreamReader(new StreamReader(fis, System.Text.Encoding.Default).BaseStream, new StreamReader(fis, System.Text.Encoding.Default).CurrentEncoding); - String line; - - // maps a word to all the "groups" it's in - System.Collections.IDictionary word2Nums = new System.Collections.SortedList(); - // maps a group to all the words in it - System.Collections.IDictionary num2Words = new System.Collections.SortedList(); - // number of rejected words - var ndecent = 0; - - // status output - var mod = 1; - var row = 1; - // parse prolog file - o.WriteLine("[1/2] Parsing " + prologFilename); - while ((line = br.ReadLine()) != null) - { - // occasional progress - if ((++row) % mod == 0) // periodically print out line we read in - { - mod *= 2; - o.WriteLine("\t" + row + " " + line + " " + word2Nums.Count + " " + num2Words.Count + " ndecent=" + ndecent); - } - - // syntax check - if (!line.StartsWith("s(")) - { - err.WriteLine("OUCH: " + line); - Environment.Exit(1); - } - - // parse line - line = line.Substring(2); - var comma = line.IndexOf(','); - var num = line.Substring(0, comma); - var q1 = line.IndexOf('\''); - line = line.Substring(q1 + 1); - var q2 = line.IndexOf('\''); - var word = line.Substring(0, q2).ToLower().Replace("''", "'"); - - // make sure is a normal word - if (!IsDecent(word)) - { - ndecent++; - continue; // don't store words w/ spaces - } - - // 1/2: word2Nums map - // append to entry or add new one - var lis = (System.Collections.IList) word2Nums[word]; - if (lis == null) - { - lis = new List<String> {num}; - word2Nums[word] = lis; - } - else - lis.Add(num); - - // 2/2: num2Words map - lis = (System.Collections.IList) num2Words[num]; - if (lis == null) - { - lis = new List<String> { word }; - num2Words[num] = lis; - } - else - lis.Add(word); - } - - // close the streams - fis.Close(); - br.Close(); - - // create the index - o.WriteLine("[2/2] Building index to store synonyms, " + " map sizes are " + word2Nums.Count + " and " + num2Words.Count); - Index(indexDir, word2Nums, num2Words); - } - - /// <summary> - /// Checks to see if a word contains only alphabetic characters by - /// checking it one character at a time. - /// </summary> - /// <param name="s">string to check </param> - /// <returns> <c>true</c> if the string is decent</returns> - private static bool IsDecent(String s) - { - var len = s.Length; - for (var i = 0; i < len; i++) - { - if (!Char.IsLetter(s[i])) - { - return false; - } - } - return true; - } - - /// <summary> - /// Forms a Lucene index based on the 2 maps. - /// </summary> - /// <param name="indexDir">the direcotry where the index should be created</param> - /// <param name="word2Nums">word2Nums</param> - /// <param name="num2Words">num2Words</param> - private static void Index(String indexDir, System.Collections.IDictionary word2Nums, System.Collections.IDictionary num2Words) - { - var row = 0; - var mod = 1; - - using (var dir = FSDirectory.Open(new DirectoryInfo(indexDir))) - { - var writer = new IndexWriter(dir, ana, true, IndexWriter.MaxFieldLength.LIMITED); - writer.UseCompoundFile = true; // why? - - var i1 = word2Nums.Keys.GetEnumerator(); - while (i1.MoveNext()) - { - var g = (String)i1.Current; - var doc = new Document(); - - var n = Index(word2Nums, num2Words, g, doc); - if (n > 0) - { - doc.Add(new Field(F_WORD, g, Field.Store.YES, Field.Index.NOT_ANALYZED)); - if ((++row % mod) == 0) - { - o.WriteLine("\trow=" + row + "/" + word2Nums.Count + " doc= " + doc); - mod *= 2; - } - writer.AddDocument(doc); - } - } - o.WriteLine("Optimizing.."); - writer.Optimize(); - writer.Close(); - } - - } - - /// <summary> - /// Given the 2 maps fills a document for 1 word. - /// </summary> - private static int Index(System.Collections.IDictionary word2Nums, System.Collections.IDictionary num2Words, System.String g, Document doc) - { - var keys = (System.Collections.IList) word2Nums[g]; // get list of key#'s - var i2 = keys.GetEnumerator(); - - var already = new System.Collections.SortedList(); // keep them sorted - - // pass 1: fill up 'already' with all words - while (i2.MoveNext()) // for each key# - { - foreach (var item in - ((System.Collections.IList) num2Words[i2.Current]).Cast<object>().Where(item => already.Contains(item) == false)) - { - already.Add(item, item); - } - } - - var num = 0; - already.Remove(g); // of course a word is it's own syn - var it = already.GetEnumerator(); - while (it.MoveNext()) - { - var cur = (String) it.Key; - // don't store things like 'pit bull' -> 'american pit bull' - if (!IsDecent(cur)) - { - continue; - } - num++; - doc.Add(new Field(F_SYN, cur, Field.Store.YES, Field.Index.NO)); - } - return num; - } - - /// <summary> </summary> - private static void Usage() - { - o.WriteLine("\n\n" + typeof(Syns2Index) + " <prolog file> <index dir>\n\n"); - } - - } -} \ No newline at end of file
