lucene-cli: Added command for Kuromoji DictionaryBuilder tool + tests + documentation
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/bacfcc1a Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/bacfcc1a Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/bacfcc1a Branch: refs/heads/master Commit: bacfcc1adbe0fa46bbc5a3ba1d657258cb9c571d Parents: 0f09201 Author: Shad Storhaug <[email protected]> Authored: Mon Jul 24 00:13:23 2017 +0700 Committer: Shad Storhaug <[email protected]> Committed: Mon Jul 24 00:35:28 2017 +0700 ---------------------------------------------------------------------- ...nalysisKuromojiBuildDictionaryCommandTest.cs | 104 +++++++++++++++++++ .../lucene-cli/Resources/Strings.Designer.cs | 54 ++++++++++ src/tools/lucene-cli/Resources/Strings.resx | 18 ++++ .../commands/analysis/AnalysisCommand.cs | 2 +- .../AnalysisKuromojiBuildDictionaryCommand.cs | 95 +++++++++++++++++ src/tools/lucene-cli/docs/analysis/index.md | 1 + .../docs/analysis/kuromoji-build-dictionary.md | 46 ++++++++ src/tools/lucene-cli/project.json | 3 +- 8 files changed, 321 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucenenet/blob/bacfcc1a/src/tools/Lucene.Net.Tests.Cli/Commands/Analysis/AnalysisKuromojiBuildDictionaryCommandTest.cs ---------------------------------------------------------------------- diff --git a/src/tools/Lucene.Net.Tests.Cli/Commands/Analysis/AnalysisKuromojiBuildDictionaryCommandTest.cs b/src/tools/Lucene.Net.Tests.Cli/Commands/Analysis/AnalysisKuromojiBuildDictionaryCommandTest.cs new file mode 100644 index 0000000..c8eaa41 --- /dev/null +++ b/src/tools/Lucene.Net.Tests.Cli/Commands/Analysis/AnalysisKuromojiBuildDictionaryCommandTest.cs @@ -0,0 +1,104 @@ +using Lucene.Net.Attributes; +using Lucene.Net.Cli.CommandLine; +using NUnit.Framework; +using System.Collections.Generic; +using System.Linq; +using System.Text.RegularExpressions; + +namespace 
Lucene.Net.Cli.Commands +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + public class AnalysisKuromojiBuildDictionaryCommandTest : CommandTestCase + { + protected override ConfigurationBase CreateConfiguration(MockConsoleApp app) + { + return new AnalysisKuromojiBuildDictionaryCommand.Configuration(new CommandLineOptions()) { Main = (args) => app.Main(args) }; + } + + protected override IList<Arg[]> GetOptionalArgs() + { + // NOTE: We must order this in the sequence of the expected output. + return new List<Arg[]>() + { + new Arg[] { new Arg(inputPattern: "-e UTF-16|--encoding UTF-16", output: new string[] { "--encoding", "UTF-16" }) }, + new Arg[] { new Arg(inputPattern: "-n|--normalize", output: new string[] { "true" }) }, + }; + } + protected override IList<Arg[]> GetRequiredArgs() + { + // NOTE: We must order this in the sequence of the expected output. 
+ return new List<Arg[]>() + { + new Arg[] { new Arg(inputPattern: "epidic", output: new string[] { @"epidic" }) }, + new Arg[] { new Arg(inputPattern: @"C:\lucene-input", output: new string[] { @"C:\lucene-input" }) }, + new Arg[] { new Arg(inputPattern: @"C:\lucene-output", output: new string[] { @"C:\lucene-output" }) }, + }; + } + + [Test] + [LuceneNetSpecific] + public override void TestAllValidCombinations() + { + var requiredArgs = GetRequiredArgs().ExpandArgs().RequiredParameters(); + var optionalArgs = GetOptionalArgs().ExpandArgs().OptionalParameters(); + + foreach (var requiredArg in requiredArgs) + { + AssertCommandTranslation( + string.Join(" ", requiredArg.Select(x => x.InputPattern).ToArray()), + requiredArg.SelectMany(x => x.Output) + + .Concat(new string[] { + // Special case: the encoding must always be supplied + "utf-8", + // Special case: normalize must always be supplied + "false" + }).ToArray()); + } + + foreach (var requiredArg in requiredArgs) + { + foreach (var optionalArg in optionalArgs) + { + string command = string.Join(" ", requiredArg.Select(x => x.InputPattern).Union(optionalArg.Select(x => x.InputPattern).ToArray())); + string[] expected = requiredArg.SelectMany(x => x.Output) + // Special case: the encoding must always be supplied + .Concat(Regex.IsMatch(command, "-e|--encoding") ? new string[] { "UTF-16" } : new string[] { "utf-8" }) + // Special case: the encoding must always be supplied + .Concat(Regex.IsMatch(command, "-n|--normalize") ? 
new string[] { "true" } : new string[] { "false" }).ToArray(); + AssertCommandTranslation(command, expected); + } + } + } + + [Test] + [LuceneNetSpecific] + public virtual void TestNotEnoughArguments() + { + AssertConsoleOutput("one two", FromResource("NotEnoughArguments", 3)); + } + + [Test] + [LuceneNetSpecific] + public virtual void TestTooManyArguments() + { + Assert.Throws<CommandParsingException>(() => AssertConsoleOutput("one two three four", "")); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/bacfcc1a/src/tools/lucene-cli/Resources/Strings.Designer.cs ---------------------------------------------------------------------- diff --git a/src/tools/lucene-cli/Resources/Strings.Designer.cs b/src/tools/lucene-cli/Resources/Strings.Designer.cs index 5d1fa93..9af44ff 100644 --- a/src/tools/lucene-cli/Resources/Strings.Designer.cs +++ b/src/tools/lucene-cli/Resources/Strings.Designer.cs @@ -69,6 +69,60 @@ namespace Lucene.Net.Cli.Resources { } /// <summary> + /// Looks up a localized string similar to Builds a custom dictionary that can be used by the JapaneseAnalyzer or JapaneseTokenizer.. + /// </summary> + public static string AnalysisKuromojiBuildDictionaryCommandDescription { + get { + return ResourceManager.GetString("AnalysisKuromojiBuildDictionaryCommandDescription", resourceCulture); + } + } + + /// <summary> + /// Looks up a localized string similar to The dictionary format. Valid values are IPADIC and UNIDIC. If an invalid value is passed, IPADIC is assumed.. + /// </summary> + public static string AnalysisKuromojiBuildDictionaryCommandFormatDescription { + get { + return ResourceManager.GetString("AnalysisKuromojiBuildDictionaryCommandFormatDescription", resourceCulture); + } + } + + /// <summary> + /// Looks up a localized string similar to The directory where the dictionary input files are located.. 
+ /// </summary> + public static string AnalysisKuromojiBuildDictionaryCommandInputDirectoryDescription { + get { + return ResourceManager.GetString("AnalysisKuromojiBuildDictionaryCommandInputDirectoryDescription", resourceCulture); + } + } + + /// <summary> + /// Looks up a localized string similar to The file encoding used by the input files. If not supplied, the default value is `UTF-8`.. + /// </summary> + public static string AnalysisKuromojiBuildDictionaryCommandInputDirectoryEncodingDescription { + get { + return ResourceManager.GetString("AnalysisKuromojiBuildDictionaryCommandInputDirectoryEncodingDescription", resourceCulture); + } + } + + /// <summary> + /// Looks up a localized string similar to Normalize the entries using normalization form KC.. + /// </summary> + public static string AnalysisKuromojiBuildDictionaryCommandNormalizeDescription { + get { + return ResourceManager.GetString("AnalysisKuromojiBuildDictionaryCommandNormalizeDescription", resourceCulture); + } + } + + /// <summary> + /// Looks up a localized string similar to The directory to put the dictionary output.. + /// </summary> + public static string AnalysisKuromojiBuildDictionaryCommandOutputDirectoryDescription { + get { + return ResourceManager.GetString("AnalysisKuromojiBuildDictionaryCommandOutputDirectoryDescription", resourceCulture); + } + } + + /// <summary> /// Looks up a localized string similar to Compiles a stemmer table for the Egothor stemmer.. 
/// </summary> public static string AnalysisStempelCompileStemsCommandDescription { http://git-wip-us.apache.org/repos/asf/lucenenet/blob/bacfcc1a/src/tools/lucene-cli/Resources/Strings.resx ---------------------------------------------------------------------- diff --git a/src/tools/lucene-cli/Resources/Strings.resx b/src/tools/lucene-cli/Resources/Strings.resx index 64be738..727cb62 100644 --- a/src/tools/lucene-cli/Resources/Strings.resx +++ b/src/tools/lucene-cli/Resources/Strings.resx @@ -120,6 +120,24 @@ <data name="AnalysisCommandDescription" xml:space="preserve"> <value>Utilities to manage specialized analyzers.</value> </data> + <data name="AnalysisKuromojiBuildDictionaryCommandDescription" xml:space="preserve"> + <value>Builds a custom dictionary that can be used by the JapaneseAnalyzer or JapaneseTokenizer.</value> + </data> + <data name="AnalysisKuromojiBuildDictionaryCommandFormatDescription" xml:space="preserve"> + <value>The dictionary format. Valid values are IPADIC and UNIDIC. If an invalid value is passed, IPADIC is assumed.</value> + </data> + <data name="AnalysisKuromojiBuildDictionaryCommandInputDirectoryDescription" xml:space="preserve"> + <value>The directory where the dictionary input files are located.</value> + </data> + <data name="AnalysisKuromojiBuildDictionaryCommandInputDirectoryEncodingDescription" xml:space="preserve"> + <value>The file encoding used by the input files. 
If not supplied, the default value is `UTF-8`.</value> + </data> + <data name="AnalysisKuromojiBuildDictionaryCommandNormalizeDescription" xml:space="preserve"> + <value>Normalize the entries using normalization form KC.</value> + </data> + <data name="AnalysisKuromojiBuildDictionaryCommandOutputDirectoryDescription" xml:space="preserve"> + <value>The directory to put the dictionary output.</value> + </data> <data name="AnalysisStempelCompileStemsCommandDescription" xml:space="preserve"> <value>Compiles a stemmer table for the Egothor stemmer.</value> </data> http://git-wip-us.apache.org/repos/asf/lucenenet/blob/bacfcc1a/src/tools/lucene-cli/commands/analysis/AnalysisCommand.cs ---------------------------------------------------------------------- diff --git a/src/tools/lucene-cli/commands/analysis/AnalysisCommand.cs b/src/tools/lucene-cli/commands/analysis/AnalysisCommand.cs index 969bd58..a39eaeb 100644 --- a/src/tools/lucene-cli/commands/analysis/AnalysisCommand.cs +++ b/src/tools/lucene-cli/commands/analysis/AnalysisCommand.cs @@ -27,7 +27,7 @@ this.Description = FromResource("Description"); //this.Commands.Add(new AnalysisICUBuildRBBIRulesCommand.Configuration(options)); - //this.Commands.Add(new AnalysisKuromojiBuildDictionaryCommand.Configuration(options)); + this.Commands.Add(new AnalysisKuromojiBuildDictionaryCommand.Configuration(options)); this.Commands.Add(new AnalysisStempelCompileStemsCommand.Configuration(options)); this.Commands.Add(new AnalysisStempelPatchStemsCommand.Configuration(options)); http://git-wip-us.apache.org/repos/asf/lucenenet/blob/bacfcc1a/src/tools/lucene-cli/commands/analysis/analysis-kuromoji-build-dictionary/AnalysisKuromojiBuildDictionaryCommand.cs ---------------------------------------------------------------------- diff --git a/src/tools/lucene-cli/commands/analysis/analysis-kuromoji-build-dictionary/AnalysisKuromojiBuildDictionaryCommand.cs 
b/src/tools/lucene-cli/commands/analysis/analysis-kuromoji-build-dictionary/AnalysisKuromojiBuildDictionaryCommand.cs new file mode 100644 index 0000000..7f10ed7 --- /dev/null +++ b/src/tools/lucene-cli/commands/analysis/analysis-kuromoji-build-dictionary/AnalysisKuromojiBuildDictionaryCommand.cs @@ -0,0 +1,95 @@ +using Lucene.Net.Analysis.Ja.Util; +using Lucene.Net.Cli.CommandLine; +using System.Collections.Generic; + +namespace Lucene.Net.Cli +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + public class AnalysisKuromojiBuildDictionaryCommand : ICommand + { + public class Configuration : ConfigurationBase + { + public Configuration(CommandLineOptions options) + { + this.Main = (args) => DictionaryBuilder.Main(args); + + this.Name = "kuromoji-build-dictionary"; + this.Description = FromResource("Description"); + + this.Format = this.Argument( + "<FORMAT>", + FromResource("FormatDescription")); + this.InputDirectory = this.Argument( + "<INPUT_DIRECTORY>", + FromResource("InputDirectoryDescription")); + this.OutputDirectory = this.Argument( + "<OUTPUT_DIRECTORY>", + FromResource("OutputDirectoryDescription")); + this.InputDirectoryEncoding = this.Option( + "-e|--encoding <ENCODING>", + FromResource("InputDirectoryEncodingDescription"), + CommandOptionType.SingleValue); + this.Normalize = this.Option( + "-n|--normalize", + FromResource("NormalizeDescription"), + CommandOptionType.NoValue); + + this.OnExecute(() => new AnalysisKuromojiBuildDictionaryCommand().Run(this)); + } + + public virtual CommandArgument Format { get; private set; } + public virtual CommandArgument InputDirectory { get; private set; } + public virtual CommandArgument OutputDirectory { get; private set; } + public virtual CommandOption InputDirectoryEncoding { get; private set; } + public virtual CommandOption Normalize { get; private set; } + } + + public int Run(ConfigurationBase cmd) + { + if (!cmd.ValidateArguments(3)) + { + return 1; + } + + var input = cmd as Configuration; + var args = new List<string>(input.GetNonNullArguments()); + + if (input.InputDirectoryEncoding.HasValue()) + { + args.Add(input.InputDirectoryEncoding.Value()); + } + else + { + args.Add("utf-8"); + } + + if (input.Normalize.HasValue()) + { + args.Add("true"); + } + else + { + args.Add("false"); + } + + cmd.Main(args.ToArray()); + return 0; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/bacfcc1a/src/tools/lucene-cli/docs/analysis/index.md 
---------------------------------------------------------------------- diff --git a/src/tools/lucene-cli/docs/analysis/index.md b/src/tools/lucene-cli/docs/analysis/index.md index c114294..9843805 100644 --- a/src/tools/lucene-cli/docs/analysis/index.md +++ b/src/tools/lucene-cli/docs/analysis/index.md @@ -6,5 +6,6 @@ Utilities to manage specialized analyzers. ## Commands +- [kuromoji-build-dictionary](kuromoji-build-dictionary.md) - [stempel-compile-stems](stempel-compile-stems.md) - [stempel-patch-stems](stempel-patch-stems.md) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/bacfcc1a/src/tools/lucene-cli/docs/analysis/kuromoji-build-dictionary.md ---------------------------------------------------------------------- diff --git a/src/tools/lucene-cli/docs/analysis/kuromoji-build-dictionary.md b/src/tools/lucene-cli/docs/analysis/kuromoji-build-dictionary.md new file mode 100644 index 0000000..9fd7cf6 --- /dev/null +++ b/src/tools/lucene-cli/docs/analysis/kuromoji-build-dictionary.md @@ -0,0 +1,46 @@ +# kuromoji-build-dictionary + +### Name + +`analysis-kuromoji-build-dictionary` - Generates a dictionary file for the JapaneseAnalyzer or JapaneseTokenizer in the Lucene.Net.Analysis.Kuromoji project. + +### Synopsis + +<code>dotnet lucene-cli.dll analysis kuromoji-build-dictionary <FORMAT> <INPUT_DIRECTORY> <OUTPUT_DIRECTORY> [-e|--encoding <ENCODING>] [-n|--normalize] [?|-h|--help]</code> + +### Description + +See the [Kuromoji project documentation](https://github.com/atilika/kuromoji) for more information. + +### Arguments + +`FORMAT` + +The dictionary format. Valid values are IPADIC and UNIDIC. If an invalid value is passed, IPADIC is assumed. + +`INPUT_DIRECTORY` + +The directory where the dictionary input files are located. + +`OUTPUT_DIRECTORY` + +The directory to put the dictionary output. + +### Options + +`?|-h|--help` + +Prints out a short help for the command. 
+ +`-e|--encoding <ENCODING>` + +The file encoding used by the input files. If not supplied, the default value is `UTF-8`. + +`-n|--normalize` + +Normalize the entries using normalization form KC. + +### Example + +<code>dotnet lucene-cli.dll analysis kuromoji-build-dictionary IPADIC X:\kuromoji-data X:\kuromoji-dictionary --encoding UTF-16</code> + http://git-wip-us.apache.org/repos/asf/lucenenet/blob/bacfcc1a/src/tools/lucene-cli/project.json ---------------------------------------------------------------------- diff --git a/src/tools/lucene-cli/project.json b/src/tools/lucene-cli/project.json index 219964d..767a705 100644 --- a/src/tools/lucene-cli/project.json +++ b/src/tools/lucene-cli/project.json @@ -1,4 +1,4 @@ -{ +{ "version": "4.8.0", "entryPoint": "Program", "buildOptions": { @@ -19,6 +19,7 @@ "dependencies": { "Lucene.Net": "4.8.0", "Lucene.Net.Analysis.Common": "4.8.0", + "Lucene.Net.Analysis.Kuromoji": "4.8.0", "Lucene.Net.Analysis.Stempel": "4.8.0", "Lucene.Net.Demo": "4.8.0", "Lucene.Net.Expressions": "4.8.0",
