Lucene.Net.Analysis.Stempel: Modified the Egothor.Stemmer Compile and DiffIt programs to accept a file encoding on the command line, and cleaned up their implementation
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/933d8351 Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/933d8351 Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/933d8351 Branch: refs/heads/master Commit: 933d8351154e2a40f5ad226d8a96172c1401d1cd Parents: 775df65 Author: Shad Storhaug <[email protected]> Authored: Thu Jul 6 18:49:47 2017 +0700 Committer: Shad Storhaug <[email protected]> Committed: Thu Jul 6 18:49:47 2017 +0700 ---------------------------------------------------------------------- .../Egothor.Stemmer/Compile.cs | 33 ++++++-- .../Egothor.Stemmer/DiffIt.cs | 86 +++++++++++--------- 2 files changed, 72 insertions(+), 47 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucenenet/blob/933d8351/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Compile.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Compile.cs b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Compile.cs index a6d8315..9bbfa71 100644 --- a/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Compile.cs +++ b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/Compile.cs @@ -1,6 +1,7 @@ using Lucene.Net.Support; using Lucene.Net.Support.IO; using System; +using System.Collections.Generic; using System.IO; using System.Text; @@ -90,8 +91,9 @@ namespace Egothor.Stemmer return; } - args[0].ToUpperInvariant(); + args[0] = args[0].ToUpperInvariant(); + // Reads the first char of the first arg backward = args[0][0] == '-'; int qq = (backward) ? 
1 : 0; bool storeorig = false; @@ -109,6 +111,7 @@ namespace Egothor.Stemmer } string charset = null; + var stemmerTables = new List<string>(); try { charset = System.Environment.GetEnvironmentVariable("egothor.stemmer.charset"); @@ -124,15 +127,28 @@ namespace Egothor.Stemmer } } + // LUCENENET specific + // command line argument overrides environment variable or default, if supplied + for (int i = 1; i < args.Length; i++) + { + if ("-e".Equals(args[i]) || "--encoding".Equals(args[i])) + { + charset = args[i]; + } + else + { + stemmerTables.Add(args[i]); + } + } + char[] optimizer = new char[args[0].Length - qq]; for (int i = 0; i < optimizer.Length; i++) { optimizer[i] = args[0][qq + i]; } - for (int i = 1; i < args.Length; i++) + foreach (var stemmerTable in stemmerTables) { - TextReader @in; // System.out.println("[" + args[i] + "]"); Diff diff = new Diff(); //int stems = 0; // not used @@ -141,11 +157,12 @@ namespace Egothor.Stemmer AllocTrie(); - Console.WriteLine(args[i]); - using (@in = new StreamReader( - new FileStream(args[i], FileMode.Open, FileAccess.Read), Encoding.GetEncoding(charset))) + Console.WriteLine(stemmerTable); + using (TextReader input = new StreamReader( + new FileStream(stemmerTable, FileMode.Open, FileAccess.Read), Encoding.GetEncoding(charset))) { - for (string line = @in.ReadLine(); line != null; line = @in.ReadLine()) + string line; + while ((line = input.ReadLine()) != null) { try { @@ -212,7 +229,7 @@ namespace Egothor.Stemmer } using (DataOutputStream os = new DataOutputStream( - new FileStream(args[i] + ".out", FileMode.OpenOrCreate, FileAccess.Write))) + new FileStream(stemmerTable + ".out", FileMode.OpenOrCreate, FileAccess.Write))) { os.WriteUTF(args[0]); trie.Store(os); http://git-wip-us.apache.org/repos/asf/lucenenet/blob/933d8351/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/DiffIt.cs ---------------------------------------------------------------------- diff --git 
a/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/DiffIt.cs b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/DiffIt.cs index 5a1c9bc..4d29472 100644 --- a/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/DiffIt.cs +++ b/src/Lucene.Net.Analysis.Stempel/Egothor.Stemmer/DiffIt.cs @@ -1,5 +1,6 @@ using Lucene.Net.Support; using System; +using System.Collections.Generic; using System.IO; using System.Text; @@ -80,14 +81,6 @@ namespace Egothor.Stemmer } return result; - //try - //{ - // return int.parseInt(s.substring(i, i + 1)); - //} - //catch (Exception /*x*/) - //{ - // return 1; - //} } /// <summary> @@ -101,56 +94,71 @@ namespace Egothor.Stemmer /// <param name="args">the path to a file containing a stemmer table</param> public static void Main(string[] args) { - - int ins = Get(0, args[0]); int del = Get(1, args[0]); int rep = Get(2, args[0]); int nop = Get(3, args[0]); - for (int i = 1; i < args.Length; i++) + string charset = null; + var stemmerTables = new List<string>(); + try { - TextReader @in; - // System.out.println("[" + args[i] + "]"); - Diff diff = new Diff(ins, del, rep, nop); - - string charset = null; - try + charset = System.Environment.GetEnvironmentVariable("egothor.stemmer.charset"); + } + catch + { + } + finally + { + if (string.IsNullOrEmpty(charset)) { - charset = System.Environment.GetEnvironmentVariable("egothor.stemmer.charset"); + charset = "UTF-8"; } - catch + } + + // LUCENENET specific + // command line argument overrides environment variable or default, if supplied + for (int i = 1; i < args.Length; i++) + { + if ("-e".Equals(args[i]) || "--encoding".Equals(args[i])) { + charset = args[i]; } - finally + else { - if (string.IsNullOrEmpty(charset)) - { - charset = "UTF-8"; - } + stemmerTables.Add(args[i]); } + } + + foreach (var stemmerTable in stemmerTables) + { + // System.out.println("[" + args[i] + "]"); + Diff diff = new Diff(ins, del, rep, nop); - @in = new StreamReader(new FileStream(args[i], FileMode.Open, FileAccess.Read), 
Encoding.GetEncoding(charset)); - for (string line = @in.ReadLine(); line != null; line = @in.ReadLine()) + using (TextReader input = new StreamReader(new FileStream(stemmerTable, FileMode.Open, FileAccess.Read), Encoding.GetEncoding(charset))) { - try + string line; + while ((line = input.ReadLine()) != null) { - line = line.ToLowerInvariant(); - StringTokenizer st = new StringTokenizer(line); - string stem = st.NextToken(); - Console.WriteLine(stem + " -a"); - while (st.HasMoreTokens()) + try { - String token = st.NextToken(); - if (token.Equals(stem) == false) + line = line.ToLowerInvariant(); + StringTokenizer st = new StringTokenizer(line); + string stem = st.NextToken(); + Console.WriteLine(stem + " -a"); + while (st.HasMoreTokens()) { - Console.WriteLine(stem + " " + diff.Exec(token, stem)); + string token = st.NextToken(); + if (token.Equals(stem) == false) + { + Console.WriteLine(stem + " " + diff.Exec(token, stem)); + } } } - } - catch (InvalidOperationException /*x*/) - { - // no base token (stem) on a line + catch (InvalidOperationException /*x*/) + { + // no base token (stem) on a line + } } } }
