Ported Lucene.Net.Benchmark + tests
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/b515271d Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/b515271d Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/b515271d Branch: refs/heads/master Commit: b515271d8821dde3cd980beae780d204fd6b0e5c Parents: 1e52293 Author: Shad Storhaug <[email protected]> Authored: Mon Jul 31 14:26:48 2017 +0700 Committer: Shad Storhaug <[email protected]> Committed: Wed Aug 2 09:54:52 2017 +0700 ---------------------------------------------------------------------- Lucene.Net.sln | 52 + src/Lucene.Net.Benchmark/ByTask/Benchmark.cs | 170 +++ .../ByTask/Feeds/AbstractQueryMaker.cs | 85 ++ .../ByTask/Feeds/ContentItemsSource.cs | 227 ++++ .../ByTask/Feeds/ContentSource.cs | 38 + .../ByTask/Feeds/DemoHTMLParser.cs | 259 ++++ .../ByTask/Feeds/DirContentSource.cs | 259 ++++ .../ByTask/Feeds/DocData.cs | 73 ++ .../ByTask/Feeds/DocMaker.cs | 511 ++++++++ .../ByTask/Feeds/EnwikiContentSource.cs | 394 ++++++ .../ByTask/Feeds/EnwikiQueryMaker.cs | 146 +++ .../ByTask/Feeds/FacetSource.cs | 47 + .../ByTask/Feeds/FileBasedQueryMaker.cs | 121 ++ .../ByTask/Feeds/GeonamesLineParser.cs | 53 + .../ByTask/Feeds/HTMLParser.cs | 42 + .../ByTask/Feeds/LineDocSource.cs | 328 +++++ .../ByTask/Feeds/LongToEnglishContentSource.cs | 72 ++ .../ByTask/Feeds/LongToEnglishQueryMaker.cs | 89 ++ .../ByTask/Feeds/NoMoreDataException.cs | 50 + .../ByTask/Feeds/QueryMaker.cs | 48 + .../ByTask/Feeds/RandomFacetSource.cs | 109 ++ .../ByTask/Feeds/ReutersContentSource.cs | 140 +++ .../ByTask/Feeds/ReutersQueryMaker.cs | 126 ++ .../ByTask/Feeds/SimpleQueryMaker.cs | 70 ++ .../Feeds/SimpleSloppyPhraseQueryMaker.cs | 88 ++ .../ByTask/Feeds/SingleDocSource.cs | 77 ++ .../ByTask/Feeds/SortableSingleDocSource.cs | 114 ++ .../ByTask/Feeds/SpatialDocMaker.cs | 249 ++++ .../ByTask/Feeds/SpatialFileQueryMaker.cs | 131 ++ .../ByTask/Feeds/TrecContentSource.cs | 350 ++++++ 
.../ByTask/Feeds/TrecDocParser.cs | 159 +++ .../ByTask/Feeds/TrecFBISParser.cs | 68 + .../ByTask/Feeds/TrecFR94Parser.cs | 69 + .../ByTask/Feeds/TrecFTParser.cs | 58 + .../ByTask/Feeds/TrecGov2Parser.cs | 57 + .../ByTask/Feeds/TrecLATimesParser.cs | 75 ++ .../ByTask/Feeds/TrecParserByPath.cs | 34 + src/Lucene.Net.Benchmark/ByTask/PerfRunData.cs | 490 ++++++++ .../ByTask/Programmatic/Sample.cs | 90 ++ src/Lucene.Net.Benchmark/ByTask/Stats/Points.cs | 108 ++ src/Lucene.Net.Benchmark/ByTask/Stats/Report.cs | 70 ++ .../ByTask/Stats/TaskStats.cs | 237 ++++ .../ByTask/Tasks/AddDocTask.cs | 93 ++ .../ByTask/Tasks/AddFacetedDocTask.cs | 95 ++ .../ByTask/Tasks/AddIndexesTask.cs | 104 ++ .../ByTask/Tasks/AnalyzerFactoryTask.cs | 580 +++++++++ .../ByTask/Tasks/BenchmarkHighlighter.cs | 32 + .../ByTask/Tasks/ClearStatsTask.cs | 44 + .../ByTask/Tasks/CloseIndexTask.cs | 67 + .../ByTask/Tasks/CloseReaderTask.cs | 49 + .../ByTask/Tasks/CloseTaxonomyIndexTask.cs | 42 + .../ByTask/Tasks/CloseTaxonomyReaderTask.cs | 47 + .../ByTask/Tasks/CommitIndexTask.cs | 62 + .../ByTask/Tasks/CommitTaxonomyIndexTask.cs | 48 + .../ByTask/Tasks/ConsumeContentSourceTask.cs | 48 + .../ByTask/Tasks/CreateIndexTask.cs | 225 ++++ .../ByTask/Tasks/CreateTaxonomyIndexTask.cs | 42 + .../ByTask/Tasks/ForceMergeTask.cs | 61 + .../ByTask/Tasks/NearRealtimeReaderTask.cs | 132 ++ .../ByTask/Tasks/NewAnalyzerTask.cs | 189 +++ .../ByTask/Tasks/NewCollationAnalyzerTask.cs | 149 +++ .../ByTask/Tasks/NewLocaleTask.cs | 97 ++ .../ByTask/Tasks/NewRoundTask.cs | 44 + .../ByTask/Tasks/OpenIndexTask.cs | 88 ++ .../ByTask/Tasks/OpenReaderTask.cs | 100 ++ .../ByTask/Tasks/OpenTaxonomyIndexTask.cs | 41 + .../ByTask/Tasks/OpenTaxonomyReaderTask.cs | 44 + .../ByTask/Tasks/PerfTask.cs | 380 ++++++ .../ByTask/Tasks/PrintReaderTask.cs | 60 + .../ByTask/Tasks/ReadTask.cs | 339 +++++ .../ByTask/Tasks/ReadTokensTask.cs | 160 +++ .../ByTask/Tasks/ReopenReaderTask.cs | 45 + .../ByTask/Tasks/RepAllTask.cs | 83 ++ 
.../ByTask/Tasks/RepSelectByPrefTask.cs | 81 ++ .../ByTask/Tasks/RepSumByNameRoundTask.cs | 83 ++ .../ByTask/Tasks/RepSumByNameTask.cs | 81 ++ .../ByTask/Tasks/RepSumByPrefRoundTask.cs | 79 ++ .../ByTask/Tasks/RepSumByPrefTask.cs | 91 ++ .../ByTask/Tasks/ReportTask.cs | 189 +++ .../ByTask/Tasks/ResetInputsTask.cs | 43 + .../ByTask/Tasks/ResetSystemEraseTask.cs | 42 + .../ByTask/Tasks/ResetSystemSoftTask.cs | 41 + .../ByTask/Tasks/RollbackIndexTask.cs | 52 + .../ByTask/Tasks/SearchTask.cs | 60 + .../ByTask/Tasks/SearchTravRetHighlightTask.cs | 188 +++ .../Tasks/SearchTravRetLoadFieldSelectorTask.cs | 85 ++ .../ByTask/Tasks/SearchTravRetTask.cs | 44 + .../Tasks/SearchTravRetVectorHighlightTask.cs | 191 +++ .../ByTask/Tasks/SearchTravTask.cs | 87 ++ .../ByTask/Tasks/SearchWithCollectorTask.cs | 99 ++ .../ByTask/Tasks/SearchWithSortTask.cs | 157 +++ .../ByTask/Tasks/SetPropTask.cs | 71 ++ .../ByTask/Tasks/TaskSequence.cs | 662 ++++++++++ .../ByTask/Tasks/UpdateDocTask.cs | 99 ++ .../ByTask/Tasks/WaitForMergesTask.cs | 36 + .../ByTask/Tasks/WaitTask.cs | 89 ++ .../ByTask/Tasks/WarmTask.cs | 64 + .../ByTask/Tasks/WriteEnwikiLineDocTask.cs | 72 ++ .../ByTask/Tasks/WriteLineDocTask.cs | 238 ++++ .../ByTask/Utils/Algorithm.cs | 459 +++++++ .../ByTask/Utils/AnalyzerFactory.cs | 156 +++ src/Lucene.Net.Benchmark/ByTask/Utils/Config.cs | 559 +++++++++ .../ByTask/Utils/FileUtils.cs | 46 + src/Lucene.Net.Benchmark/ByTask/Utils/Format.cs | 109 ++ .../ByTask/Utils/StreamUtils.cs | 132 ++ src/Lucene.Net.Benchmark/Constants.cs | 33 + .../Lucene.Net.Benchmark.csproj | 214 ++++ .../Lucene.Net.Benchmark.project.json | 15 + .../Properties/AssemblyInfo.cs | 30 + src/Lucene.Net.Benchmark/Quality/Judge.cs | 55 + .../Quality/QualityBenchmark.cs | 159 +++ .../Quality/QualityQuery.cs | 107 ++ .../Quality/QualityQueryParser.cs | 35 + .../Quality/QualityStats.cs | 339 +++++ .../Quality/Trec/QueryDriver.cs | 93 ++ .../Quality/Trec/Trec1MQReader.cs | 92 ++ .../Quality/Trec/TrecJudge.cs | 186 +++ 
.../Quality/Trec/TrecTopicsReader.cs | 154 +++ .../Quality/Utils/DocNameExtractor.cs | 89 ++ .../Quality/Utils/QualityQueriesFinder.cs | 152 +++ .../Quality/Utils/SimpleQQParser.cs | 76 ++ .../Quality/Utils/SubmissionReport.cs | 98 ++ .../Utils/ExtractReuters.cs | 167 +++ .../Utils/ExtractWikipedia.cs | 178 +++ src/Lucene.Net.Benchmark/project.json | 53 + src/Lucene.Net.TestFramework/Util/TestUtil.cs | 22 +- .../BenchmarkTestCase.cs | 129 ++ .../ByTask/Feeds/DocMakerTest.cs | 193 +++ .../ByTask/Feeds/EnwikiContentSourceTest.cs | 194 +++ .../ByTask/Feeds/LineDocSourceTest.cs | 271 ++++ .../ByTask/Feeds/TestHtmlParser.cs | 164 +++ .../ByTask/Feeds/TrecContentSourceTest.cs | 431 +++++++ .../ByTask/Feeds/trecdocs.zip | Bin 0 -> 2514 bytes .../ByTask/Tasks/AddIndexesTaskTest.cs | 153 +++ .../ByTask/Tasks/Alt/AltPackageTaskTest.cs | 68 + .../ByTask/Tasks/Alt/AltTestTask.cs | 35 + .../ByTask/Tasks/CommitIndexTaskTest.cs | 63 + .../ByTask/Tasks/CountingHighlighterTestTask.cs | 85 ++ .../ByTask/Tasks/CountingSearchTestTask.cs | 65 + .../ByTask/Tasks/CreateIndexTaskTest.cs | 129 ++ .../ByTask/Tasks/PerfTaskTest.cs | 81 ++ .../ByTask/Tasks/SearchWithSortTaskTest.cs | 35 + .../ByTask/Tasks/WriteEnwikiLineDocTaskTest.cs | 121 ++ .../ByTask/Tasks/WriteLineDocTaskTest.cs | 436 +++++++ .../ByTask/TestPerfTasksLogic.cs | 1177 ++++++++++++++++++ .../ByTask/TestPerfTasksParse.cs | 178 +++ .../ByTask/Utils/StreamUtilsTest.cs | 149 +++ .../ByTask/Utils/TestConfig.cs | 37 + src/Lucene.Net.Tests.Benchmark/ByTask/conf.zip | Bin 0 -> 40878 bytes .../ByTask/reuters.first20.lines.txt | 20 + .../test-mapping-ISOLatin1Accent-partial.txt | 30 + .../Conf/ConfLoader.cs | 28 + .../Lucene.Net.Tests.Benchmark.csproj | 129 ++ .../Lucene.Net.Tests.Benchmark.project.json | 13 + .../Properties/AssemblyInfo.cs | 36 + .../Quality/TestQualityRun.cs | 210 ++++ .../Quality/reuters.578.lines.txt.bz2 | Bin 0 -> 208314 bytes .../Quality/trecQRels.txt | 723 +++++++++++ .../Quality/trecTopics.txt | 287 +++++ 
.../Support/TestApiConsistency.cs | 150 +++ .../Support/TestExceptionSerialization.cs | 54 + src/Lucene.Net.Tests.Benchmark/project.json | 56 + 162 files changed, 22383 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/Lucene.Net.sln ---------------------------------------------------------------------- diff --git a/Lucene.Net.sln b/Lucene.Net.sln index 5450020..08a00a0 100644 --- a/Lucene.Net.sln +++ b/Lucene.Net.sln @@ -110,6 +110,10 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Analysis.Kuromoj EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Tests.Analysis.Kuromoji", "src\Lucene.Net.Tests.Analysis.Kuromoji\Lucene.Net.Tests.Analysis.Kuromoji.csproj", "{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Benchmark", "src\Lucene.Net.Benchmark\Lucene.Net.Benchmark.csproj", "{EDC77CB4-597F-4818-8C83-3C006D12C384}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Tests.Benchmark", "src\Lucene.Net.Tests.Benchmark\Lucene.Net.Tests.Benchmark.csproj", "{9257F543-44E2-4DB6-8B27-A8A354C13E5B}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -1111,6 +1115,54 @@ Global {34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Release35|Mixed Platforms.Build.0 = Release|Any CPU {34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Release35|x86.ActiveCfg = Release|Any CPU {34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Release35|x86.Build.0 = Release|Any CPU + {EDC77CB4-597F-4818-8C83-3C006D12C384}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {EDC77CB4-597F-4818-8C83-3C006D12C384}.Debug|Any CPU.Build.0 = Debug|Any CPU + {EDC77CB4-597F-4818-8C83-3C006D12C384}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU + {EDC77CB4-597F-4818-8C83-3C006D12C384}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU + 
{EDC77CB4-597F-4818-8C83-3C006D12C384}.Debug|x86.ActiveCfg = Debug|Any CPU + {EDC77CB4-597F-4818-8C83-3C006D12C384}.Debug|x86.Build.0 = Debug|Any CPU + {EDC77CB4-597F-4818-8C83-3C006D12C384}.Debug35|Any CPU.ActiveCfg = Debug|Any CPU + {EDC77CB4-597F-4818-8C83-3C006D12C384}.Debug35|Any CPU.Build.0 = Debug|Any CPU + {EDC77CB4-597F-4818-8C83-3C006D12C384}.Debug35|Mixed Platforms.ActiveCfg = Debug|Any CPU + {EDC77CB4-597F-4818-8C83-3C006D12C384}.Debug35|Mixed Platforms.Build.0 = Debug|Any CPU + {EDC77CB4-597F-4818-8C83-3C006D12C384}.Debug35|x86.ActiveCfg = Debug|Any CPU + {EDC77CB4-597F-4818-8C83-3C006D12C384}.Debug35|x86.Build.0 = Debug|Any CPU + {EDC77CB4-597F-4818-8C83-3C006D12C384}.Release|Any CPU.ActiveCfg = Release|Any CPU + {EDC77CB4-597F-4818-8C83-3C006D12C384}.Release|Any CPU.Build.0 = Release|Any CPU + {EDC77CB4-597F-4818-8C83-3C006D12C384}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU + {EDC77CB4-597F-4818-8C83-3C006D12C384}.Release|Mixed Platforms.Build.0 = Release|Any CPU + {EDC77CB4-597F-4818-8C83-3C006D12C384}.Release|x86.ActiveCfg = Release|Any CPU + {EDC77CB4-597F-4818-8C83-3C006D12C384}.Release|x86.Build.0 = Release|Any CPU + {EDC77CB4-597F-4818-8C83-3C006D12C384}.Release35|Any CPU.ActiveCfg = Release|Any CPU + {EDC77CB4-597F-4818-8C83-3C006D12C384}.Release35|Any CPU.Build.0 = Release|Any CPU + {EDC77CB4-597F-4818-8C83-3C006D12C384}.Release35|Mixed Platforms.ActiveCfg = Release|Any CPU + {EDC77CB4-597F-4818-8C83-3C006D12C384}.Release35|Mixed Platforms.Build.0 = Release|Any CPU + {EDC77CB4-597F-4818-8C83-3C006D12C384}.Release35|x86.ActiveCfg = Release|Any CPU + {EDC77CB4-597F-4818-8C83-3C006D12C384}.Release35|x86.Build.0 = Release|Any CPU + {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Debug|Any CPU.Build.0 = Debug|Any CPU + {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU + {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Debug|Mixed 
Platforms.Build.0 = Debug|Any CPU + {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Debug|x86.ActiveCfg = Debug|Any CPU + {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Debug|x86.Build.0 = Debug|Any CPU + {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Debug35|Any CPU.ActiveCfg = Debug|Any CPU + {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Debug35|Any CPU.Build.0 = Debug|Any CPU + {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Debug35|Mixed Platforms.ActiveCfg = Debug|Any CPU + {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Debug35|Mixed Platforms.Build.0 = Debug|Any CPU + {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Debug35|x86.ActiveCfg = Debug|Any CPU + {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Debug35|x86.Build.0 = Debug|Any CPU + {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Release|Any CPU.ActiveCfg = Release|Any CPU + {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Release|Any CPU.Build.0 = Release|Any CPU + {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU + {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Release|Mixed Platforms.Build.0 = Release|Any CPU + {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Release|x86.ActiveCfg = Release|Any CPU + {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Release|x86.Build.0 = Release|Any CPU + {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Release35|Any CPU.ActiveCfg = Release|Any CPU + {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Release35|Any CPU.Build.0 = Release|Any CPU + {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Release35|Mixed Platforms.ActiveCfg = Release|Any CPU + {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Release35|Mixed Platforms.Build.0 = Release|Any CPU + {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Release35|x86.ActiveCfg = Release|Any CPU + {9257F543-44E2-4DB6-8B27-A8A354C13E5B}.Release35|x86.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Benchmark.cs ---------------------------------------------------------------------- diff 
--git a/src/Lucene.Net.Benchmark/ByTask/Benchmark.cs b/src/Lucene.Net.Benchmark/ByTask/Benchmark.cs new file mode 100644 index 0000000..9f3ad70 --- /dev/null +++ b/src/Lucene.Net.Benchmark/ByTask/Benchmark.cs @@ -0,0 +1,170 @@ +using Lucene.Net.Benchmarks.ByTask.Utils; +using Lucene.Net.Support; +using Lucene.Net.Util; +using System; +using System.IO; +using System.Text; + +namespace Lucene.Net.Benchmarks.ByTask +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Run the benchmark algorithm. + /// </summary> + /// <remarks> + /// <list type="number"> + /// <item><description>Read algorithm.</description></item> + /// <item><description>Run the algorithm.</description></item> + /// </list> + /// <para/> + /// Things to be added/fixed in "Benchmarking by tasks": + /// <list type="number"> + /// <item><description>TODO - report into Excel and/or graphed view.</description></item> + /// <item><description>TODO - perf comparison between Lucene releases over the years.</description></item> + /// <item><description>TODO - perf report adequate to include in Lucene nightly build site? 
(so we can easily track performance changes.)</description></item> + /// <item><description>TODO - add overall time control for repeated execution (vs. current by-count only).</description></item> + /// <item><description>TODO - query maker that is based on index statistics.</description></item> + /// </list> + /// </remarks> + public class Benchmark + { + private PerfRunData runData; + private Algorithm algorithm; + private bool executed; + + public Benchmark(TextReader algReader) + { + // prepare run data + try + { + runData = new PerfRunData(new Config(algReader)); + } + catch (Exception e) + { + //e.printStackTrace(); + throw new Exception("Error: cannot init PerfRunData!", e); + } + + // parse algorithm + try + { + algorithm = new Algorithm(runData); + } + catch (Exception e) + { + throw new Exception("Error: cannot understand algorithm!", e); + } + } + + /// <summary> + /// Execute this benchmark. + /// </summary> + public virtual void Execute() + { + lock (this) + { + if (executed) + { + throw new InvalidOperationException("Benchmark was already executed"); + } + executed = true; + runData.SetStartTimeMillis(); + algorithm.Execute(); + } + } + + /// <summary> + /// Run the benchmark algorithm. + /// </summary> + /// <param name="args">Benchmark config and algorithm files.</param> + public static void Main(string[] args) + { + Exec(args); + } + + /// <summary> + /// Utility: execute benchmark from command line. 
+ /// </summary> + /// <param name="args">Single argument is expected: algorithm-file.</param> + public static void Exec(string[] args) + { + // verify command line args + if (args.Length < 1) + { + SystemConsole.WriteLine("Usage: java Benchmark <algorithm file>"); + Environment.Exit(1); + } + + // verify input files + FileInfo algFile = new FileInfo(args[0]); + if (!algFile.Exists /*|| !algFile.isFile() ||!algFile.canRead()*/ ) + { + SystemConsole.WriteLine("cannot find/read algorithm file: " + algFile.FullName); + Environment.Exit(1); + } + + SystemConsole.WriteLine("Running algorithm from: " + algFile.FullName); + + Benchmark benchmark = null; + try + { + benchmark = new Benchmark(IOUtils.GetDecodingReader(algFile, Encoding.UTF8)); + } + catch (Exception e) + { + SystemConsole.WriteLine(e.ToString()); + Environment.Exit(1); + } + + SystemConsole.WriteLine("------------> algorithm:"); + SystemConsole.WriteLine(benchmark.Algorithm.ToString()); + + // execute + try + { + benchmark.Execute(); + } + catch (Exception e) + { + SystemConsole.WriteLine("Error: cannot execute the algorithm! " + e.Message); + SystemConsole.WriteLine(e.StackTrace); + } + + SystemConsole.WriteLine("####################"); + SystemConsole.WriteLine("### D O N E !!! ###"); + SystemConsole.WriteLine("####################"); + } + + /// <summary> + /// Returns the algorithm. + /// </summary> + public virtual Algorithm Algorithm + { + get { return algorithm; } + } + + /// <summary> + /// Returns the runData. 
+ /// </summary> + public virtual PerfRunData RunData + { + get { return runData; } + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/AbstractQueryMaker.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/AbstractQueryMaker.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/AbstractQueryMaker.cs new file mode 100644 index 0000000..fb6a2bf --- /dev/null +++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/AbstractQueryMaker.cs @@ -0,0 +1,85 @@ +using Lucene.Net.Benchmarks.ByTask.Utils; +using Lucene.Net.Search; +using System; +using System.Text; + +namespace Lucene.Net.Benchmarks.ByTask.Feeds +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Abstract base query maker. + /// Each query maker should just implement the <see cref="PrepareQueries()"/> method. 
+ /// </summary> + public abstract class AbstractQueryMaker : IQueryMaker + { + protected int m_qnum = 0; + protected Query[] m_queries; + protected Config m_config; + + public virtual void ResetInputs() + { + m_qnum = 0; + } + + protected abstract Query[] PrepareQueries(); + + public virtual void SetConfig(Config config) + { + this.m_config = config; + m_queries = PrepareQueries(); + } + + public virtual string PrintQueries() + { + string newline = Environment.NewLine; + StringBuilder sb = new StringBuilder(); + if (m_queries != null) + { + for (int i = 0; i < m_queries.Length; i++) + { + sb.Append(i + ". " + m_queries[i].GetType().Name + " - " + m_queries[i].ToString()); + sb.Append(newline); + } + } + return sb.ToString(); + } + + public virtual Query MakeQuery() + { + return m_queries[NextQnum()]; + } + + // return next qnum + protected virtual int NextQnum() + { + lock (this) + { + int res = m_qnum; + m_qnum = (m_qnum + 1) % m_queries.Length; + return res; + } + } + + /// <seealso cref="IQueryMaker.MakeQuery(int)"/> + public virtual Query MakeQuery(int size) + { + throw new Exception(this + ".MakeQuery(int size) is not supported!"); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/ContentItemsSource.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/ContentItemsSource.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/ContentItemsSource.cs new file mode 100644 index 0000000..c0f06ef --- /dev/null +++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/ContentItemsSource.cs @@ -0,0 +1,227 @@ +using Lucene.Net.Benchmarks.ByTask.Utils; +using Lucene.Net.Support; +using System; +using System.Collections.Generic; +using System.IO; +using System.Reflection; +using System.Text; + +namespace Lucene.Net.Benchmarks.ByTask.Feeds +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Base class for source of data for benchmarking. + /// </summary> + /// <remarks> + /// Keeps track of various statistics, such as how many data items were generated, + /// size in bytes etc. + /// <para/> + /// Supports the following configuration parameters: + /// <list type="bullet"> + /// <item><term>content.source.forever</term><description>specifies whether to generate items forever (<b>default=true</b>).</description></item> + /// <item><term>content.source.verbose</term><description>specifies whether messages should be output by the content source (<b>default=false</b>).</description></item> + /// <item><term>content.source.encoding</term><description> + /// specifies which encoding to use when + /// reading the files of that content source. Certain implementations may define + /// a default value if this parameter is not specified. (<b>default=null</b>). + /// </description></item> + /// <item><term>content.source.log.step</term><description> + /// specifies for how many items a + /// message should be logged. If set to 0 it means no logging should occur. + /// <b>NOTE:</b> if verbose is set to false, logging should not occur even if + /// logStep is not 0 (<b>default=0</b>). 
+ /// </description></item> + /// </list> + /// </remarks> + public abstract class ContentItemsSource : IDisposable + { + private long bytesCount; + private long totalBytesCount; + private int itemCount; + private int totalItemCount; + private Config config; + + private int lastPrintedNumUniqueTexts = 0; + private long lastPrintedNumUniqueBytes = 0; + private int printNum = 0; + + protected bool m_forever; + protected int m_logStep; + protected bool m_verbose; + protected Encoding m_encoding; + + /// <summary>update count of bytes generated by this source</summary> + protected void AddBytes(long numBytes) + { + lock (this) + { + bytesCount += numBytes; + totalBytesCount += numBytes; + } + } + + /// <summary>update count of items generated by this source</summary> + protected void AddItem() + { + lock (this) + { + ++itemCount; + ++totalItemCount; + } + } + + /// <summary> + /// A convenience method for collecting all the files of a content source from + /// a given directory. The collected <see cref="FileInfo"/> instances are stored in the + /// given <paramref name="files"/>. + /// </summary> + protected void CollectFiles(DirectoryInfo dir, IList<FileInfo> files) + { + CollectFilesImpl(dir, files); + files.Sort(new FileNameComparer()); + } + + private void CollectFilesImpl(DirectoryInfo dir, IList<FileInfo> files) + { + foreach (var sub in dir.EnumerateDirectories()) + { + CollectFilesImpl(sub, files); + } + + files.AddRange(dir.GetFiles()); + } + + private class FileNameComparer : IComparer<FileInfo> + { + public int Compare(FileInfo x, FileInfo y) + { + return x.FullName.CompareToOrdinal(y.FullName); + } + } + + /// <summary> + /// Returns <c>true</c> whether it's time to log a message (depending on verbose and + /// the number of items generated). 
+ /// </summary> + /// <returns></returns> + protected bool ShouldLog() + { + return m_verbose && m_logStep > 0 && itemCount % m_logStep == 0; + } + + /// <summary>Called when reading from this content source is no longer required.</summary> + public void Dispose() + { + Dispose(true); + GC.SuppressFinalize(this); + } + + /// <summary>Called when reading from this content source is no longer required.</summary> + protected abstract void Dispose(bool disposing); + + + /// <summary>Returns the number of bytes generated since last reset.</summary> + public long BytesCount { get { return bytesCount; } } + + /// <summary>Returns the number of generated items since last reset.</summary> + public int ItemsCount { get { return itemCount; } } + + public Config Config { get { return config; } } + + /// <summary>Returns the total number of bytes that were generated by this source.</summary> + public long TotalBytesCount { get { return totalBytesCount; } } + + /// <summary>Returns the total number of generated items.</summary> + public int TotalItemsCount { get { return totalItemCount; } } + + /// <summary> + /// Resets the input for this content source, so that the test would behave as + /// if it was just started, input-wise. + /// <para/> + /// <b>NOTE:</b> the default implementation resets the number of bytes and + /// items generated since the last reset, so it's important to call + /// <c>base.ResetInputs()</c> in case you override this method. + /// </summary> + public virtual void ResetInputs() + { + bytesCount = 0; + itemCount = 0; + } + + /// <summary> + /// Sets the <see cref="Utils.Config"/> for this content source. If you override this + /// method, you must call <c>base.SetConfig(config)</c>. 
+ /// </summary> + /// <param name="config"></param> + public virtual void SetConfig(Config config) + { + this.config = config; + m_forever = config.Get("content.source.forever", true); + m_logStep = config.Get("content.source.log.step", 0); + m_verbose = config.Get("content.source.verbose", false); + string encodingStr = config.Get("content.source.encoding", null); + if (!string.IsNullOrWhiteSpace(encodingStr)) + { + m_encoding = Encoding.GetEncoding(encodingStr); + } + else + { + m_encoding = Encoding.GetEncoding(0); // Default system encoding + } + } + + public virtual void PrintStatistics(string itemsName) + { + if (!m_verbose) + { + return; + } + bool print = false; + string col = " "; + StringBuilder sb = new StringBuilder(); + string newline = Environment.NewLine; + sb.Append("------------> ").Append(GetType().GetTypeInfo().Name).Append(" statistics (").Append(printNum).Append("): ").Append(newline); + int nut = TotalItemsCount; + if (nut > lastPrintedNumUniqueTexts) + { + print = true; + sb.Append("total count of " + itemsName + ": ").Append(Formatter.Format(0, nut, col)).Append(newline); + lastPrintedNumUniqueTexts = nut; + } + long nub = TotalBytesCount; + if (nub > lastPrintedNumUniqueBytes) + { + print = true; + sb.Append("total bytes of " + itemsName + ": ").Append(Formatter.Format(0, nub, col)).Append(newline); + lastPrintedNumUniqueBytes = nub; + } + if (ItemsCount > 0) + { + print = true; + sb.Append("num " + itemsName + " added since last inputs reset: ").Append(Formatter.Format(0, ItemsCount, col)).Append(newline); + sb.Append("total bytes added for " + itemsName + " since last inputs reset: ").Append(Formatter.Format(0, BytesCount, col)).Append(newline); + } + if (print) + { + SystemConsole.WriteLine(sb.Append(newline).ToString()); + printNum++; + } + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/ContentSource.cs ---------------------------------------------------------------------- 
diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/ContentSource.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/ContentSource.cs new file mode 100644 index 0000000..a3c39cb --- /dev/null +++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/ContentSource.cs @@ -0,0 +1,38 @@ +namespace Lucene.Net.Benchmarks.ByTask.Feeds +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Represents content from a specified source, such as TREC, Reuters etc. A + /// <see cref="ContentSource"/> is responsible for creating <see cref="DocData"/> objects for + /// its documents to be consumed by <see cref="DocMaker"/>. It also keeps track + /// of various statistics, such as how many documents were generated, size in + /// bytes etc. + /// <para/> + /// For supported configuration parameters see <see cref="ContentItemsSource"/>. + /// </summary> + public abstract class ContentSource : ContentItemsSource + { + /// <summary> + /// Returns the next <see cref="DocData"/> from the content source. + /// Implementations must account for multi-threading, as multiple threads + /// can call this method simultaneously. 
+ /// </summary> + public abstract DocData GetNextDocData(DocData docData); + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/DemoHTMLParser.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/DemoHTMLParser.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/DemoHTMLParser.cs new file mode 100644 index 0000000..0903754 --- /dev/null +++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/DemoHTMLParser.cs @@ -0,0 +1,259 @@ +// LUCENENET TODO: Use HTML Agility pack instead of SAX ? + +using Lucene.Net.Support; +using Sax.Net; +using Sax.Net.Helpers; +using System; +using System.Collections.Generic; +using System.IO; +using System.Text; + +namespace Lucene.Net.Benchmarks.ByTask.Feeds +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Simple HTML Parser extracting title, meta tags, and body text + /// that is based on <a href="http://nekohtml.sourceforge.net/">NekoHTML</a>. 
// LUCENENET TODO: Use HTML Agility pack instead of SAX ?

using Lucene.Net.Support;
using Sax.Net;
using Sax.Net.Helpers;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;

namespace Lucene.Net.Benchmarks.ByTask.Feeds
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Simple HTML parser extracting title, meta tags, and body text.
    /// The Java original was based on <a href="http://nekohtml.sourceforge.net/">NekoHTML</a>;
    /// this port is backed by a SAX parser (the TagSoup .NET port).
    /// </summary>
    public class DemoHTMLParser : IHTMLParser
    {
        /// <summary>The actual parser to read HTML documents.</summary>
        public sealed class Parser
        {
            private readonly IDictionary<string, string> metaTags = new Dictionary<string, string>();
            private readonly string title, body;

            // LUCENENET specific - expose field through property
            public IDictionary<string, string> MetaTags
            {
                get { return metaTags; }
            }

            // LUCENENET specific - expose field through property
            public string Title
            {
                get { return title; }
            }

            // LUCENENET specific - expose field through property
            public string Body
            {
                get { return body; }
            }

            public Parser(TextReader reader)
                : this(new InputSource(reader))
            {
            }

            public Parser(InputSource source)
            {
                TagSoup.Net.Parser saxParser = new TagSoup.Net.Parser();
                saxParser.SetFeature(TagSoup.Net.Parser.NAMESPACES_FEATURE, true);

                StringBuilder titleBuf = new StringBuilder(), bodyBuf = new StringBuilder();
                DefaultHandler handler = new HtmlSaxHandler(this, titleBuf, bodyBuf);

                saxParser.ContentHandler = handler;
                saxParser.ErrorHandler = handler;
                saxParser.Parse(source);

                // the javacc-based parser trimmed title (which should be done for HTML in all cases):
                this.title = titleBuf.ToString().Trim();

                // assign body text
                this.body = bodyBuf.ToString();
            }

            /// <summary>
            /// SAX content handler that tracks where in the document we are
            /// (head/title/body/suppressed element) via nesting counters and
            /// accumulates title and body text.
            /// </summary>
            private class HtmlSaxHandler : DefaultHandler
            {
                // nesting depths; > 0 means "currently inside that element"
                private int inBODY = 0, inHEAD = 0, inTITLE = 0, suppressed = 0;

                private readonly Parser outerInstance;
                private readonly StringBuilder title;
                private readonly StringBuilder body;

                public HtmlSaxHandler(Parser outerInstance, StringBuilder title, StringBuilder body)
                {
                    this.outerInstance = outerInstance;
                    this.title = title;
                    this.body = body;
                }

                public override void StartElement(string uri, string localName, string qName, IAttributes atts)
                {
                    if (inHEAD > 0)
                    {
                        if ("title".Equals(localName, StringComparison.OrdinalIgnoreCase))
                        {
                            inTITLE++;
                        }
                        else if ("meta".Equals(localName, StringComparison.OrdinalIgnoreCase))
                        {
                            // prefer "name", fall back to "http-equiv"; record lowercase key
                            string name = atts.GetValue("name");
                            if (name == null)
                            {
                                name = atts.GetValue("http-equiv");
                            }
                            string val = atts.GetValue("content");
                            if (name != null && val != null)
                            {
                                outerInstance.metaTags[name.ToLowerInvariant()] = val;
                            }
                        }
                    }
                    else if (inBODY > 0)
                    {
                        if (SUPPRESS_ELEMENTS.Contains(localName))
                        {
                            suppressed++;
                        }
                        else if ("img".Equals(localName, StringComparison.OrdinalIgnoreCase))
                        {
                            // the original javacc-based parser preserved <IMG alt="..."/>
                            // attribute as body text in [] parenthesis:
                            string alt = atts.GetValue("alt");
                            if (alt != null)
                            {
                                body.Append('[').Append(alt).Append(']');
                            }
                        }
                    }
                    else if ("body".Equals(localName, StringComparison.OrdinalIgnoreCase))
                    {
                        inBODY++;
                    }
                    else if ("head".Equals(localName, StringComparison.OrdinalIgnoreCase))
                    {
                        inHEAD++;
                    }
                    else if ("frameset".Equals(localName, StringComparison.OrdinalIgnoreCase))
                    {
                        throw new SAXException("This parser does not support HTML framesets.");
                    }
                }

                public override void EndElement(string uri, string localName, string qName)
                {
                    if (inBODY > 0)
                    {
                        if ("body".Equals(localName, StringComparison.OrdinalIgnoreCase))
                        {
                            inBODY--;
                        }
                        else if (ENDLINE_ELEMENTS.Contains(localName))
                        {
                            // closing a block element ends a "line" of body text
                            body.Append('\n');
                        }
                        else if (SUPPRESS_ELEMENTS.Contains(localName))
                        {
                            suppressed--;
                        }
                    }
                    else if (inHEAD > 0)
                    {
                        if ("head".Equals(localName, StringComparison.OrdinalIgnoreCase))
                        {
                            inHEAD--;
                        }
                        else if (inTITLE > 0 && "title".Equals(localName, StringComparison.OrdinalIgnoreCase))
                        {
                            inTITLE--;
                        }
                    }
                }

                public override void Characters(char[] ch, int start, int length)
                {
                    if (inBODY > 0 && suppressed == 0)
                    {
                        body.Append(ch, start, length);
                    }
                    else if (inTITLE > 0)
                    {
                        title.Append(ch, start, length);
                    }
                }

                public override InputSource ResolveEntity(string publicId, string systemId)
                {
                    // disable network access caused by DTDs
                    return new InputSource(new StringReader(""));
                }
            }

            private static ISet<string> CreateElementNameSet(params string[] names)
            {
                return Collections.UnmodifiableSet(new HashSet<string>(names));
            }

            /// <summary>HTML elements that cause a line break (they are block-elements).</summary>
            internal static readonly ISet<string> ENDLINE_ELEMENTS = CreateElementNameSet(
                "p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl",
                "pre", "hr", "blockquote", "address", "fieldset", "table", "form",
                "noscript", "li", "dt", "dd", "noframes", "br", "tr", "select", "option"
            );

            /// <summary>HTML elements with contents that are ignored.</summary>
            internal static readonly ISet<string> SUPPRESS_ELEMENTS = CreateElementNameSet(
                "style", "script"
            );
        }

        /// <summary>
        /// Parses an HTML document from <paramref name="reader"/> into <paramref name="docData"/>.
        /// </summary>
        /// <exception cref="IOException">Wraps any <see cref="SAXException"/> thrown while parsing.</exception>
        public virtual DocData Parse(DocData docData, string name, DateTime? date, TextReader reader, TrecContentSource trecSrc)
        {
            try
            {
                return Parse(docData, name, date, new InputSource(reader), trecSrc);
            }
            catch (SAXException saxe)
            {
                throw new IOException("SAX exception occurred while parsing HTML document.", saxe);
            }
        }

        /// <summary>
        /// Parses an HTML document from <paramref name="source"/> into <paramref name="docData"/>.
        /// A "date" meta tag, when present and parseable by <paramref name="trecSrc"/>,
        /// overrides the supplied <paramref name="date"/>.
        /// </summary>
        public virtual DocData Parse(DocData docData, string name, DateTime? date, InputSource source, TrecContentSource trecSrc)
        {
            Parser p = new Parser(source);

            // properties
            IDictionary<string, string> props = p.MetaTags;
            string dateStr;
            if (props.TryGetValue("date", out dateStr) && dateStr != null)
            {
                DateTime? newDate = trecSrc.ParseDate(dateStr);
                if (newDate != null)
                {
                    date = newDate;
                }
            }

            docData.Clear();
            docData.Name = name;
            docData.Body = p.Body;
            docData.Title = p.Title;
            docData.Props = props;
            docData.SetDate(date);
            return docData;
        }
    }
}
using Lucene.Net.Benchmarks.ByTask.Utils;
using Lucene.Net.Support;
using System;
using System.Collections;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Text;

// LUCENENET TODO: This had to be refactored significantly. We need tests to confirm it works.

namespace Lucene.Net.Benchmarks.ByTask.Feeds
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// A <see cref="ContentSource"/> using the Dir collection for its input. Supports
    /// the following configuration parameters (on top of <see cref="ContentSource"/>):
    /// <list type="bullet">
    /// <item><term>work.dir</term><description>specifies the working directory. Required if "docs.dir" denotes a relative path (<b>default=work</b>).</description></item>
    /// <item><term>docs.dir</term><description>specifies the directory of the Dir collection. Can be set to a relative path if "work.dir" is also specified (<b>default=dir-out</b>).</description></item>
    /// </list>
    /// </summary>
    public class DirContentSource : ContentSource
    {
        /// <summary>
        /// Iterator over the *.txt files in the directory tree, depth-first.
        /// </summary>
        public class Iterator : IEnumerator<FileInfo>
        {
            /// <summary>
            /// Compares file names after left-padding the shorter one with '0'
            /// so that, e.g., "9.txt" orders before "10.txt". The result is
            /// reversed because entries are subsequently pushed onto a stack,
            /// which reverses the order again when popped.
            /// </summary>
            private class Comparer : IComparer<FileInfo>
            {
                public int Compare(FileInfo a, FileInfo b)
                {
                    string a2 = a.ToString();
                    string b2 = b.ToString();
                    int diff = a2.Length - b2.Length;

                    if (diff > 0)
                    {
                        while (diff-- > 0)
                        {
                            b2 = "0" + b2;
                        }
                    }
                    else if (diff < 0)
                    {
                        diff = -diff;
                        while (diff-- > 0)
                        {
                            a2 = "0" + a2;
                        }
                    }

                    /* note it's reversed because we're going to push,
                       which reverses again */
                    return b2.CompareToOrdinal(a2);
                }
            }

            internal int count = 0;

            internal Stack<FileInfo> stack = new Stack<FileInfo>();

            private Comparer c = new Comparer();

            private FileInfo current;

            public Iterator(DirectoryInfo f)
            {
                Push(f);
            }

            /// <summary>Recursively pushes all *.txt files under <paramref name="f"/>.</summary>
            internal void Push(DirectoryInfo f)
            {
                foreach (var dir in f.GetDirectories())
                {
                    Push(dir);
                }

                Push(f.GetFiles("*.txt"));
            }

            internal void Push(FileInfo[] files)
            {
                Array.Sort(files, c);
                for (int i = 0; i < files.Length; i++)
                {
                    stack.Push(files[i]);
                }
            }

            /// <summary>Number of files returned so far.</summary>
            public virtual int Count
            {
                get { return count; }
            }

            public virtual bool MoveNext()
            {
                if (stack.Count == 0)
                {
                    current = null;
                    return false;
                }
                count++;
                current = stack.Pop();
                return true;
            }

            public virtual FileInfo Current
            {
                get { return current; }
            }

            object IEnumerator.Current
            {
                get { return current; }
            }

            public void Dispose()
            {
                Dispose(true);
                GC.SuppressFinalize(this);
            }

            protected virtual void Dispose(bool disposing)
            {
                // nothing to release
            }

            public virtual void Reset()
            {
                // intentionally a no-op; the enumerator is re-created instead
            }
        }

        private DirectoryInfo dataDir = null;
        private int iteration = 0;
        private Iterator inputFiles = null;

        /// <summary>
        /// Parses the date line of a Dir collection file. Tries the collection's
        /// exact format first, then falls back to a general invariant-culture parse.
        /// Returns <c>null</c> if the string cannot be parsed.
        /// </summary>
        private DateTime? ParseDate(string dateStr)
        {
            DateTime temp;
            // LUCENENET fix: "HH" (24-hour clock) instead of "hh" - "hh" without an
            // AM/PM designator rejects any time of day at or after 13:00.
            if (DateTime.TryParseExact(dateStr, "dd-MMM-yyyy HH:mm:ss.fff", CultureInfo.InvariantCulture, DateTimeStyles.None, out temp))
            {
                return temp;
            }
            else if (DateTime.TryParse(dateStr, CultureInfo.InvariantCulture, DateTimeStyles.None, out temp))
            {
                return temp;
            }

            return null;
        }

        protected override void Dispose(bool disposing)
        {
            if (disposing)
            {
                inputFiles = null;
            }
        }

        /// <summary>
        /// Returns the next document from the directory. First line of each file is
        /// the date, the third is the title, and the rest is body text. Thread-safe:
        /// file selection is synchronized; reading happens outside the lock.
        /// </summary>
        /// <exception cref="NoMoreDataException">All files consumed and "content.source.forever" is false.</exception>
        public override DocData GetNextDocData(DocData docData)
        {
            FileInfo f = null;
            string name = null;
            lock (this)
            {
                if (!inputFiles.MoveNext())
                {
                    // exhausted files, start a new round, unless forever set to false.
                    if (!m_forever)
                    {
                        throw new NoMoreDataException();
                    }
                    inputFiles = new Iterator(dataDir);
                    iteration++;
                }
                f = inputFiles.Current;
                name = f.FullName + "_" + iteration;
            }

            string line = null;
            string dateStr;
            string title;
            StringBuilder bodyBuf = new StringBuilder(1024);

            using (TextReader reader = new StreamReader(new FileStream(f.FullName, FileMode.Open, FileAccess.Read), Encoding.UTF8))
            {
                // First line is the date, 3rd is the title, rest is body
                dateStr = reader.ReadLine();
                reader.ReadLine(); // skip an empty line
                title = reader.ReadLine();
                reader.ReadLine(); // skip an empty line
                while ((line = reader.ReadLine()) != null)
                {
                    bodyBuf.Append(line).Append(' ');
                }
            }
            AddBytes(f.Length);

            DateTime? date = ParseDate(dateStr);

            docData.Clear();
            docData.Name = name;
            docData.Body = bodyBuf.ToString();
            docData.Title = title;
            docData.SetDate(date);
            return docData;
        }

        public override void ResetInputs()
        {
            lock (this)
            {
                base.ResetInputs();
                inputFiles = new Iterator(dataDir);
                iteration = 0;
            }
        }

        public override void SetConfig(Config config)
        {
            base.SetConfig(config);

            DirectoryInfo workDir = new DirectoryInfo(config.Get("work.dir", "work"));
            string d = config.Get("docs.dir", "dir-out");
            dataDir = new DirectoryInfo(d);

            inputFiles = new Iterator(dataDir);

            // LUCENENET fix: removed the dead "inputFiles == null" check that
            // immediately followed the assignment above (it could never fire;
            // the same dead check existed in the Java original). An empty or
            // missing dataDir surfaces when GetNextDocData is first called.
        }
    }
}
using Lucene.Net.Documents;
using System;
using System.Collections.Generic;

namespace Lucene.Net.Benchmarks.ByTask.Feeds
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Output of parsing (e.g. HTML parsing) of an input document.
    /// Holds the document's name, title, body, optional date string,
    /// ID, and any additional properties gathered by the parser.
    /// </summary>
    public class DocData
    {
        private string date;

        public string Name { get; set; }
        public string Body { get; set; }
        public string Title { get; set; }
        public int ID { get; set; }
        public IDictionary<string, string> Props { get; set; }

        /// <summary>
        /// Resets all fields to their "empty" state so the instance can be reused
        /// (ID becomes -1, everything else null).
        /// </summary>
        public void Clear()
        {
            ID = -1;
            Name = null;
            Title = null;
            Body = null;
            Props = null;
            date = null;
        }

        /// <summary>
        /// Gets the date. If <see cref="SetDate(DateTime?)"/> was called, the string
        /// returned is the output of <see cref="DateTools.DateToString(DateTime, DateTools.Resolution)"/>.
        /// Otherwise it is the string passed to <see cref="SetDate(string)"/>.
        /// </summary>
        public virtual string Date
        {
            get { return date; }
        }

        /// <summary>
        /// Sets the date from a <see cref="DateTime"/>, storing it at second resolution;
        /// a null value clears the date.
        /// </summary>
        public virtual void SetDate(DateTime? date)
        {
            this.date = date.HasValue
                ? DateTools.DateToString(date.Value, DateTools.Resolution.SECOND)
                : null;
        }

        /// <summary>Sets the date from an already-formatted string.</summary>
        public virtual void SetDate(string date)
        {
            this.date = date;
        }
    }
}
using Lucene.Net.Benchmarks.ByTask.Utils;
using Lucene.Net.Documents;
using Lucene.Net.Support;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Text;
using System.Threading;

namespace Lucene.Net.Benchmarks.ByTask.Feeds
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Creates <see cref="Document"/> objects. Uses a <see cref="ContentSource"/> to generate
    /// <see cref="DocData"/> objects.
    /// </summary>
    /// <remarks>
    /// Supports the following parameters:
    /// <list type="bullet">
    /// <item><term>content.source</term><description>specifies the <see cref="ContentSource"/> class to use (default <b>SingleDocSource</b>).</description></item>
    /// <item><term>doc.stored</term><description>specifies whether fields should be stored (default <b>false</b>).</description></item>
    /// <item><term>doc.body.stored</term><description>specifies whether the body field should be stored (default = <b>doc.stored</b>).</description></item>
    /// <item><term>doc.tokenized</term><description>specifies whether fields should be tokenized (default <b>true</b>).</description></item>
    /// <item><term>doc.body.tokenized</term><description>specifies whether the body field should be tokenized (default = <b>doc.tokenized</b>).</description></item>
    /// <item><term>doc.tokenized.norms</term><description>specifies whether norms should be stored in the index or not. (default <b>false</b>).</description></item>
    /// <item><term>doc.body.tokenized.norms</term><description>
    /// specifies whether norms should be stored in the index for the body field.
    /// This can be set to true, while <c>doc.tokenized.norms</c> is set to false, to allow norms storing just
    /// for the body field. (default <b>true</b>).
    /// </description></item>
    /// <item><term>doc.term.vector</term><description>specifies whether term vectors should be stored for fields (default <b>false</b>).</description></item>
    /// <item><term>doc.term.vector.positions</term><description>specifies whether term vectors should be stored with positions (default <b>false</b>).</description></item>
    /// <item><term>doc.term.vector.offsets</term><description>specifies whether term vectors should be stored with offsets (default <b>false</b>).</description></item>
    /// <item><term>doc.store.body.bytes</term><description>specifies whether to store the raw bytes of the document's content in the document (default <b>false</b>).</description></item>
    /// <item><term>doc.reuse.fields</term><description>specifies whether <see cref="Field"/> and <see cref="Document"/> objects should be reused (default <b>true</b>).</description></item>
    /// <item><term>doc.index.props</term><description>specifies whether the properties returned by <see cref="DocData.Props"/> will be indexed (default <b>false</b>).</description></item>
    /// <item><term>doc.random.id.limit</term><description>
    /// if specified, docs will be assigned random
    /// IDs from 0 to this limit. This is useful with UpdateDoc
    /// for testing performance of <see cref="Index.IndexWriter.UpdateDocument(Index.Term, IEnumerable{Index.IIndexableField})"/>.
    /// </description></item>
    /// </list>
    /// </remarks>
    public class DocMaker : IDisposable
    {
        /// <summary>Body text left over from a previous size-limited document, per thread.</summary>
        private class LeftOver
        {
            public DocData DocData { get; set; }
            public int Count { get; set; }
        }

        private Random r;
        private int updateDocIDLimit;

        /// <summary>
        /// Document state, supports reuse of field instances
        /// across documents (see <c>reuseFields</c> parameter).
        /// </summary>
        protected class DocState
        {
            private readonly IDictionary<string, Field> fields;
            private readonly IDictionary<string, Field> numericFields;
            private readonly bool reuseFields;
            internal readonly Document doc;
            internal DocData docData = new DocData();

            public DocState(bool reuseFields, FieldType ft, FieldType bodyFt)
            {
                this.reuseFields = reuseFields;

                if (reuseFields)
                {
                    fields = new Dictionary<string, Field>();
                    numericFields = new Dictionary<string, Field>();

                    // Initialize the map with the default fields.
                    fields[BODY_FIELD] = new Field(BODY_FIELD, "", bodyFt);
                    fields[TITLE_FIELD] = new Field(TITLE_FIELD, "", ft);
                    fields[DATE_FIELD] = new Field(DATE_FIELD, "", ft);
                    fields[ID_FIELD] = new StringField(ID_FIELD, "", Field.Store.YES);
                    fields[NAME_FIELD] = new Field(NAME_FIELD, "", ft);

                    numericFields[DATE_MSEC_FIELD] = new Int64Field(DATE_MSEC_FIELD, 0L, Field.Store.NO);
                    numericFields[TIME_SEC_FIELD] = new Int32Field(TIME_SEC_FIELD, 0, Field.Store.NO);

                    doc = new Document();
                }
                else
                {
                    numericFields = null;
                    fields = null;
                    doc = null;
                }
            }

            /// <summary>
            /// Returns a field corresponding to the field name. If
            /// <c>reuseFields</c> was set to <c>true</c>, then it attempts to reuse a
            /// <see cref="Field"/> instance. If such a field does not exist, it creates a new one.
            /// </summary>
            internal Field GetField(string name, FieldType ft)
            {
                if (!reuseFields)
                {
                    return new Field(name, "", ft);
                }

                Field f;
                if (!fields.TryGetValue(name, out f) || f == null)
                {
                    f = new Field(name, "", ft);
                    fields[name] = f;
                }
                return f;
            }

            /// <summary>
            /// Returns a numeric field of the requested <paramref name="type"/>,
            /// reusing a cached instance when <c>reuseFields</c> is enabled.
            /// </summary>
            internal Field GetNumericField(string name, NumericType type)
            {
                Field f;
                if (reuseFields)
                {
                    numericFields.TryGetValue(name, out f);
                }
                else
                {
                    f = null;
                }

                if (f == null)
                {
                    switch (type)
                    {
                        case NumericType.INT32:
                            f = new Int32Field(name, 0, Field.Store.NO);
                            break;
                        case NumericType.INT64:
                            f = new Int64Field(name, 0L, Field.Store.NO);
                            break;
                        case NumericType.SINGLE:
                            f = new SingleField(name, 0.0F, Field.Store.NO);
                            break;
                        case NumericType.DOUBLE:
                            f = new DoubleField(name, 0.0, Field.Store.NO);
                            break;
                        default:
                            throw new InvalidOperationException("Cannot get here");
                    }
                    if (reuseFields)
                    {
                        numericFields[name] = f;
                    }
                }
                return f;
            }
        }

        private bool storeBytes = false;

        // LUCENENET specific: DateUtil not used

        // leftovers are thread local, because it is unsafe to share residues between threads
        private ThreadLocal<LeftOver> leftovr = new ThreadLocal<LeftOver>();
        private ThreadLocal<DocState> docState = new ThreadLocal<DocState>();

        public static readonly string BODY_FIELD = "body";
        public static readonly string TITLE_FIELD = "doctitle";
        public static readonly string DATE_FIELD = "docdate";
        public static readonly string DATE_MSEC_FIELD = "docdatenum";
        public static readonly string TIME_SEC_FIELD = "doctimesecnum";
        public static readonly string ID_FIELD = "docid";
        public static readonly string BYTES_FIELD = "bytes";
        public static readonly string NAME_FIELD = "docname";

        protected Config m_config;

        protected FieldType m_valType;
        protected FieldType m_bodyValType;

        protected ContentSource m_source;
        protected bool m_reuseFields;
        protected bool m_indexProperties;

        private readonly AtomicInt32 numDocsCreated = new AtomicInt32();

        public DocMaker()
        {
        }

        // create a doc
        // use only part of the body, modify it to keep the rest (or use all if size==0).
        // reset the docdata properties so they are not added more than once.
        private Document CreateDocument(DocData docData, int size, int cnt)
        {
            DocState ds = GetDocState();
            Document doc = m_reuseFields ? ds.doc : new Document();
            doc.Fields.Clear();

            // Set ID_FIELD
            FieldType ft = new FieldType(m_valType);
            ft.IsIndexed = true;

            Field idField = ds.GetField(ID_FIELD, ft);
            int id;
            if (r != null)
            {
                id = r.Next(updateDocIDLimit);
            }
            else
            {
                id = docData.ID;
                if (id == -1)
                {
                    id = numDocsCreated.GetAndIncrement();
                }
            }
            idField.SetStringValue(Convert.ToString(id, CultureInfo.InvariantCulture));
            doc.Add(idField);

            // Set NAME_FIELD
            string name = docData.Name;
            if (name == null) name = "";
            name = cnt < 0 ? name : name + "_" + cnt;
            Field nameField = ds.GetField(NAME_FIELD, m_valType);
            nameField.SetStringValue(name);
            doc.Add(nameField);

            // Set DATE_FIELD
            DateTime? date = null;
            string dateString = docData.Date;
            if (dateString != null)
            {
                // LUCENENET: TryParseExact needs a non-nullable DateTime to work.
                DateTime temp;
                if (DateTime.TryParseExact(dateString, new string[] {
                    // Original format from Java
                    "dd-MMM-yyyy HH:mm:ss",
                    // Actual format from the test files...
                    "yyyyMMddHHmmss"
                }, CultureInfo.InvariantCulture, DateTimeStyles.None, out temp))
                {
                    date = temp;
                }
                // LUCENENET: Hail Mary in case the formats above are not adequate
                else if (DateTime.TryParse(dateString, CultureInfo.InvariantCulture, DateTimeStyles.None, out temp))
                {
                    date = temp;
                }
            }
            else
            {
                dateString = "";
            }
            Field dateStringField = ds.GetField(DATE_FIELD, m_valType);
            dateStringField.SetStringValue(dateString);
            doc.Add(dateStringField);

            if (date == null)
            {
                // just set to right now
                // NOTE(review): DateTime.Now is local time; the Java original used
                // System.currentTimeMillis() (UTC-based) - confirm this is intended.
                date = DateTime.Now;
            }

            Field dateField = ds.GetNumericField(DATE_MSEC_FIELD, NumericType.INT64);
            // NOTE(review): stores Ticks (100-ns units since 0001-01-01), not
            // milliseconds as the field name and the Java original (Date.getTime())
            // suggest - confirm consumers of DATE_MSEC_FIELD expect Ticks.
            dateField.SetInt64Value(date.Value.Ticks);
            doc.Add(dateField);

            // seconds since midnight (UTC) for the TIME_SEC_FIELD
            int sec = Convert.ToInt32(date.Value.ToUniversalTime().TimeOfDay.TotalSeconds);

            Field timeSecField = ds.GetNumericField(TIME_SEC_FIELD, NumericType.INT32);
            timeSecField.SetInt32Value(sec);
            doc.Add(timeSecField);

            // Set TITLE_FIELD
            string title = docData.Title;
            Field titleField = ds.GetField(TITLE_FIELD, m_valType);
            titleField.SetStringValue(title == null ? "" : title);
            doc.Add(titleField);

            string body = docData.Body;
            if (body != null && body.Length > 0)
            {
                string bdy;
                if (size <= 0 || size >= body.Length)
                {
                    bdy = body;         // use all
                    docData.Body = "";  // nothing left
                }
                else
                {
                    // attempt not to break words - if whitespace found within next 20 chars...
                    for (int n = size - 1; n < size + 20 && n < body.Length; n++)
                    {
                        if (char.IsWhiteSpace(body[n]))
                        {
                            size = n;
                            break;
                        }
                    }
                    bdy = body.Substring(0, size);        // use part
                    docData.Body = body.Substring(size);  // some left
                }
                Field bodyField = ds.GetField(BODY_FIELD, m_bodyValType);
                bodyField.SetStringValue(bdy);
                doc.Add(bodyField);

                if (storeBytes)
                {
                    Field bytesField = ds.GetField(BYTES_FIELD, StringField.TYPE_STORED);
                    bytesField.SetBytesValue(Encoding.UTF8.GetBytes(bdy));
                    doc.Add(bytesField);
                }
            }

            if (m_indexProperties)
            {
                var props = docData.Props;
                if (props != null)
                {
                    foreach (var entry in props)
                    {
                        // entry is KeyValuePair<string, string>; no casts needed
                        Field f = ds.GetField(entry.Key, m_valType);
                        f.SetStringValue(entry.Value);
                        doc.Add(f);
                    }
                    docData.Props = null;
                }
            }

            return doc;
        }

        private void ResetLeftovers()
        {
            leftovr.Value = null;
        }

        /// <summary>Returns this thread's <see cref="DocState"/>, creating it on first use.</summary>
        protected virtual DocState GetDocState()
        {
            DocState ds = docState.Value;
            if (ds == null)
            {
                ds = new DocState(m_reuseFields, m_valType, m_bodyValType);
                docState.Value = ds;
            }
            return ds;
        }

        /// <summary>
        /// Closes the <see cref="DocMaker"/>.
        /// </summary>
        public void Dispose()
        {
            Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// Closes the <see cref="DocMaker"/>. The base implementation closes the
        /// <see cref="ContentSource"/>, and it can be overridden to do more work (but make
        /// sure to call <c>base.Dispose(bool)</c>).
        /// </summary>
        protected virtual void Dispose(bool disposing)
        {
            if (disposing)
            {
                m_source.Dispose();
                // LUCENENET fix: ThreadLocal<T> is IDisposable; release the
                // per-thread storage as well.
                leftovr.Dispose();
                docState.Dispose();
            }
        }

        /// <summary>
        /// Creates a <see cref="Document"/> object ready for indexing. This method uses the
        /// <see cref="ContentSource"/> to get the next document from the source, and creates
        /// a <see cref="Document"/> object from the returned fields. If
        /// <c>reuseFields</c> was set to <c>true</c>, it will reuse <see cref="Document"/>
        /// and <see cref="Field"/> instances.
        /// </summary>
        public virtual Document MakeDocument()
        {
            ResetLeftovers();
            DocData docData = m_source.GetNextDocData(GetDocState().docData);
            Document doc = CreateDocument(docData, 0, -1);
            return doc;
        }

        /// <summary>
        /// Same as <see cref="MakeDocument()"/>, only this method creates a document of the
        /// given size input by <paramref name="size"/>. Unused body text is saved per thread
        /// and consumed by subsequent calls.
        /// </summary>
        public virtual Document MakeDocument(int size)
        {
            LeftOver lvr = leftovr.Value;
            if (lvr == null || lvr.DocData == null || lvr.DocData.Body == null
                || lvr.DocData.Body.Length == 0)
            {
                ResetLeftovers();
            }
            DocData docData = GetDocState().docData;
            DocData dd = (lvr == null ? m_source.GetNextDocData(docData) : lvr.DocData);
            int cnt = (lvr == null ? 0 : lvr.Count);
            // accumulate body text from further documents until we have enough
            while (dd.Body == null || dd.Body.Length < size)
            {
                DocData dd2 = dd;
                dd = m_source.GetNextDocData(new DocData());
                cnt = 0;
                dd.Body = (dd2.Body + dd.Body);
            }
            Document doc = CreateDocument(dd, size, cnt);
            if (dd.Body == null || dd.Body.Length == 0)
            {
                ResetLeftovers();
            }
            else
            {
                if (lvr == null)
                {
                    lvr = new LeftOver();
                    leftovr.Value = lvr;
                }
                lvr.DocData = dd;
                lvr.Count = ++cnt;
            }
            return doc;
        }

        /// <summary>Reset inputs so that the test run would behave, input wise, as if it just started.</summary>
        public virtual void ResetInputs()
        {
            m_source.PrintStatistics("docs");
            // re-initiate since properties by round may have changed.
            SetConfig(m_config, m_source);
            m_source.ResetInputs();
            numDocsCreated.Set(0);
            ResetLeftovers();
        }

        /// <summary>Set the configuration parameters of this doc maker.</summary>
        public virtual void SetConfig(Config config, ContentSource source)
        {
            this.m_config = config;
            this.m_source = source;

            bool stored = config.Get("doc.stored", false);
            bool bodyStored = config.Get("doc.body.stored", stored);
            bool tokenized = config.Get("doc.tokenized", true);
            bool bodyTokenized = config.Get("doc.body.tokenized", tokenized);
            bool norms = config.Get("doc.tokenized.norms", false);
            bool bodyNorms = config.Get("doc.body.tokenized.norms", true);
            bool termVec = config.Get("doc.term.vector", false);
            bool termVecPositions = config.Get("doc.term.vector.positions", false);
            bool termVecOffsets = config.Get("doc.term.vector.offsets", false);

            m_valType = new FieldType(TextField.TYPE_NOT_STORED);
            m_valType.IsStored = stored;
            m_valType.IsTokenized = tokenized;
            m_valType.OmitNorms = !norms;
            m_valType.StoreTermVectors = termVec;
            m_valType.StoreTermVectorPositions = termVecPositions;
            m_valType.StoreTermVectorOffsets = termVecOffsets;
            m_valType.Freeze();

            m_bodyValType = new FieldType(TextField.TYPE_NOT_STORED);
            m_bodyValType.IsStored = bodyStored;
            m_bodyValType.IsTokenized = bodyTokenized;
            m_bodyValType.OmitNorms = !bodyNorms;
            m_bodyValType.StoreTermVectors = termVec;
            m_bodyValType.StoreTermVectorPositions = termVecPositions;
            m_bodyValType.StoreTermVectorOffsets = termVecOffsets;
            m_bodyValType.Freeze();

            storeBytes = config.Get("doc.store.body.bytes", false);

            m_reuseFields = config.Get("doc.reuse.fields", true);

            // In a multi-rounds run, it is important to reset DocState since settings
            // of fields may change between rounds, and this is the only way to reset
            // the cache of all threads.
            // LUCENENET fix: dispose the previous ThreadLocal before replacing it.
            if (docState != null)
            {
                docState.Dispose();
            }
            docState = new ThreadLocal<DocState>();

            m_indexProperties = config.Get("doc.index.props", false);

            updateDocIDLimit = config.Get("doc.random.id.limit", -1);
            if (updateDocIDLimit != -1)
            {
                r = new Random(179);
            }
        }
    }
}
