using Lucene.Net.Benchmarks.ByTask.Utils;
using System;
using System.Collections.Generic;
using System.Globalization;

namespace Lucene.Net.Benchmarks.ByTask.Feeds
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Adds fields appropriate for sorting: country, random_string and sort_field
    /// (int). Supports the following parameters:
    /// <list type="bullet">
    /// <item><term><b>sort.rng</b></term><description>defines the range for the sort-by-int field (default <b>20000</b>).</description></item>
    /// <item><term><b>rand.seed</b></term><description>defines the seed to initialize <see cref="Random"/> with (default <b>13</b>).</description></item>
    /// </list>
    /// </summary>
    public class SortableSingleDocSource : SingleDocSource
    {
        // Country names used as the pool for the random "country" field.
        // The data matches the upstream Java benchmark list verbatim (including the
        // odd "alestinian flag West Bank and Gaza" entry, kept for port parity).
        // LUCENENET: marked readonly (never reassigned) and repaired the
        // mojibake in "São Tomé and Príncipe".
        private static readonly string[] COUNTRIES = new string[] {
            "European Union", "United States", "Japan", "Germany", "China (PRC)",
            "United Kingdom", "France", "Italy", "Spain", "Canada", "Brazil", "Russia",
            "India", "South Korea", "Australia", "Mexico", "Netherlands", "Turkey",
            "Sweden", "Belgium", "Indonesia", "Switzerland", "Poland", "Norway",
            "Republic of China", "Saudi Arabia", "Austria", "Greece", "Denmark", "Iran",
            "South Africa", "Argentina", "Ireland", "Thailand", "Finland", "Venezuela",
            "Portugal", "Hong Kong", "United Arab Emirates", "Malaysia",
            "Czech Republic", "Colombia", "Nigeria", "Romania", "Chile", "Israel",
            "Singapore", "Philippines", "Pakistan", "Ukraine", "Hungary", "Algeria",
            "New Zealand", "Egypt", "Kuwait", "Peru", "Kazakhstan", "Slovakia",
            "Morocco", "Bangladesh", "Vietnam", "Qatar", "Angola", "Libya", "Iraq",
            "Croatia", "Luxembourg", "Sudan", "Slovenia", "Cuba", "Belarus", "Ecuador",
            "Serbia", "Oman", "Bulgaria", "Lithuania", "Syria", "Dominican Republic",
            "Tunisia", "Guatemala", "Azerbaijan", "Sri Lanka", "Kenya", "Latvia",
            "Turkmenistan", "Costa Rica", "Lebanon", "Uruguay", "Uzbekistan", "Yemen",
            "Cyprus", "Estonia", "Trinidad and Tobago", "Cameroon", "El Salvador",
            "Iceland", "Panama", "Bahrain", "Ivory Coast", "Ethiopia", "Tanzania",
            "Jordan", "Ghana", "Bosnia and Herzegovina", "Macau", "Burma", "Bolivia",
            "Brunei", "Botswana", "Honduras", "Gabon", "Uganda", "Jamaica", "Zambia",
            "Senegal", "Paraguay", "Albania", "Equatorial Guinea", "Georgia",
            "Democratic Republic of the Congo", "Nepal", "Afghanistan", "Cambodia",
            "Armenia", "Republic of the Congo", "Mozambique", "Republic of Macedonia",
            "Malta", "Namibia", "Madagascar", "Chad", "Burkina Faso", "Mauritius",
            "Mali", "The Bahamas", "Papua New Guinea", "Nicaragua", "Haiti", "Benin",
            "alestinian flag West Bank and Gaza", "Jersey", "Fiji", "Guinea", "Moldova",
            "Niger", "Laos", "Mongolia", "French Polynesia", "Kyrgyzstan", "Barbados",
            "Tajikistan", "Malawi", "Liechtenstein", "New Caledonia", "Kosovo",
            "Rwanda", "Montenegro", "Swaziland", "Guam", "Mauritania", "Guernsey",
            "Isle of Man", "Togo", "Somalia", "Suriname", "Aruba", "North Korea",
            "Zimbabwe", "Central African Republic", "Faroe Islands", "Greenland",
            "Sierra Leone", "Lesotho", "Cape Verde", "Eritrea", "Bhutan", "Belize",
            "Antigua and Barbuda", "Gibraltar", "Maldives", "San Marino", "Guyana",
            "Burundi", "Saint Lucia", "Djibouti", "British Virgin Islands", "Liberia",
            "Seychelles", "The Gambia", "Northern Mariana Islands", "Grenada",
            "Saint Vincent and the Grenadines", "Saint Kitts and Nevis", "East Timor",
            "Vanuatu", "Comoros", "Samoa", "Solomon Islands", "Guinea-Bissau",
            "American Samoa", "Dominica", "Micronesia", "Tonga", "Cook Islands",
            "Palau", "Marshall Islands", "São Tomé and Príncipe", "Anguilla",
            "Kiribati", "Tuvalu", "Niue" };

        private int sortRange; // exclusive upper bound for the random sort_field value; set in SetConfig
        private Random r;      // seeded in SetConfig for reproducible runs

        /// <summary>
        /// Delegates to the base source for the document body, then attaches the
        /// three sortable properties (sort_field, random_string, country) to the
        /// returned <see cref="DocData"/>.
        /// </summary>
        public override DocData GetNextDocData(DocData docData)
        {
            docData = base.GetNextDocData(docData);
            var props = new Dictionary<string, string>();

            // random int in [0, sortRange)
            props["sort_field"] = r.Next(sortRange).ToString(CultureInfo.InvariantCulture);

            // random 7-bit-char string, length in [2, 19]
            int len = NextInt32(2, 20);
            char[] buffer = new char[len];
            for (int i = 0; i < len; i++)
            {
                buffer[i] = (char)r.Next(0x80);
            }
            props["random_string"] = new string(buffer);

            // random country
            props["country"] = COUNTRIES[r.Next(COUNTRIES.Length)];
            docData.Props = props;
            return docData;
        }

        /// <summary>Returns a random int in <c>[start, end)</c>.</summary>
        private int NextInt32(int start, int end)
        {
            return start + r.Next(end - start);
        }

        /// <summary>
        /// Reads <c>sort.rng</c> (default 20000) and <c>rand.seed</c> (default 13)
        /// from the configuration.
        /// </summary>
        public override void SetConfig(Config config)
        {
            base.SetConfig(config);
            sortRange = config.Get("sort.rng", 20000);
            r = new Random(config.Get("rand.seed", 13));
        }
    }
}
using Lucene.Net.Benchmarks.ByTask.Utils;
using Lucene.Net.Documents;
using Lucene.Net.Spatial;
using Lucene.Net.Spatial.Prefix;
using Lucene.Net.Spatial.Prefix.Tree;
using Lucene.Net.Support;
using Spatial4n.Core.Context;
using Spatial4n.Core.Shapes;
using System;
using System.Collections;
using System.Collections.Generic;

namespace Lucene.Net.Benchmarks.ByTask.Feeds
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Indexes spatial data according to a configured <see cref="SpatialStrategy"/> with optional
    /// shape transformation via a configured <see cref="IShapeConverter"/>. The converter can turn points into
    /// circles and bounding boxes, in order to vary the type of indexing performance tests.
    /// Unless it's subclassed to do otherwise, this class configures a <see cref="SpatialContext"/>,
    /// <see cref="SpatialPrefixTree"/>, and <see cref="RecursivePrefixTreeStrategy"/>. The strategy is made
    /// available to a query maker via the static method <see cref="GetSpatialStrategy(int)"/>.
    /// See spatial.alg for a listing of spatial parameters, in particular those starting with "spatial."
    /// and "doc.spatial".
    /// </summary>
    public class SpatialDocMaker : DocMaker
    {
        public static readonly string SPATIAL_FIELD = "spatial";

        // Caches the SpatialStrategy by round number so the query maker of the same
        // round can look it up via GetSpatialStrategy().
        private static IDictionary<int, SpatialStrategy> spatialStrategyCache = new Dictionary<int, SpatialStrategy>();

        private SpatialStrategy strategy;
        private IShapeConverter shapeConverter;

        /// <summary>
        /// Looks up the <see cref="SpatialStrategy"/> from the given round --
        /// <see cref="Config.RoundNumber"/>. It's an error
        /// if it wasn't created already for this round -- when <see cref="SpatialDocMaker"/> is initialized.
        /// </summary>
        /// <exception cref="InvalidOperationException">If no strategy was registered for <paramref name="roundNumber"/>.</exception>
        public static SpatialStrategy GetSpatialStrategy(int roundNumber)
        {
            SpatialStrategy result;
            if (!spatialStrategyCache.TryGetValue(roundNumber, out result) || result == null)
            {
                throw new InvalidOperationException("Strategy should have been init'ed by SpatialDocMaker by now");
            }
            return result;
        }

        /// <summary>
        /// Builds a <see cref="SpatialStrategy"/> from configuration options.
        /// </summary>
        protected virtual SpatialStrategy MakeSpatialStrategy(Config config)
        {
            // A dictionary view of Config that prefixes keys with "spatial."
            var configMap = new DictionaryAnonymousHelper(config);

            SpatialContext ctx = SpatialContextFactory.MakeSpatialContext(configMap /*, null*/); // LUCENENET TODO: What is this extra param?

            // Some day the strategy might be initialized with a factory but such a
            // factory is non-existent.
            return MakeSpatialStrategy(config, configMap, ctx);
        }

        /// <summary>
        /// A read-only <see cref="IDictionary{TKey, TValue}"/> view over <see cref="Config"/>
        /// that prefixes every requested key with "spatial.".
        /// LUCENENET: the previous implementation subclassed <see cref="Dictionary{TKey, TValue}"/>
        /// and hid the indexer with <c>new</c>; hidden members are never dispatched through the
        /// <see cref="IDictionary{TKey, TValue}"/> interface that the spatial factories receive,
        /// so every lookup hit the (empty) base dictionary. Implementing the interface directly
        /// routes the lookups to <see cref="Config"/>.
        /// </summary>
        private class DictionaryAnonymousHelper : IDictionary<string, string>
        {
            private readonly Config config;

            public DictionaryAnonymousHelper(Config config)
            {
                this.config = config;
            }

            public string this[string key]
            {
                get { return config.Get("spatial." + key, null); }
                set { throw new NotSupportedException(); }
            }

            public bool TryGetValue(string key, out string value)
            {
                value = config.Get("spatial." + key, null);
                return value != null;
            }

            public bool ContainsKey(string key)
            {
                return config.Get("spatial." + key, null) != null;
            }

            // The spatial factories only read individual keys; enumeration and
            // mutation are intentionally unsupported for this view.
            public ICollection<string> Keys { get { throw new NotSupportedException(); } }
            public ICollection<string> Values { get { throw new NotSupportedException(); } }
            public int Count { get { throw new NotSupportedException(); } }
            public bool IsReadOnly { get { return true; } }
            public void Add(string key, string value) { throw new NotSupportedException(); }
            public void Add(KeyValuePair<string, string> item) { throw new NotSupportedException(); }
            public bool Remove(string key) { throw new NotSupportedException(); }
            public bool Remove(KeyValuePair<string, string> item) { throw new NotSupportedException(); }
            public void Clear() { throw new NotSupportedException(); }
            public bool Contains(KeyValuePair<string, string> item) { throw new NotSupportedException(); }
            public void CopyTo(KeyValuePair<string, string>[] array, int arrayIndex) { throw new NotSupportedException(); }
            public IEnumerator<KeyValuePair<string, string>> GetEnumerator() { throw new NotSupportedException(); }
            IEnumerator IEnumerable.GetEnumerator() { throw new NotSupportedException(); }
        }

        /// <summary>
        /// Builds a <see cref="RecursivePrefixTreeStrategy"/> over a prefix-tree grid,
        /// honoring "query.spatial.prefixGridScanLevel" (negative values are relative
        /// to the grid's max level) and "spatial.distErrPct".
        /// </summary>
        protected virtual SpatialStrategy MakeSpatialStrategy(Config config, IDictionary<string, string> configMap,
            SpatialContext ctx)
        {
            // A factory for the prefix tree grid
            SpatialPrefixTree grid = SpatialPrefixTreeFactory.MakeSPT(configMap, /*null,*/ ctx); // LUCENENET TODO: What is this extra param?

            RecursivePrefixTreeStrategy strategy = new RecursivePrefixTreeStrategyAnonymousHelper(grid, SPATIAL_FIELD, config);

            int prefixGridScanLevel = config.Get("query.spatial.prefixGridScanLevel", -4);
            if (prefixGridScanLevel < 0)
                prefixGridScanLevel = grid.MaxLevels + prefixGridScanLevel;
            strategy.PrefixGridScanLevel = prefixGridScanLevel;

            double distErrPct = config.Get("spatial.distErrPct", .025); // doc & query; a default
            strategy.DistErrPct = distErrPct;
            return strategy;
        }

        /// <summary>
        /// <see cref="RecursivePrefixTreeStrategy"/> whose points-only flag comes from
        /// the "spatial.docPointsOnly" config key.
        /// </summary>
        private class RecursivePrefixTreeStrategyAnonymousHelper : RecursivePrefixTreeStrategy
        {
            public RecursivePrefixTreeStrategyAnonymousHelper(SpatialPrefixTree grid, string fieldName, Config config)
                : base(grid, fieldName)
            {
                this.m_pointsOnly = config.Get("spatial.docPointsOnly", false);
            }
        }

        /// <summary>
        /// Initializes the strategy and shape converter for the current round and
        /// registers the strategy in the per-round cache.
        /// </summary>
        public override void SetConfig(Config config, ContentSource source)
        {
            base.SetConfig(config, source);
            SpatialStrategy existing;
            if (!spatialStrategyCache.TryGetValue(config.RoundNumber, out existing) || existing == null)
            {
                // new round; we need to re-initialize
                strategy = MakeSpatialStrategy(config);
                spatialStrategyCache[config.RoundNumber] = strategy;
                // TODO remove previous round config?
                shapeConverter = MakeShapeConverter(strategy, config, "doc.spatial.");
                SystemConsole.WriteLine("Spatial Strategy: " + strategy);
            }
        }

        /// <summary>
        /// Optionally converts points to circles, and optionally bbox'es the result.
        /// By default (all config keys absent) the returned converter does no conversion.
        /// </summary>
        public static IShapeConverter MakeShapeConverter(SpatialStrategy spatialStrategy,
            Config config, string configKeyPrefix)
        {
            double radiusDegrees = config.Get(configKeyPrefix + "radiusDegrees", 0.0);
            double plusMinus = config.Get(configKeyPrefix + "radiusDegreesRandPlusMinus", 0.0);
            bool bbox = config.Get(configKeyPrefix + "bbox", false);

            return new ShapeConverterAnonymousHelper(spatialStrategy, radiusDegrees, plusMinus, bbox);
        }

        /// <summary>
        /// Turns a point into a circle of (optionally randomized) radius, and/or
        /// replaces any shape with its bounding box.
        /// </summary>
        private class ShapeConverterAnonymousHelper : IShapeConverter
        {
            private readonly SpatialStrategy spatialStrategy;
            private readonly double radiusDegrees;
            private readonly double plusMinus;
            private readonly bool bbox;

            public ShapeConverterAnonymousHelper(SpatialStrategy spatialStrategy, double radiusDegrees, double plusMinus, bool bbox)
            {
                this.spatialStrategy = spatialStrategy;
                this.radiusDegrees = radiusDegrees;
                this.plusMinus = plusMinus;
                this.bbox = bbox;
            }

            public IShape Convert(IShape shape)
            {
                if (shape is IPoint && (radiusDegrees != 0.0 || plusMinus != 0.0))
                {
                    IPoint point = (IPoint)shape;
                    double radius = radiusDegrees;
                    if (plusMinus > 0.0)
                    {
                        Random random = new Random(point.GetHashCode()); // use hashCode so it's reproducibly random
                        radius += random.NextDouble() * 2 * plusMinus - plusMinus;
                        radius = Math.Abs(radius); // can happen if configured plusMinus > radiusDegrees
                    }
                    shape = spatialStrategy.SpatialContext.MakeCircle(point, radius);
                }
                if (bbox)
                {
                    shape = shape.BoundingBox;
                }
                return shape;
            }
        }

        // LUCENENET specific: de-nested IShapeConverter

        /// <summary>
        /// Builds the base document, then parses its body as a WKT shape, converts it
        /// via the configured <see cref="IShapeConverter"/>, and adds the resulting
        /// strategy fields to the document.
        /// </summary>
        public override Document MakeDocument()
        {
            DocState docState = GetDocState();

            Document doc = base.MakeDocument();

            // Set SPATIAL_FIELD from body
            DocData docData = docState.docData;
            // makeDocument() resets docState.getBody() so we can't look there; look in Document
            string shapeStr = doc.GetField(DocMaker.BODY_FIELD).GetStringValue();
            IShape shape = MakeShapeFromString(strategy, docData.Name, shapeStr);
            if (shape != null)
            {
                shape = shapeConverter.Convert(shape);
                // index
                foreach (Field f in strategy.CreateIndexableFields(shape))
                {
                    doc.Add(f);
                }
            }

            return doc;
        }

        /// <summary>
        /// Parses <paramref name="shapeStr"/> as WKT via the strategy's spatial context.
        /// Returns <c>null</c> (and logs to stderr) when the string is empty or unparseable,
        /// so a bad input line skips the document rather than aborting the run.
        /// </summary>
        public static IShape MakeShapeFromString(SpatialStrategy strategy, string name, string shapeStr)
        {
            if (shapeStr != null && shapeStr.Length > 0)
            {
                try
                {
                    return strategy.SpatialContext.ReadShapeFromWkt(shapeStr);
                }
                catch (Exception e)
                { // InvalidShapeException TODO
                    SystemConsole.Error.WriteLine("Shape " + name + " wasn't parseable: " + e + " (skipping it)");
                    return null;
                }
            }
            return null;
        }

        /// <summary>Not supported for spatial documents.</summary>
        public override Document MakeDocument(int size)
        {
            // TODO consider abusing the 'size' notion to number of shapes per document
            throw new NotSupportedException();
        }
    }

    /// <summary>
    /// Converts one shape to another. Created by
    /// <see cref="SpatialDocMaker.MakeShapeConverter(SpatialStrategy, Config, string)"/>.
    /// </summary>
    public interface IShapeConverter
    {
        IShape Convert(IShape shape);
    }
}
using Lucene.Net.Benchmarks.ByTask.Utils;
using Lucene.Net.Queries;
using Lucene.Net.Queries.Function;
using Lucene.Net.Search;
using Lucene.Net.Spatial;
using Lucene.Net.Spatial.Queries;
using Spatial4n.Core.Shapes;
using System.Collections.Generic;

namespace Lucene.Net.Benchmarks.ByTask.Feeds
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Reads spatial data from the body field docs from an internally created <see cref="LineDocSource"/>.
    /// It's parsed by <see cref="Spatial4n.Core.Context.SpatialContext.ReadShapeFromWkt(string)"/> and then
    /// further manipulated via a configurable <see cref="IShapeConverter"/>. When using point
    /// data, it's likely you'll want to configure the shape converter so that the query shapes actually
    /// cover a region. The queries are all created &amp; cached in advance. This query maker works in
    /// conjunction with <see cref="SpatialDocMaker"/>. See spatial.alg for a listing of options, in
    /// particular the options starting with "query.".
    /// </summary>
    public class SpatialFileQueryMaker : AbstractQueryMaker
    {
        protected SpatialStrategy m_strategy;
        protected double m_distErrPct; // NaN if not set
        protected SpatialOperation m_operation;
        protected bool m_score;

        protected IShapeConverter m_shapeConverter;

        /// <summary>
        /// Reads the "query.spatial.*" options and fetches the strategy that
        /// <see cref="SpatialDocMaker"/> registered for this round.
        /// </summary>
        public override void SetConfig(Config config)
        {
            m_strategy = SpatialDocMaker.GetSpatialStrategy(config.RoundNumber);
            m_shapeConverter = SpatialDocMaker.MakeShapeConverter(m_strategy, config, "query.spatial.");

            m_distErrPct = config.Get("query.spatial.distErrPct", double.NaN);
            m_operation = SpatialOperation.Get(config.Get("query.spatial.predicate", "Intersects"));
            m_score = config.Get("query.spatial.score", false);

            base.SetConfig(config); // call last, will call PrepareQueries()
        }

        /// <summary>
        /// Reads up to "query.file.maxQueries" shapes from "query.file" (via a
        /// <see cref="LineDocSource"/>), converts each through the shape converter,
        /// and builds one query per parseable shape. Unparseable lines are skipped
        /// without counting against the maximum.
        /// </summary>
        protected override Query[] PrepareQueries()
        {
            int maxQueries = m_config.Get("query.file.maxQueries", 1000);
            Config srcConfig = new Config(new Dictionary<string, string>());
            srcConfig.Set("docs.file", m_config.Get("query.file", null));
            srcConfig.Set("line.parser", m_config.Get("query.file.line.parser", null));
            srcConfig.Set("content.source.forever", "false");

            List<Query> queries = new List<Query>();
            LineDocSource src = new LineDocSource();
            try
            {
                src.SetConfig(srcConfig);
                src.ResetInputs();
                DocData docData = new DocData();
                for (int i = 0; i < maxQueries; i++)
                {
                    docData = src.GetNextDocData(docData);
                    IShape shape = SpatialDocMaker.MakeShapeFromString(m_strategy, docData.Name, docData.Body);
                    if (shape != null)
                    {
                        shape = m_shapeConverter.Convert(shape);
                        queries.Add(MakeQueryFromShape(shape));
                    }
                    else
                    {
                        i--; // skip the bad line; don't count it against maxQueries
                    }
                }
            }
            // LUCENENET: a parameterless catch replaces the unused exception variable
            // plus the CS0168 warning-suppression pragmas from the original.
            catch (NoMoreDataException)
            {
                // all-done: the source was exhausted before maxQueries was reached
            }
            finally
            {
                src.Dispose();
            }
            return queries.ToArray();
        }

        /// <summary>
        /// Builds a query for <paramref name="shape"/> using the configured predicate.
        /// When scoring is enabled, wraps a distance value source in a
        /// <see cref="CustomScoreQuery"/>; otherwise uses a constant-score filter.
        /// </summary>
        protected virtual Query MakeQueryFromShape(IShape shape)
        {
            SpatialArgs args = new SpatialArgs(m_operation, shape);
            if (!double.IsNaN(m_distErrPct))
                args.DistErrPct = m_distErrPct;

            if (m_score)
            {
                ValueSource valueSource = m_strategy.MakeDistanceValueSource(shape.Center);
                return new CustomScoreQuery(m_strategy.MakeQuery(args), new FunctionQuery(valueSource));
            }
            else
            {
                // strategy.MakeQuery() could potentially score (isn't well defined) so instead we call
                // MakeFilter() and wrap

                Filter filter = m_strategy.MakeFilter(args);
                if (filter is QueryWrapperFilter)
                {
                    return ((QueryWrapperFilter)filter).Query;
                }
                else
                {
                    return new ConstantScoreQuery(filter);
                }
            }
        }
    }
}
using Lucene.Net.Benchmarks.ByTask.Utils;
using Lucene.Net.Support;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Text;
using System.Threading;

namespace Lucene.Net.Benchmarks.ByTask.Feeds
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Implements a <see cref="ContentSource"/> over the TREC collection.
    /// </summary>
    /// <remarks>
    /// Supports the following configuration parameters (on top of
    /// <see cref="ContentSource"/>):
    /// <list type="bullet">
    /// <item><term>work.dir</term><description>specifies the working directory. Required if "docs.dir"
    /// denotes a relative path (<b>default=work</b>).</description></item>
    /// <item><term>docs.dir</term><description>specifies the directory where the TREC files reside.
    /// Can be set to a relative path if "work.dir" is also specified
    /// (<b>default=trec</b>).
    /// </description></item>
    /// <item><term>trec.doc.parser</term><description>specifies the <see cref="TrecDocParser"/> class to use for
    /// parsing the TREC documents content (<b>default=TrecGov2Parser</b>).
    /// </description></item>
    /// <item><term>html.parser</term><description>specifies the <see cref="IHTMLParser"/> class to use for
    /// parsing the HTML parts of the TREC documents content (<b>default=DemoHTMLParser</b>).
    /// </description></item>
    /// <item><term>content.source.encoding</term><description>if not specified, ISO-8859-1 is used.</description></item>
    /// <item><term>content.source.excludeIteration</term><description>if <c>true</c>, do not append iteration number to docname</description></item>
    /// </list>
    /// </remarks>
    public class TrecContentSource : ContentSource
    {
        // LUCENENET specific - DateFormatInfo not used

        public static readonly string DOCNO = "<DOCNO>";
        public static readonly string TERMINATING_DOCNO = "</DOCNO>";
        public static readonly string DOC = "<DOC>";
        public static readonly string TERMINATING_DOC = "</DOC>";

        /// <summary>separator between lines in the buffer</summary>
        public static readonly string NEW_LINE = Environment.NewLine;

        private static readonly string[] DATE_FORMATS = {
            // LUCENENET specific: in JAVA, they don't care if it is an abbreviated or a full month name when parsing
            // so we provide definitions for both ways.
            "ddd, dd MMM yyyy hh:mm:ss K",   // Tue, 09 Dec 2003 22:39:08 GMT
            "ddd, dd MMMM yyyy hh:mm:ss K",  // Tue, 09 December 2003 22:39:08 GMT
            "ddd MMM dd hh:mm:ss yyyy K",    // Tue Dec 09 16:45:08 2003 EST
            "ddd MMMM dd hh:mm:ss yyyy K",   // Tue December 09 16:45:08 2003 EST
            "ddd, dd-MMM-':'y hh:mm:ss K",   // Tue, 09 Dec 2003 22:39:08 GMT
            "ddd, dd-MMMM-':'y hh:mm:ss K",  // Tue, 09 December 2003 22:39:08 GMT
            "ddd, dd-MMM-yyy hh:mm:ss K",    // Tue, 09 Dec 2003 22:39:08 GMT
            "ddd, dd-MMMM-yyy hh:mm:ss K",   // Tue, 09 December 2003 22:39:08 GMT
            "ddd MMM dd hh:mm:ss yyyy",      // Tue Dec 09 16:45:08 2003
            "ddd MMMM dd hh:mm:ss yyyy",     // Tue December 09 16:45:08 2003
            "dd MMM yyyy",                   // 1 Mar 1994
            "dd MMMM yyyy",                  // 1 March 1994
            "MMM dd, yyyy",                  // Feb 3, 1994
            "MMMM dd, yyyy",                 // February 3, 1994
            "yyMMdd",                        // 910513
            "hhmm K.K.K. MMM dd, yyyy",      // 0901 u.t.c. Apr 28, 1994
            "hhmm K.K.K. MMMM dd, yyyy",     // 0901 u.t.c. April 28, 1994
        };

        // Per-thread scratch buffer for assembling a single TREC document's text.
        private ThreadLocal<StringBuilder> trecDocBuffer = new ThreadLocal<StringBuilder>();
        private DirectoryInfo dataDir = null;
        private List<FileInfo> inputFiles = new List<FileInfo>();
        private int nextFile = 0;
        // Use to synchronize threads on reading from the TREC documents.
        private object @lock = new object();

        // Required for test
        internal TextReader reader;
        internal int iteration = 0;
        internal IHTMLParser htmlParser;

        private bool excludeDocnameIteration;
        private TrecDocParser trecDocParser = new TrecGov2Parser(); // default
        internal TrecDocParser.ParsePathType currPathType; // not private for tests

        /// <summary>Returns (lazily creating) this thread's scratch buffer.</summary>
        private StringBuilder GetDocBuffer()
        {
            StringBuilder sb = trecDocBuffer.Value;
            if (sb == null)
            {
                sb = new StringBuilder();
                trecDocBuffer.Value = sb;
            }
            return sb;
        }

        internal IHTMLParser HtmlParser
        {
            get { return htmlParser; }
        }

        /// <summary>
        /// Read until a line starting with the specified <paramref name="lineStart"/>,
        /// rolling over to the next input file whenever the current one is exhausted.
        /// </summary>
        /// <param name="buf">Buffer for collecting the data if so specified.</param>
        /// <param name="lineStart">Line start to look for, must not be <c>null</c>.</param>
        /// <param name="collectMatchLine">Whether to collect the matching line into <paramref name="buf"/>.</param>
        /// <param name="collectAll">Whether to collect all lines into <paramref name="buf"/>.</param>
        /// <exception cref="IOException">If there is a low-level I/O error.</exception>
        /// <exception cref="NoMoreDataException">If the source is exhausted.</exception>
        private void Read(StringBuilder buf, string lineStart,
            bool collectMatchLine, bool collectAll)
        {
            string sep = "";
            while (true)
            {
                string line = reader.ReadLine();

                if (line == null)
                {
                    OpenNextFile();
                    continue;
                }

                // LUCENENET: removed the vestigial no-op statement "var _ = line.Length;"

                if (lineStart != null && line.StartsWith(lineStart, StringComparison.Ordinal))
                {
                    if (collectMatchLine)
                    {
                        buf.Append(sep).Append(line);
                        sep = NEW_LINE;
                    }
                    return;
                }

                if (collectAll)
                {
                    buf.Append(sep).Append(line);
                    sep = NEW_LINE;
                }
            }
        }

        /// <summary>
        /// Closes the current reader and opens the next input file (gzip, bzip2, or
        /// plain text, detected by extension). Wraps around to the first file when
        /// "forever" is set; otherwise throws <see cref="NoMoreDataException"/>.
        /// Unreadable files are skipped in verbose mode.
        /// </summary>
        internal virtual void OpenNextFile()
        {
            Dispose();
            //currPathType = null;
            while (true)
            {
                if (nextFile >= inputFiles.Count)
                {
                    // exhausted files, start a new round, unless forever set to false.
                    if (!m_forever)
                    {
                        throw new NoMoreDataException();
                    }
                    nextFile = 0;
                    iteration++;
                }
                FileInfo f = inputFiles[nextFile++];
                if (m_verbose)
                {
                    SystemConsole.WriteLine("opening: " + f + " length: " + f.Length);
                }
                try
                {
                    Stream inputStream = StreamUtils.GetInputStream(f); // support either gzip, bzip2, or regular text file, by extension
                    reader = new StreamReader(inputStream, m_encoding);
                    currPathType = TrecDocParser.PathType(f);
                    return;
                }
                catch (Exception e)
                {
                    if (m_verbose)
                    {
                        SystemConsole.WriteLine("Skipping 'bad' file " + f.FullName + " due to " + e.Message);
                        continue;
                    }
                    throw new NoMoreDataException();
                }
            }
        }

        /// <summary>
        /// Tries the known TREC date formats, then a lenient invariant-culture parse.
        /// Returns <c>null</c> (logging in verbose mode) when the string cannot be
        /// parsed, so a bad date never fails a run.
        /// </summary>
        public virtual DateTime? ParseDate(string dateStr)
        {
            dateStr = dateStr.Trim();
            DateTime d;
            if (DateTime.TryParseExact(dateStr, DATE_FORMATS, CultureInfo.InvariantCulture, DateTimeStyles.None, out d))
            {
                return d;
            }
            else if (DateTime.TryParse(dateStr, CultureInfo.InvariantCulture, DateTimeStyles.None, out d))
            {
                return d;
            }

            // do not fail test just because a date could not be parsed
            if (m_verbose)
            {
                SystemConsole.WriteLine("failed to parse date (assigning 'now') for: " + dateStr);
            }
            return null;
        }

        /// <summary>
        /// Closes only the current reader; also called between input files by
        /// <see cref="OpenNextFile"/>, so it must leave the source reusable.
        /// </summary>
        protected override void Dispose(bool disposing)
        {
            if (reader == null)
            {
                return;
            }

            try
            {
                reader.Dispose();
            }
            catch (IOException e)
            {
                if (m_verbose)
                {
                    SystemConsole.WriteLine("failed to dispose reader !");
                    SystemConsole.WriteLine(e.ToString());
                }
            }
            reader = null;
        }

        /// <summary>
        /// Reads the next raw TREC document under the shared lock, then parses it
        /// (outside the lock) with the configured <see cref="TrecDocParser"/>.
        /// </summary>
        public override DocData GetNextDocData(DocData docData)
        {
            string name = null;
            StringBuilder docBuf = GetDocBuffer();
            TrecDocParser.ParsePathType parsedPathType;

            // protect reading from the TREC files by multiple threads. The rest of the
            // method, i.e., parsing the content and returning the DocData can run unprotected.
            lock (@lock)
            {
                if (reader == null)
                {
                    OpenNextFile();
                }

                // 1. skip until doc start - required for all TREC formats
                docBuf.Length = 0;
                Read(docBuf, DOC, false, false);

                // save parsedFile for passing trecDataParser after the sync block, in
                // case another thread will open another file in between.
                parsedPathType = currPathType;

                // 2. name - required for all TREC formats
                docBuf.Length = 0;
                Read(docBuf, DOCNO, true, false);
                name = docBuf.ToString(DOCNO.Length, docBuf.IndexOf(TERMINATING_DOCNO,
                    DOCNO.Length) - DOCNO.Length).Trim();

                if (!excludeDocnameIteration)
                {
                    name = name + "_" + iteration;
                }

                // 3. read all until end of doc
                docBuf.Length = 0;
                Read(docBuf, TERMINATING_DOC, false, true);
            }

            // count char length of text to be parsed (may be larger than the resulted plain doc body text).
            AddBytes(docBuf.Length);

            // This code segment relies on HtmlParser being thread safe. When we get
            // here, everything else is already private to that thread, so we're safe.
            docData = trecDocParser.Parse(docData, name, this, docBuf, parsedPathType);
            AddItem();

            return docData;
        }

        /// <summary>Rewinds the source to the first file and resets the iteration counter.</summary>
        public override void ResetInputs()
        {
            lock (@lock)
            {
                base.ResetInputs();
                Dispose();
                nextFile = 0;
                iteration = 0;
            }
        }

        /// <summary>
        /// Reads the directories, parser classes, encoding, and iteration-exclusion
        /// flag from the configuration. See the class remarks for the supported keys.
        /// </summary>
        public override void SetConfig(Config config)
        {
            base.SetConfig(config);
            // dirs
            DirectoryInfo workDir = new DirectoryInfo(config.Get("work.dir", "work"));
            string d = config.Get("docs.dir", "trec");
            dataDir = new DirectoryInfo(d);
            // LUCENENET: restore the documented behavior (matching the Java original)
            // of resolving a relative docs.dir against work.dir; previously workDir
            // was computed but never used.
            if (!Path.IsPathRooted(d))
            {
                dataDir = new DirectoryInfo(Path.Combine(workDir.FullName, d));
            }
            // files
            CollectFiles(dataDir, inputFiles);
            if (inputFiles.Count == 0)
            {
                throw new ArgumentException("No files in dataDir: " + dataDir);
            }
            // trec doc parser
            try
            {
                string trecDocParserClassName = config.Get("trec.doc.parser", "Lucene.Net.Benchmarks.ByTask.Feeds.TrecGov2Parser, Lucene.Net.Benchmark");
                trecDocParser = (TrecDocParser)Activator.CreateInstance(Type.GetType(trecDocParserClassName));
            }
            catch (Exception e)
            {
                // Should not get here. Throw runtime exception.
                throw new Exception(e.ToString(), e);
            }
            // html parser
            try
            {
                string htmlParserClassName = config.Get("html.parser",
                    "Lucene.Net.Benchmarks.ByTask.Feeds.DemoHTMLParser, Lucene.Net.Benchmark");
                htmlParser = (IHTMLParser)Activator.CreateInstance(Type.GetType(htmlParserClassName));
            }
            catch (Exception e)
            {
                // Should not get here. Throw runtime exception.
                throw new Exception(e.ToString(), e);
            }
            // encoding
            if (m_encoding == null)
            {
                m_encoding = Encoding.GetEncoding("iso-8859-1"); //StandardCharsets.ISO_8859_1.name();
            }
            // iteration exclusion in doc name
            excludeDocnameIteration = config.Get("content.source.excludeIteration", false);
        }
    }
}
+ /// </summary> + public abstract class TrecDocParser + { + /// <summary>Types of trec parse paths,</summary> + public enum ParsePathType { GOV2, FBIS, FT, FR94, LATIMES, UNKNOWN } + + /// <summary>trec parser type used for unknown extensions</summary> + public static readonly ParsePathType DEFAULT_PATH_TYPE = ParsePathType.GOV2; + + internal static readonly IDictionary<ParsePathType, TrecDocParser> pathType2parser = new Dictionary<ParsePathType, TrecDocParser>(); + static TrecDocParser() + { + pathType2parser[ParsePathType.GOV2] = new TrecGov2Parser(); + pathType2parser[ParsePathType.FBIS] = new TrecFBISParser(); + pathType2parser[ParsePathType.FR94] = new TrecFR94Parser(); + pathType2parser[ParsePathType.FT] = new TrecFTParser(); + pathType2parser[ParsePathType.LATIMES] = new TrecLATimesParser(); + + foreach (ParsePathType ppt in Enum.GetValues(typeof(ParsePathType))) + { + pathName2Type[ppt.ToString().ToUpperInvariant()] = ppt; + } + } + + internal static readonly IDictionary<string, ParsePathType?> pathName2Type = new Dictionary<string, ParsePathType?>(); + + + /// <summary>max length of walk up from file to its ancestors when looking for a known path type.</summary> + private static readonly int MAX_PATH_LENGTH = 10; + + /// <summary> + /// Compute the path type of a file by inspecting name of file and its parents. + /// </summary> + public static ParsePathType PathType(FileInfo f) + { + int pathLength = 0; + ParsePathType? ppt; + if (pathName2Type.TryGetValue(f.Name.ToUpperInvariant(), out ppt) && ppt != null) + { + return ppt.Value; + } + // Walk up the directory names to find a match. 
+ DirectoryInfo parentDir = f.Directory; + while (parentDir != null && ++pathLength < MAX_PATH_LENGTH) + { + if (pathName2Type.TryGetValue(parentDir.Name.ToUpperInvariant(), out ppt) && ppt != null) + { + return ppt.Value; + } + parentDir = parentDir.Parent; + } + return DEFAULT_PATH_TYPE; + } + + /// <summary> + /// Parse the text prepared in docBuf into a result DocData, + /// no synchronization is required. + /// </summary> + /// <param name="docData">Reusable result.</param> + /// <param name="name">Name that should be set to the result.</param> + /// <param name="trecSrc">Calling trec content source.</param> + /// <param name="docBuf">Text to parse.</param> + /// <param name="pathType">Type of parsed file, or <see cref="ParsePathType.UNKNOWN"/> if unknown - may be used by + /// parsers to alter their behavior according to the file path type. </param> + /// <returns></returns> + public abstract DocData Parse(DocData docData, string name, TrecContentSource trecSrc, + StringBuilder docBuf, ParsePathType pathType); + + /// <summary> + /// strip tags from <code>buf</code>: each tag is replaced by a single blank. + /// </summary> + /// <returns>Text obtained when stripping all tags from <paramref name="buf"/> (input <see cref="StringBuilder"/> is unmodified).</returns> + public static string StripTags(StringBuilder buf, int start) + { + return StripTags(buf.ToString(start, buf.Length - start), 0); + } + + /// <summary> + /// Strip tags from input. + /// </summary> + /// <seealso cref="StripTags(StringBuilder, int)"/> + public static string StripTags(string buf, int start) + { + if (start > 0) + { + buf = buf.Substring(0); + } + return Regex.Replace(buf, "<[^>]*>", " "); + } + + /// <summary> + /// Extract from <paramref name="buf"/> the text of interest within specified tags. 
+ /// </summary> + /// <param name="buf">Entire input text.</param> + /// <param name="startTag">Tag marking start of text of interest.</param> + /// <param name="endTag">Tag marking end of text of interest.</param> + /// <param name="maxPos">if ≥ 0 sets a limit on start of text of interest.</param> + /// <param name="noisePrefixes">Text of interest or null if not found.</param> + /// <returns></returns> + public static string Extract(StringBuilder buf, string startTag, string endTag, int maxPos, string[] noisePrefixes) + { + int k1 = buf.IndexOf(startTag); + if (k1 >= 0 && (maxPos < 0 || k1 < maxPos)) + { + k1 += startTag.Length; + int k2 = buf.IndexOf(endTag, k1); + if (k2 >= 0 && (maxPos < 0 || k2 < maxPos)) + { // found end tag with allowed range + if (noisePrefixes != null) + { + foreach (string noise in noisePrefixes) + { + int k1a = buf.IndexOf(noise, k1); + if (k1a >= 0 && k1a < k2) + { + k1 = k1a + noise.Length; + } + } + } + return buf.ToString(k1, k2 - k1).Trim(); + } + } + return null; + } + + //public static void main(String[] args) { + // System.out.println(stripTags("is it true that<space>2<<second space>><almost last space>1<one more space>?",0)); + //} + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecFBISParser.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecFBISParser.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecFBISParser.cs new file mode 100644 index 0000000..cf321cc --- /dev/null +++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecFBISParser.cs @@ -0,0 +1,68 @@ +using Lucene.Net.Support; +using System; +using System.Text; + +namespace Lucene.Net.Benchmarks.ByTask.Feeds +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Parser for the FBIS docs in trec disks 4+5 collection format + /// </summary> + public class TrecFBISParser : TrecDocParser + { + private static readonly string HEADER = "<HEADER>"; + private static readonly string HEADER_END = "</HEADER>"; + private static readonly int HEADER_END_LENGTH = HEADER_END.Length; + + private static readonly string DATE1 = "<DATE1>"; + private static readonly string DATE1_END = "</DATE1>"; + + private static readonly string TI = "<TI>"; + private static readonly string TI_END = "</TI>"; + + public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc, + StringBuilder docBuf, ParsePathType pathType) + { + int mark = 0; // that much is skipped + // optionally skip some of the text, set date, title + DateTime? date = null; + string title = null; + int h1 = docBuf.IndexOf(HEADER); + if (h1 >= 0) + { + int h2 = docBuf.IndexOf(HEADER_END, h1); + mark = h2 + HEADER_END_LENGTH; + // date... + string dateStr = Extract(docBuf, DATE1, DATE1_END, h2, null); + if (dateStr != null) + { + date = trecSrc.ParseDate(dateStr); + } + // title... 
+ title = Extract(docBuf, TI, TI_END, h2, null); + } + docData.Clear(); + docData.Name = name; + docData.SetDate(date); + docData.Title = title; + docData.Body = StripTags(docBuf, mark).ToString(); + return docData; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecFR94Parser.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecFR94Parser.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecFR94Parser.cs new file mode 100644 index 0000000..72f99bb --- /dev/null +++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecFR94Parser.cs @@ -0,0 +1,69 @@ +using Lucene.Net.Support; +using System; +using System.Text; + +namespace Lucene.Net.Benchmarks.ByTask.Feeds +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + /// <summary> + /// Parser for the FR94 docs in trec disks 4+5 collection format + /// </summary> + public class TrecFR94Parser : TrecDocParser + { + private static readonly string TEXT = "<TEXT>"; + private static readonly int TEXT_LENGTH = TEXT.Length; + private static readonly string TEXT_END = "</TEXT>"; + + private static readonly string DATE = "<DATE>"; + private static readonly string[] DATE_NOISE_PREFIXES = { + "DATE:", + "date:", //TODO improve date extraction for this format + "t.c.", + }; + private static readonly string DATE_END = "</DATE>"; + + //TODO can we also extract title for this format? + + public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc, + StringBuilder docBuf, ParsePathType pathType) + { + int mark = 0; // that much is skipped + // optionally skip some of the text, set date (no title?) + DateTime? date = null; + int h1 = docBuf.IndexOf(TEXT); + if (h1 >= 0) + { + int h2 = docBuf.IndexOf(TEXT_END, h1); + mark = h1 + TEXT_LENGTH; + // date... 
+ string dateStr = Extract(docBuf, DATE, DATE_END, h2, DATE_NOISE_PREFIXES); + if (dateStr != null) + { + dateStr = StripTags(dateStr, 0).ToString(); + date = trecSrc.ParseDate(dateStr.Trim()); + } + } + docData.Clear(); + docData.Name = name; + docData.SetDate(date); + docData.Body = StripTags(docBuf, mark).ToString(); + return docData; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecFTParser.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecFTParser.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecFTParser.cs new file mode 100644 index 0000000..189f6cb --- /dev/null +++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecFTParser.cs @@ -0,0 +1,58 @@ +using System; +using System.Text; + +namespace Lucene.Net.Benchmarks.ByTask.Feeds +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + /// <summary> + /// Parser for the FT docs in trec disks 4+5 collection format + /// </summary> + public class TrecFTParser : TrecDocParser + { + private static readonly string DATE = "<DATE>"; + private static readonly string DATE_END = "</DATE>"; + + private static readonly string HEADLINE = "<HEADLINE>"; + private static readonly string HEADLINE_END = "</HEADLINE>"; + + public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc, + StringBuilder docBuf, ParsePathType pathType) + { + int mark = 0; // that much is skipped + + // date... + DateTime? date = null; + string dateStr = Extract(docBuf, DATE, DATE_END, -1, null); + if (dateStr != null) + { + date = trecSrc.ParseDate(dateStr); + } + + // title... + string title = Extract(docBuf, HEADLINE, HEADLINE_END, -1, null); + + docData.Clear(); + docData.Name = name; + docData.SetDate(date); + docData.Title = title; + docData.Body = StripTags(docBuf, mark).ToString(); + return docData; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecGov2Parser.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecGov2Parser.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecGov2Parser.cs new file mode 100644 index 0000000..12912e9 --- /dev/null +++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecGov2Parser.cs @@ -0,0 +1,57 @@ +using Lucene.Net.Support; +using System; +using System.IO; +using System.Text; + +namespace Lucene.Net.Benchmarks.ByTask.Feeds +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Parser for the GOV2 collection format + /// </summary> + public class TrecGov2Parser : TrecDocParser + { + private static readonly string DATE = "Date: "; + private static readonly string DATE_END = TrecContentSource.NEW_LINE; + + private static readonly string DOCHDR = "<DOCHDR>"; + private static readonly string TERMINATING_DOCHDR = "</DOCHDR>"; + + public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc, + StringBuilder docBuf, ParsePathType pathType) + { + // skip some of the non-html text, optionally set date + DateTime? 
date = null; + int start = 0; + int h1 = docBuf.IndexOf(DOCHDR); + if (h1 >= 0) + { + int h2 = docBuf.IndexOf(TERMINATING_DOCHDR, h1); + string dateStr = Extract(docBuf, DATE, DATE_END, h2, null); + if (dateStr != null) + { + date = trecSrc.ParseDate(dateStr); + } + start = h2 + TERMINATING_DOCHDR.Length; + } + string html = docBuf.ToString(start, docBuf.Length - start); + return trecSrc.HtmlParser.Parse(docData, name, date, new StringReader(html), trecSrc); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecLATimesParser.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecLATimesParser.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecLATimesParser.cs new file mode 100644 index 0000000..e54f635 --- /dev/null +++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecLATimesParser.cs @@ -0,0 +1,75 @@ +using System; +using System.Text; + +namespace Lucene.Net.Benchmarks.ByTask.Feeds +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + /// <summary> + /// Parser for the FT docs in trec disks 4+5 collection format + /// </summary> + public class TrecLATimesParser : TrecDocParser + { + private static readonly string DATE = "<DATE>"; + private static readonly string DATE_END = "</DATE>"; + private static readonly string DATE_NOISE = "day,"; // anything aftre the ',' + + private static readonly string SUBJECT = "<SUBJECT>"; + private static readonly string SUBJECT_END = "</SUBJECT>"; + private static readonly string HEADLINE = "<HEADLINE>"; + private static readonly string HEADLINE_END = "</HEADLINE>"; + + public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc, + StringBuilder docBuf, ParsePathType pathType) + { + int mark = 0; // that much is skipped + + // date... + DateTime? date = null; + string dateStr = Extract(docBuf, DATE, DATE_END, -1, null); + if (dateStr != null) + { + int d2a = dateStr.IndexOf(DATE_NOISE); + if (d2a > 0) + { + dateStr = dateStr.Substring(0, (d2a + 3) - 0); // we need the "day" part + } + dateStr = StripTags(dateStr, 0).ToString(); + date = trecSrc.ParseDate(dateStr.Trim()); + } + + // title... 
first try with SUBJECT, them with HEADLINE + string title = Extract(docBuf, SUBJECT, SUBJECT_END, -1, null); + if (title == null) + { + title = Extract(docBuf, HEADLINE, HEADLINE_END, -1, null); + } + if (title != null) + { + title = StripTags(title, 0).ToString().Trim(); + } + + docData.Clear(); + docData.Name = name; + docData.SetDate(date); + docData.Title = title; + docData.Body = StripTags(docBuf, mark).ToString(); + return docData; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecParserByPath.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecParserByPath.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecParserByPath.cs new file mode 100644 index 0000000..45a72b4 --- /dev/null +++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/TrecParserByPath.cs @@ -0,0 +1,34 @@ +using System.Text; + +namespace Lucene.Net.Benchmarks.ByTask.Feeds +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Parser for trec docs which selects the parser to apply according + /// to the source files path, defaulting to <see cref="TrecGov2Parser"/>. 
+ /// </summary> + public class TrecParserByPath : TrecDocParser + { + public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc, + StringBuilder docBuf, ParsePathType pathType) + { + return pathType2parser[pathType].Parse(docData, name, trecSrc, docBuf, pathType); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/PerfRunData.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/ByTask/PerfRunData.cs b/src/Lucene.Net.Benchmark/ByTask/PerfRunData.cs new file mode 100644 index 0000000..e5b334c --- /dev/null +++ b/src/Lucene.Net.Benchmark/ByTask/PerfRunData.cs @@ -0,0 +1,490 @@ +using Lucene.Net.Analysis; +using Lucene.Net.Benchmarks.ByTask.Feeds; +using Lucene.Net.Benchmarks.ByTask.Stats; +using Lucene.Net.Benchmarks.ByTask.Tasks; +using Lucene.Net.Benchmarks.ByTask.Utils; +using Lucene.Net.Facet.Taxonomy; +using Lucene.Net.Index; +using Lucene.Net.Search; +using Lucene.Net.Store; +using Lucene.Net.Support; +using Lucene.Net.Util; +using System; +using System.Collections.Generic; +using System.Globalization; +using System.IO; + +namespace Lucene.Net.Benchmarks.ByTask +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Data maintained by a performance test run.
    /// </summary>
    /// <remarks>
    /// Data includes:
    /// <list type="bullet">
    /// <item><description>Configuration.</description></item>
    /// <item><description>Directory, Writer, Reader.</description></item>
    /// <item><description>Taxonomy Directory, Writer, Reader.</description></item>
    /// <item><description>DocMaker, FacetSource and a few instances of QueryMaker.</description></item>
    /// <item><description>Named AnalysisFactories.</description></item>
    /// <item><description>Analyzer.</description></item>
    /// <item><description>Statistics data which updated during the run.</description></item>
    /// </list>
    /// <para/>
    /// Config properties:
    /// <list type="bullet">
    /// <item><term>work.dir</term><description>&lt;path to root of docs and index dirs| Default: work&gt;</description></item>
    /// <item><term>analyzer</term><description>&lt;class name for analyzer| Default: StandardAnalyzer&gt;</description></item>
    /// <item><term>doc.maker</term><description>&lt;class name for doc-maker| Default: DocMaker&gt;</description></item>
    /// <item><term>facet.source</term><description>&lt;class name for facet-source| Default: RandomFacetSource&gt;</description></item>
    /// <item><term>query.maker</term><description>&lt;class name for query-maker| Default: SimpleQueryMaker&gt;</description></item>
    /// <item><term>log.queries</term><description>&lt;whether queries should be printed| Default: false&gt;</description></item>
    /// <item><term>directory</term><description>&lt;type of directory to use for the index| Default: RAMDirectory&gt;</description></item>
    /// <item><term>taxonomy.directory</term><description>&lt;type of directory for taxonomy index| Default: RAMDirectory&gt;</description></item>
    /// </list>
    /// </remarks>
    public class PerfRunData : IDisposable
    {
        private Points points;

        // objects used during performance test run
        // directory, analyzer, docMaker - created at startup.
        // reader, writer, searcher - maintained by basic tasks.
        private Store.Directory directory;
        private IDictionary<string, AnalyzerFactory> analyzerFactories = new Dictionary<string, AnalyzerFactory>();
        private Analyzer analyzer;
        private DocMaker docMaker;
        private ContentSource contentSource;
        private FacetSource facetSource;
        private CultureInfo locale;

        private Store.Directory taxonomyDir;
        private ITaxonomyWriter taxonomyWriter;
        private TaxonomyReader taxonomyReader;

        // we use separate (identical) instances for each "read" task type, so each can iterate the queries separately.
        private IDictionary<Type, IQueryMaker> readTaskQueryMaker;
        private Type qmkrClass;

        private DirectoryReader indexReader;
        private IndexSearcher indexSearcher;
        private IndexWriter indexWriter;
        private Config config;
        private long startTimeMillis;

        // Arbitrary named objects shared between tasks; disposables here are closed in Dispose().
        private readonly IDictionary<string, object> perfObjects = new Dictionary<string, object>();

        // constructor
        public PerfRunData(Config config)
        {
            this.config = config;
            // analyzer (default is standard analyzer)
            analyzer = NewAnalyzerTask.CreateAnalyzer(config.Get("analyzer",
                "Lucene.Net.Analysis.Standard.StandardAnalyzer, Lucene.Net.Analysis.Common"));

            // content source (instantiated by reflection from the configured type name)
            string sourceClass = config.Get("content.source", typeof(SingleDocSource).AssemblyQualifiedName);
            contentSource = (ContentSource)Activator.CreateInstance(Type.GetType(sourceClass)); //Class.forName(sourceClass).asSubclass(typeof(ContentSource)).newInstance();
            contentSource.SetConfig(config);

            // doc maker
            docMaker = (DocMaker)Activator.CreateInstance(Type.GetType(config.Get("doc.maker", typeof(DocMaker).AssemblyQualifiedName))); // "org.apache.lucene.benchmark.byTask.feeds.DocMaker")).asSubclass(DocMaker.class).newInstance();
            docMaker.SetConfig(config, contentSource);
            // facet source
            facetSource = (FacetSource)Activator.CreateInstance(Type.GetType(config.Get("facet.source",
                typeof(RandomFacetSource).AssemblyQualifiedName))); // "org.apache.lucene.benchmark.byTask.feeds.RandomFacetSource")).asSubclass(FacetSource.class).newInstance();
            facetSource.SetConfig(config);
            // query makers - created lazily per read-task type in GetQueryMaker()
            readTaskQueryMaker = new Dictionary<Type, IQueryMaker>();
            qmkrClass = Type.GetType(config.Get("query.maker", typeof(SimpleQueryMaker).AssemblyQualifiedName));

            // index stuff
            Reinit(false);

            // statistic points
            points = new Points(config);

            if (bool.Parse(config.Get("log.queries", "false")))
            {
                SystemConsole.WriteLine("------------> queries:");
                SystemConsole.WriteLine(GetQueryMaker(new SearchTask(this)).PrintQueries());
            }
        }

        // Standard dispose pattern entry point.
        public void Dispose()
        {
            Dispose(true);
            GC.SuppressFinalize(this);
        }

        protected virtual void Dispose(bool disposing)
        {
            if (disposing)
            {
                IOUtils.Dispose(indexWriter, indexReader, directory,
                    taxonomyWriter, taxonomyReader, taxonomyDir,
                    docMaker, facetSource, contentSource);

                // close all perf objects that are closeable.
                List<IDisposable> perfObjectsToClose = new List<IDisposable>();
                foreach (object obj in perfObjects.Values)
                {
                    if (obj is IDisposable)
                    {
                        perfObjectsToClose.Add((IDisposable)obj);
                    }
                }
                IOUtils.Dispose(perfObjectsToClose);
            }
        }

        // clean old stuff, reopen
        public virtual void Reinit(bool eraseIndex)
        {
            // cleanup index
            IOUtils.Dispose(indexWriter, indexReader, directory);
            indexWriter = null;
            indexReader = null;

            IOUtils.Dispose(taxonomyWriter, taxonomyReader, taxonomyDir);
            taxonomyWriter = null;
            taxonomyReader = null;

            // directory (default is ram-dir).
            directory = CreateDirectory(eraseIndex, "index", "directory");
            taxonomyDir = CreateDirectory(eraseIndex, "taxo", "taxonomy.directory");

            // inputs
            ResetInputs();

            // release unused stuff
            // NOTE(review): explicit collection mirrors the Java benchmark's System.gc();
            // generally discouraged in .NET production code but intentional for benchmarking.
            GC.Collect();

            // Re-init clock
            SetStartTimeMillis();
        }

        // Creates either an FSDirectory under work.dir or a RAMDirectory, per dirParam.
        private Store.Directory CreateDirectory(bool eraseIndex, string dirName,
            string dirParam)
        {
            if ("FSDirectory".Equals(config.Get(dirParam, "RAMDirectory"), StringComparison.Ordinal))
            {
                DirectoryInfo workDir = new DirectoryInfo(config.Get("work.dir", "work"));
                DirectoryInfo indexDir = new DirectoryInfo(System.IO.Path.Combine(workDir.FullName, dirName));
                if (eraseIndex && indexDir.Exists)
                {
                    FileUtils.FullyDelete(indexDir);
                }
                indexDir.Create();
                return FSDirectory.Open(indexDir);
            }

            return new RAMDirectory();
        }

        /// <summary>
        /// Returns an object that was previously set by <see cref="SetPerfObject(string, object)"/>.
        /// </summary>
        public virtual object GetPerfObject(string key)
        {
            // NOTE(review): lock (this) is used throughout this class; a private lock
            // object would be safer against external lock contention - confirm before changing.
            lock (this)
            {
                object result;
                perfObjects.TryGetValue(key, out result);
                return result;
            }
        }

        /// <summary>
        /// Sets an object that is required by <see cref="PerfTask"/>s, keyed by the given
        /// <paramref name="key"/>. If the object implements <see cref="IDisposable"/>, it will be disposed
        /// by <see cref="Dispose()"/>.
        /// </summary>
        public virtual void SetPerfObject(string key, object obj)
        {
            lock (this)
            {
                perfObjects[key] = obj;
            }
        }

        /// <summary>
        /// Resets the run's start clock to now and returns the new value.
        /// </summary>
        public virtual long SetStartTimeMillis()
        {
            startTimeMillis = Support.Time.CurrentTimeMilliseconds();
            return startTimeMillis;
        }

        /// <summary>
        /// Gets start time in milliseconds.
        /// </summary>
        public virtual long StartTimeMillis
        {
            get { return startTimeMillis; }
        }

        /// <summary>
        /// Gets the points.
        /// </summary>
        public virtual Points Points
        {
            get { return points; }
        }

        /// <summary>
        /// Gets or sets the directory.
        /// </summary>
        public virtual Store.Directory Directory
        {
            get { return directory; }
            set { directory = value; }
        }

        /// <summary>
        /// Gets the taxonomy directory.
        /// </summary>
        public virtual Store.Directory TaxonomyDir
        {
            get { return taxonomyDir; }
        }

        /// <summary>
        /// Set the taxonomy reader. Takes ownership of that taxonomy reader, that is,
        /// internally performs taxoReader.IncRef() (If caller no longer needs that
        /// reader it should DecRef()/Dispose() it after calling this method, otherwise,
        /// the reader will remain open).
        /// </summary>
        /// <param name="taxoReader">The taxonomy reader to set.</param>
        public virtual void SetTaxonomyReader(TaxonomyReader taxoReader)
        {
            lock (this)
            {
                if (taxoReader == this.taxonomyReader)
                {
                    return;
                }
                // Release the previously-held reader before taking a reference on the new one.
                if (taxonomyReader != null)
                {
                    taxonomyReader.DecRef();
                }

                if (taxoReader != null)
                {
                    taxoReader.IncRef();
                }
                this.taxonomyReader = taxoReader;
            }
        }

        /// <summary>
        /// Returns the taxonomyReader. NOTE: this returns a
        /// reference. You must call TaxonomyReader.DecRef() when
        /// you're done.
        /// </summary>
        public virtual TaxonomyReader GetTaxonomyReader()
        {
            lock (this)
            {
                if (taxonomyReader != null)
                {
                    taxonomyReader.IncRef();
                }
                return taxonomyReader;
            }
        }

        /// <summary>
        /// Gets or sets the taxonomy writer.
        /// </summary>
        public virtual ITaxonomyWriter TaxonomyWriter
        {
            get { return taxonomyWriter; }
            set { taxonomyWriter = value; }
        }

        /// <summary>
        /// Returns the indexReader. NOTE: this returns a
        /// reference. You must call IndexReader.DecRef() when
        /// you're done.
        /// </summary>
        public virtual DirectoryReader GetIndexReader()
        {
            lock (this)
            {
                if (indexReader != null)
                {
                    indexReader.IncRef();
                }
                return indexReader;
            }
        }

        /// <summary>
        /// Returns the indexSearcher. NOTE: this returns
        /// a reference to the underlying IndexReader. You must
        /// call IndexReader.DecRef() when you're done.
        /// </summary>
        /// <returns>The current searcher, or null when no index reader is open.</returns>
        public virtual IndexSearcher GetIndexSearcher()
        {
            lock (this)
            {
                if (indexReader != null)
                {
                    indexReader.IncRef();
                }
                return indexSearcher;
            }
        }

        /// <summary>
        /// Set the index reader. Takes ownership of that index reader, that is,
        /// internally performs indexReader.incRef() (If caller no longer needs that
        /// reader it should decRef()/close() it after calling this method, otherwise,
        /// the reader will remain open).
        /// </summary>
        /// <param name="indexReader">The indexReader to set.</param>
        public virtual void SetIndexReader(DirectoryReader indexReader)
        {
            lock (this)
            {
                if (indexReader == this.indexReader)
                {
                    return;
                }

                if (this.indexReader != null)
                {
                    // Release current IR
                    this.indexReader.DecRef();
                }

                this.indexReader = indexReader;
                if (indexReader != null)
                {
                    // Hold reference to new IR
                    indexReader.IncRef();
                    indexSearcher = new IndexSearcher(indexReader);
                }
                else
                {
                    indexSearcher = null;
                }
            }
        }

        /// <summary>
        /// Gets or sets the indexWriter.
        /// </summary>
        public virtual IndexWriter IndexWriter
        {
            get { return indexWriter; }
            set { indexWriter = value; }
        }

        /// <summary>
        /// Gets or sets the analyzer.
        /// </summary>
        public virtual Analyzer Analyzer
        {
            get { return analyzer; }
            set { analyzer = value; }
        }

        /// <summary>Gets the <see cref="Feeds.ContentSource"/>.</summary>
        public virtual ContentSource ContentSource
        {
            get { return contentSource; }
        }

        /// <summary>Returns the <see cref="Feeds.DocMaker"/>.</summary>
        public virtual DocMaker DocMaker
        {
            get { return docMaker; }
        }

        /// <summary>Gets the <see cref="Feeds.FacetSource"/>.</summary>
        public virtual FacetSource FacetSource
        {
            get { return facetSource; }
        }

        /// <summary>
        /// Gets or sets the culture.
        /// </summary>
        public virtual CultureInfo Locale // LUCENENET TODO: API Is this really needed since we have on the thread already?
        {
            get { return locale; }
            set { locale = value; }
        }

        /// <summary>
        /// Gets the config.
        /// </summary>
        public virtual Config Config
        {
            get { return config; }
        }

        /// <summary>
        /// Resets the content source, doc maker, facet source and all query makers
        /// so the run can iterate their inputs from the beginning again.
        /// </summary>
        public virtual void ResetInputs()
        {
            contentSource.ResetInputs();
            docMaker.ResetInputs();
            facetSource.ResetInputs();
            foreach (IQueryMaker queryMaker in readTaskQueryMaker.Values)
            {
                queryMaker.ResetInputs();
            }
        }

        /// <summary>
        /// Returns the queryMaker by read task type (class).
        /// </summary>
        public virtual IQueryMaker GetQueryMaker(ReadTask readTask)
        {
            lock (this)
            {
                // mapping the query maker by task class allows extending/adding new search/read tasks
                // without needing to modify this class.
                Type readTaskClass = readTask.GetType();
                IQueryMaker qm;
                if (!readTaskQueryMaker.TryGetValue(readTaskClass, out qm) || qm == null)
                {
                    try
                    {
                        //qm = qmkrClass.newInstance();
                        qm = (IQueryMaker)Activator.CreateInstance(qmkrClass);
                        qm.SetConfig(config);
                    }
                    catch (Exception e)
                    {
                        throw new Exception(e.ToString(), e);
                    }
                    readTaskQueryMaker[readTaskClass] = qm;
                }
                return qm;
            }
        }

        /// <summary>Gets the named <see cref="AnalyzerFactory"/> instances for this run.</summary>
        public virtual IDictionary<string, AnalyzerFactory> AnalyzerFactories
        {
            get { return analyzerFactories; }
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Programmatic/Sample.cs ----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/ByTask/Programmatic/Sample.cs b/src/Lucene.Net.Benchmark/ByTask/Programmatic/Sample.cs
new file mode 100644
index 0000000..6b248f6
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/ByTask/Programmatic/Sample.cs
@@ -0,0 +1,90 @@
using Lucene.Net.Benchmarks.ByTask.Tasks;
using Lucene.Net.Benchmarks.ByTask.Utils;
using Lucene.Net.Support;
using System.Collections.Generic;

namespace
Lucene.Net.Benchmarks.ByTask.Programmatic
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements. See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License. You may obtain a copy of the License at
     *
     * http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Sample performance test written programmatically - no algorithm file is needed here.
    /// </summary>
    public class Sample
    {
        /// <summary>
        /// Builds and runs a benchmark algorithm in code: create an index, add
        /// 500 documents, close the index, then report the results by task name.
        /// </summary>
        /// <param name="args">Command line arguments (unused).</param>
        public static void Main(string[] args)
        {
            var p = InitProps();
            Config conf = new Config(p);
            PerfRunData runData = new PerfRunData(conf);

            // 1. top sequence
            TaskSequence top = new TaskSequence(runData, null, null, false); // top level, not parallel

            // 2. task to create the index
            CreateIndexTask create = new CreateIndexTask(runData);
            top.AddTask(create);

            // 3. task seq to add 500 docs (order matters - top to bottom - add seq to top, only then add to seq)
            TaskSequence seq1 = new TaskSequence(runData, "AddDocs", top, false);
            seq1.SetRepetitions(500);
            seq1.SetNoChildReport();
            top.AddTask(seq1);

            // 4. task to add the doc
            AddDocTask addDoc = new AddDocTask(runData);
            //addDoc.setParams("1200"); // doc size limit if supported
            seq1.AddTask(addDoc); // order matters (see comment above)

            // 5. task to close the index
            CloseIndexTask close = new CloseIndexTask(runData);
            top.AddTask(close);

            // task to report
            RepSumByNameTask rep = new RepSumByNameTask(runData);
            top.AddTask(rep);

            // print algorithm
            SystemConsole.WriteLine(top.ToString());

            // execute
            top.DoLogic();
        }

        // Sample programmatic settings. Could also read from file.
        /// <summary>
        /// Creates the benchmark configuration properties used by <see cref="Main"/>.
        /// </summary>
        /// <returns>A mutable dictionary of benchmark property names to values.</returns>
        private static IDictionary<string, string> InitProps()
        {
            var p = new Dictionary<string, string>();
            p["task.max.depth.log"] = "3";
            p["max.buffered"] = "buf:10:10:100:100:10:10:100:100";
            // BUGFIX: ReutersContentSource is a ContentSource, not a DocMaker, so it
            // must be registered under "content.source" ("doc.maker" expects a DocMaker
            // implementation). This matches the upstream Java Sample.java.
            p["content.source"] = "Lucene.Net.Benchmarks.ByTask.Feeds.ReutersContentSource, Lucene.Net.Benchmark";
            p["log.step"] = "2000";
            p["doc.delete.step"] = "8";
            p["analyzer"] = "Lucene.Net.Analysis.Standard.StandardAnalyzer, Lucene.Net.Analysis.Common";
            p["doc.term.vector"] = "false";
            p["directory"] = "FSDirectory";
            p["query.maker"] = "Lucene.Net.Benchmarks.ByTask.Feeds.ReutersQueryMaker, Lucene.Net.Benchmark";
            p["doc.stored"] = "true";
            p["docs.dir"] = "reuters-out";
            p["compound"] = "cmpnd:true:true:true:true:false:false:false:false";
            p["doc.tokenized"] = "true";
            p["merge.factor"] = "mrg:10:100:10:100:10:100:10:100";
            return p;
        }
    }
}
