// LUCENENET TODO: Use HTML Agility pack instead of SAX ?

using Lucene.Net.Benchmarks.ByTask.Utils;
using Lucene.Net.Support.Threading;
using Lucene.Net.Util;
using Sax.Net;
using Sax.Net.Helpers;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;

namespace Lucene.Net.Benchmarks.ByTask.Feeds
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// A <see cref="ContentSource"/> which reads the English Wikipedia dump. You can read
    /// the <c>.bz2</c> file directly (it will be decompressed on the fly). Config
    /// properties:
    /// <list type="bullet">
    /// <item><term>keep.image.only.docs</term><description>false|true (default <b>true</b>).</description></item>
    /// <item><term>docs.file</term><description>&lt;path to the file&gt;</description></item>
    /// </list>
    /// </summary>
    public class EnwikiContentSource : ContentSource
    {
        /// <summary>
        /// SAX content handler that parses the wiki dump on a background thread and
        /// hands completed (title, date, body, id) tuples to <see cref="Next()"/>
        /// through a single-slot producer/consumer exchange guarded by <c>lock (this)</c>
        /// with <see cref="Monitor.Wait(object)"/>/<see cref="Monitor.Pulse(object)"/>.
        /// </summary>
        private class Parser : DefaultHandler
        {
            private ThreadClass t;
            private bool threadDone;
            private bool stopped = false;
            private string[] tuple;          // single-slot exchange between producer and consumer
            private NoMoreDataException nmde;
            private StringBuilder contents = new StringBuilder();
            private string title;
            private string body;
            private string time;
            private string id;

            private readonly EnwikiContentSource outerInstance;

            public Parser(EnwikiContentSource outerInstance)
            {
                this.outerInstance = outerInstance;
            }

            /// <summary>
            /// Returns the next (title, date, body, id) tuple, blocking until the
            /// background parsing thread produces one.
            /// </summary>
            /// <exception cref="NoMoreDataException">when the input is exhausted (and not in forever mode).</exception>
            internal string[] Next()
            {
                if (t == null)
                {
                    threadDone = false;
                    // LUCENENET: Run must be passed as the thread delegate. The original
                    // port created ThreadClass() with no target, so the background thread
                    // started, did nothing and exited without ever executing Run() or
                    // setting threadDone, leaving this method blocked forever in Wait.
                    t = new ThreadClass(Run);
                    t.SetDaemon(true);
                    t.Start();
                }
                string[] result;
                lock (this)
                {
                    // Wait until the producer publishes a tuple, signals end of data,
                    // finishes, or we are stopped.
                    while (tuple == null && nmde == null && !threadDone && !stopped)
                    {
                        try
                        {
                            Monitor.Wait(this);
                        }
                        catch (ThreadInterruptedException ie)
                        {
                            throw new ThreadInterruptedException(ie.ToString(), ie);
                        }
                    }
                    if (tuple != null)
                    {
                        result = tuple;
                        tuple = null;
                        Monitor.Pulse(this); // let the producer publish the next tuple
                        return result;
                    }
                    if (nmde != null)
                    {
                        // Set to null so we will re-start thread in case
                        // we are re-used:
                        t = null;
                        throw nmde;
                    }
                    // The thread has exited yet did not hit end of
                    // data, so this means it hit an exception. We
                    // throw NoMoreDataException here to force the
                    // benchmark to stop the current alg:
                    throw new NoMoreDataException();
                }
            }

            /// <summary>
            /// Converts a wiki timestamp (<c>yyyy-MM-ddTHH:mm:ssZ</c>) into the
            /// <c>dd-MMM-yyyy HH:mm:ss.000</c> form used by the benchmark.
            /// </summary>
            internal string Time(string original)
            {
                StringBuilder buffer = new StringBuilder();

                buffer.Append(original.Substring(8, 10 - 8));   // day
                buffer.Append('-');
                buffer.Append(months[Convert.ToInt32(original.Substring(5, 7 - 5), CultureInfo.InvariantCulture) - 1]); // month name
                buffer.Append('-');
                buffer.Append(original.Substring(0, 4 - 0));    // year
                buffer.Append(' ');
                buffer.Append(original.Substring(11, 19 - 11)); // HH:mm:ss
                buffer.Append(".000");

                return buffer.ToString();
            }

            public override void Characters(char[] ch, int start, int length)
            {
                contents.Append(ch, start, length);
            }

            public override void EndElement(string @namespace, string simple, string qualified)
            {
                int elemType = GetElementType(qualified);
                switch (elemType)
                {
                    case PAGE:
                        // the body must NOT be null and we either are keeping image docs
                        // or the title does not start with Image:
                        if (body != null && (outerInstance.keepImages || !title.StartsWith("Image:", StringComparison.Ordinal)))
                        {
                            string[] tmpTuple = new string[LENGTH];
                            tmpTuple[TITLE] = title.Replace('\t', ' ');
                            tmpTuple[DATE] = time.Replace('\t', ' ');
                            tmpTuple[BODY] = Regex.Replace(body, "[\t\n]", " ");
                            tmpTuple[ID] = id;
                            lock (this)
                            {
                                // single-slot handoff: wait until the consumer takes the
                                // previous tuple before publishing this one
                                while (tuple != null && !stopped)
                                {
                                    try
                                    {
                                        Monitor.Wait(this);
                                    }
                                    catch (ThreadInterruptedException ie)
                                    {
                                        throw new ThreadInterruptedException(ie.ToString(), ie);
                                    }
                                }
                                tuple = tmpTuple;
                                Monitor.Pulse(this);
                            }
                        }
                        break;
                    case BODY:
                        body = contents.ToString();
                        // workaround that StartsWith doesn't have an ignore-case option:
                        // lower-case the first 10 chars, enough to cover "#redirect".
                        string startsWith = body.Substring(0, Math.Min(10, contents.Length)).ToLowerInvariant();
                        if (startsWith.StartsWith("#redirect", StringComparison.Ordinal))
                        {
                            body = null;
                        }
                        break;
                    case DATE:
                        time = Time(contents.ToString());
                        break;
                    case TITLE:
                        title = contents.ToString();
                        break;
                    case ID:
                        // the doc id is the first one in the page. All other ids after that one can be ignored according to the schema
                        if (id == null)
                        {
                            id = contents.ToString();
                        }
                        break;
                    default:
                        // this element should be discarded.
                        break;
                }
            }

            /// <summary>
            /// Background producer loop: parses the XML stream, re-opening the input
            /// when <c>content.source.forever</c> is set; always signals completion
            /// from the finally block so <see cref="Next()"/> cannot block forever.
            /// </summary>
            public void Run()
            {
                try
                {
                    Sax.Net.IXmlReader reader = XmlReaderFactory.Current.CreateXmlReader();
                    reader.ContentHandler = this;
                    reader.ErrorHandler = this;
                    while (!stopped)
                    {
                        Stream localFileIS = outerInstance.@is;
                        if (localFileIS != null)
                        { // null means fileIS was closed on us
                            try
                            {
                                // To work around a bug in XERCES (XERCESJ-1257), we assume the
                                // XML is always UTF8, so we simply provide reader.
                                reader.Parse(new InputSource(IOUtils.GetDecodingReader(localFileIS, Encoding.UTF8)));
                            }
                            catch (IOException)
                            {
                                lock (outerInstance)
                                {
                                    if (localFileIS != outerInstance.@is)
                                    {
                                        // fileIS was closed on us, so, just fall through
                                    }
                                    else
                                    {
                                        // Exception is real; rethrow preserving the stack trace
                                        throw;
                                    }
                                }
                            }
                        }
                        lock (this)
                        {
                            if (stopped || !outerInstance.m_forever)
                            {
                                nmde = new NoMoreDataException();
                                Monitor.Pulse(this);
                                return;
                            }
                            else if (localFileIS == outerInstance.@is)
                            {
                                // If file is not already re-opened then re-open it now
                                outerInstance.@is = outerInstance.OpenInputStream();
                            }
                        }
                    }
                }
                catch (SAXException sae)
                {
                    throw new Exception(sae.ToString(), sae);
                }
                catch (IOException ioe)
                {
                    throw new Exception(ioe.ToString(), ioe);
                }
                finally
                {
                    lock (this)
                    {
                        // Always signal completion so the consumer wakes up.
                        threadDone = true;
                        Monitor.Pulse(this);
                    }
                }
            }

            public override void StartElement(string @namespace, string simple, string qualified,
                IAttributes attributes)
            {
                int elemType = GetElementType(qualified);
                switch (elemType)
                {
                    case PAGE:
                        // a new page resets all per-page state; the per-field buffer is
                        // reset when each field element starts below
                        title = null;
                        body = null;
                        time = null;
                        id = null;
                        break;
                    case BODY:
                    case DATE:
                    case TITLE:
                    case ID:
                        // start buffering character data for this field
                        contents.Length = 0;
                        break;
                    default:
                        // this element should be discarded.
                        break;
                }
            }

            /// <summary>Stops the parser and unblocks a consumer waiting on a tuple.</summary>
            internal void Stop()
            {
                lock (this)
                {
                    stopped = true;
                    if (tuple != null)
                    {
                        tuple = null;
                        Monitor.Pulse(this);
                    }
                }
            }
        }

        private static readonly IDictionary<string, int?> ELEMENTS = new Dictionary<string, int?>();
        private const int TITLE = 0;
        private const int DATE = TITLE + 1;
        private const int BODY = DATE + 1;
        private const int ID = BODY + 1;
        private const int LENGTH = ID + 1;
        // LENGTH is used as the size of the tuple, so whatever constants we need that
        // should not be part of the tuple, we should define them after LENGTH.
        private const int PAGE = LENGTH + 1;

        private static readonly string[] months = {"JAN", "FEB", "MAR", "APR",
                                                   "MAY", "JUN", "JUL", "AUG",
                                                   "SEP", "OCT", "NOV", "DEC"};

        static EnwikiContentSource()
        {
            ELEMENTS["page"] = PAGE;
            ELEMENTS["text"] = BODY;
            ELEMENTS["timestamp"] = DATE;
            ELEMENTS["title"] = TITLE;
            ELEMENTS["id"] = ID;
        }

        public EnwikiContentSource()
        {
            parser = new Parser(this);
        }

        /// <summary>
        /// Returns the type of the element if defined, otherwise returns -1. This
        /// method is useful in startElement and endElement, by not needing to compare
        /// the element qualified name over and over.
        /// </summary>
        private static int GetElementType(string elem)
        {
            int? val;
            ELEMENTS.TryGetValue(elem, out val);
            return val == null ? -1 : val.Value;
        }

        private FileInfo file;
        private bool keepImages = true;
        private Stream @is;
        private Parser parser;

        protected override void Dispose(bool disposing)
        {
            if (disposing)
            {
                lock (this)
                {
                    parser.Stop();
                    if (@is != null)
                    {
                        @is.Dispose();
                        @is = null;
                    }
                }
            }
        }

        /// <summary>Fills <paramref name="docData"/> with the next parsed wiki page.</summary>
        public override DocData GetNextDocData(DocData docData)
        {
            string[] tuple = parser.Next();
            docData.Clear();
            docData.Name = tuple[ID];
            docData.Body = tuple[BODY];
            docData.SetDate(tuple[DATE]);
            docData.Title = tuple[TITLE];
            return docData;
        }

        public override void ResetInputs()
        {
            base.ResetInputs();
            @is = OpenInputStream();
        }

        /// <summary>Open the input stream.</summary>
        protected virtual Stream OpenInputStream()
        {
            return StreamUtils.GetInputStream(file);
        }

        public override void SetConfig(Config config)
        {
            base.SetConfig(config);
            keepImages = config.Get("keep.image.only.docs", true);
            string fileName = config.Get("docs.file", null);
            if (fileName != null)
            {
                file = new FileInfo(fileName);
            }
        }
    }
}
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Benchmarks.ByTask.Tasks;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers.Classic;
using Lucene.Net.Search;
using Lucene.Net.Search.Spans;
using Lucene.Net.Support;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
using System.Linq;

namespace Lucene.Net.Benchmarks.ByTask.Feeds
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// A QueryMaker that uses common and uncommon actual Wikipedia queries for
    /// searching the English Wikipedia collection. 90 queries total.
    /// </summary>
    public class EnwikiQueryMaker : AbstractQueryMaker, IQueryMaker
    {
        // common and a few uncommon queries from wikipedia search logs
        // LUCENENET: made readonly - this shared static array must never be reassigned
        private static readonly string[] STANDARD_QUERIES = { "Images catbox gif",
            "Imunisasi haram", "Favicon ico", "Michael jackson", "Unknown artist",
            "Lily Thai", "Neda", "The Last Song", "Metallica", "Nicola Tesla",
            "Max B", "Skil Corporation", "\"The 100 Greatest Artists of All Time\"",
            "\"Top 100 Global Universities\"", "Pink floyd", "Bolton Sullivan",
            "Frank Lucas Jr", "Drake Woods", "Radiohead", "George Freeman",
            "Oksana Grigorieva", "The Elder Scrolls V", "Deadpool", "Green day",
            "\"Red hot chili peppers\"", "Jennifer Bini Taylor",
            "The Paradiso Girls", "Queen", "3Me4Ph", "Paloma Jimenez", "AUDI A4",
            "Edith Bouvier Beale: A Life In Pictures", "\"Skylar James Deleon\"",
            "Simple Explanation", "Juxtaposition", "The Woody Show", "London WITHER",
            "In A Dark Place", "George Freeman", "LuAnn de Lesseps", "Muhammad.",
            "U2", "List of countries by GDP", "Dean Martin Discography", "Web 3.0",
            "List of American actors", "The Expendables",
            "\"100 Greatest Guitarists of All Time\"", "Vince Offer.",
            "\"List of ZIP Codes in the United States\"", "Blood type diet",
            "Jennifer Gimenez", "List of hobbies", "The beatles", "Acdc",
            "Nightwish", "Iron maiden", "Murder Was the Case", "Pelvic hernia",
            "Naruto Shippuuden", "campaign", "Enthesopathy of hip region",
            "operating system", "mouse",
            "List of Xbox 360 games without region encoding", "Shakepearian sonnet",
            "\"The Monday Night Miracle\"", "India", "Dad's Army",
            "Solanum melanocerasum", "\"List of PlayStation Portable Wi-Fi games\"",
            "Little Pixie Geldof", "Planes, Trains & Automobiles", "Freddy Ingalls",
            "The Return of Chef", "Nehalem", "Turtle", "Calculus", "Superman-Prime",
            "\"The Losers\"", "pen-pal", "Audio stream input output", "lifehouse",
            "50 greatest gunners", "Polyfecalia", "freeloader", "The Filthy Youth" };

        /// <summary>
        /// Builds a small set of hand-constructed span/wildcard queries against
        /// <paramref name="field"/>. These exercise query types the text queries don't.
        /// </summary>
        private static Query[] GetPrebuiltQueries(string field)
        {
            WildcardQuery wcq = new WildcardQuery(new Term(field, "fo*"));
            wcq.MultiTermRewriteMethod = MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE;
            // be wary of unanalyzed text
            return new Query[] {
                new SpanFirstQuery(new SpanTermQuery(new Term(field, "ford")), 5),
                new SpanNearQuery(new SpanQuery[] {
                    new SpanTermQuery(new Term(field, "night")),
                    new SpanTermQuery(new Term(field, "trading")) }, 4, false),
                new SpanNearQuery(new SpanQuery[] {
                    new SpanFirstQuery(new SpanTermQuery(new Term(field, "ford")), 10),
                    new SpanTermQuery(new Term(field, "credit")) }, 10, false), wcq, };
        }

        /// <summary>
        /// Parse the strings containing Lucene queries.
        /// </summary>
        /// <param name="qs">array of strings containing query expressions, or pre-built <see cref="Query"/> objects</param>
        /// <param name="a">analyzer to use when parsing queries</param>
        /// <returns>array of Lucene queries</returns>
        private static Query[] CreateQueries(IList<object> qs, Analyzer a)
        {
            QueryParser qp = new QueryParser(
#pragma warning disable 612, 618
                LuceneVersion.LUCENE_CURRENT,
#pragma warning restore 612, 618
                DocMaker.BODY_FIELD, a);
            IList<Query> queries = new List<Query>();
            for (int i = 0; i < qs.Count; i++)
            {
                try
                {
                    object query = qs[i];
                    Query q = null;
                    if (query is string)
                    {
                        q = qp.Parse((string)query);
                    }
                    else if (query is Query)
                    {
                        q = (Query)query;
                    }
                    else
                    {
                        SystemConsole.WriteLine("Unsupported Query Type: " + query);
                    }

                    if (q != null)
                    {
                        queries.Add(q);
                    }
                }
                catch (Exception e)
                {
                    // benchmark queries are best-effort: log and skip unparsable entries
                    SystemConsole.WriteLine(e.ToString());
                }
            }

            return queries.ToArray();
        }

        /// <summary>
        /// Prepares the full query set: the standard wiki query strings plus
        /// (unless disabled) the prebuilt span/wildcard queries.
        /// </summary>
        protected override Query[] PrepareQueries()
        {
            // analyzer (default is standard analyzer)
            Analyzer anlzr = NewAnalyzerTask.CreateAnalyzer(m_config.Get("analyzer", typeof(StandardAnalyzer).AssemblyQualifiedName));

            List<object> queryList = new List<object>(20);
            queryList.AddRange(STANDARD_QUERIES);
            if (!m_config.Get("enwikiQueryMaker.disableSpanQueries", false))
                queryList.AddRange(GetPrebuiltQueries(DocMaker.BODY_FIELD));
            return CreateQueries(queryList, anlzr);
        }
    }
}

using Lucene.Net.Facet;
using System.Collections.Generic;

namespace Lucene.Net.Benchmarks.ByTask.Feeds
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Source items for facets.
    /// <para/>
    /// For supported configuration parameters see <see cref="ContentItemsSource"/>.
    /// </summary>
    public abstract class FacetSource : ContentItemsSource
    {
        /// <summary>
        /// Fills the next facets content items in the given list. Implementations must
        /// account for multi-threading, as multiple threads can call this method
        /// simultaneously.
        /// </summary>
        public abstract void GetNextFacets(IList<FacetField> facets);

        public abstract void Configure(FacetsConfig config);

        public override void ResetInputs()
        {
            PrintStatistics("facets");
            // re-initiate since properties by round may have changed.
            SetConfig(Config);
            base.ResetInputs();
        }
    }
}
using Lucene.Net.Analysis;
using Lucene.Net.Benchmarks.ByTask.Tasks;
using Lucene.Net.QueryParsers.Classic;
using Lucene.Net.Search;
using Lucene.Net.Support;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
using System.IO;
using System.Reflection;
using System.Text;

namespace Lucene.Net.Benchmarks.ByTask.Feeds
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Create queries from a <see cref="FileStream"/>. One per line, pass them through the
    /// QueryParser. Lines beginning with # are treated as comments.
    /// </summary>
    /// <remarks>
    /// File can be specified as a absolute, relative or resource.
    /// Two properties can be set:
    /// <list type="bullet">
    /// <item><term>file.query.maker.file</term><description>&lt;Full path to file containing queries&gt;</description></item>
    /// <item><term>file.query.maker.default.field</term><description>&lt;Name of default field - Default value is "body"&gt;</description></item>
    /// </list>
    /// <para/>
    /// Example:
    /// <code>
    /// file.query.maker.file=c:/myqueries.txt
    /// file.query.maker.default.field=body
    /// </code>
    /// </remarks>
    public class FileBasedQueryMaker : AbstractQueryMaker, IQueryMaker
    {
        /// <summary>
        /// Reads one query expression per line from the configured file (or embedded
        /// resource), skipping blanks and # comments, and parses each with QueryParser.
        /// Unparsable lines are reported to stderr and skipped.
        /// </summary>
        protected override Query[] PrepareQueries()
        {
            Analyzer anlzr = NewAnalyzerTask.CreateAnalyzer(m_config.Get("analyzer",
                "Lucene.Net.Analysis.Standard.StandardAnalyzer, Lucene.Net.Analysis.Common"));
            string defaultField = m_config.Get("file.query.maker.default.field", DocMaker.BODY_FIELD);
            QueryParser qp = new QueryParser(
#pragma warning disable 612, 618
                LuceneVersion.LUCENE_CURRENT,
#pragma warning restore 612, 618
                defaultField, anlzr);
            qp.AllowLeadingWildcard = true;

            List<Query> qq = new List<Query>();
            string fileName = m_config.Get("file.query.maker.file", null);
            if (fileName != null)
            {
                FileInfo file = new FileInfo(fileName);
                TextReader reader = null;
                // note: we use a decoding reader, so if your queries are screwed up you know
                if (file.Exists)
                {
                    reader = IOUtils.GetDecodingReader(file, Encoding.UTF8);
                }
                else
                {
                    // see if we can find it as a resource
                    Stream asStream = typeof(FileBasedQueryMaker).GetTypeInfo().Assembly.FindAndGetManifestResourceStream(typeof(FileBasedQueryMaker), fileName);
                    if (asStream != null)
                    {
                        reader = IOUtils.GetDecodingReader(asStream, Encoding.UTF8);
                    }
                }
                if (reader != null)
                {
                    try
                    {
                        string line = null;
                        int lineNum = 0;
                        while ((line = reader.ReadLine()) != null)
                        {
                            line = line.Trim();
                            if (line.Length != 0 && !line.StartsWith("#", StringComparison.Ordinal))
                            {
                                try
                                {
                                    qq.Add(qp.Parse(line));
                                }
                                catch (ParseException e)
                                {
                                    SystemConsole.Error.WriteLine("Exception: " + e.Message + " occurred while parsing line: " + lineNum + " Text: " + line);
                                }
                            }
                            lineNum++;
                        }
                    }
                    finally
                    {
                        reader.Dispose();
                    }
                }
                else
                {
                    SystemConsole.Error.WriteLine("No Reader available for: " + fileName);
                }
            }
            return qq.ToArray();
        }
    }
}

using System;
using System.Globalization;
using System.Text.RegularExpressions;

namespace Lucene.Net.Benchmarks.ByTask.Feeds
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// A line parser for Geonames.org data.
    /// See <a href="http://download.geonames.org/export/dump/readme.txt">'geoname' table</a>.
    /// Requires <see cref="SpatialDocMaker"/>.
    /// </summary>
    public class GeonamesLineParser : LineParser
    {
        // LUCENENET: hoisted - the original allocated a new Regex per parsed line
        private static readonly Regex TAB = new Regex("\\t", RegexOptions.Compiled);

        /// <summary>
        /// This header will be ignored; the geonames format is fixed and doesn't have a header line.
        /// </summary>
        public GeonamesLineParser(string[] header)
            : base(header)
        {
        }

        public override void ParseLine(DocData docData, string line)
        {
            string[] parts = TAB.Split(line, 7); // no more than first 6 fields needed

            // Sample data line:
            // 3578267, Morne du Vitet, Morne du Vitet, 17.88333, -62.8, ...
            // ID, Name, Alternate name (unused), Lat, Lon, ...

            // LUCENENET: parse with the invariant culture - the data file's digits are
            // culture-neutral, so the current culture must not influence parsing (CA1305)
            docData.ID = Convert.ToInt32(parts[0], CultureInfo.InvariantCulture); // note: overwrites ID assigned by LineDocSource
            docData.Name = parts[1];
            string latitude = parts[4];
            string longitude = parts[5];
            docData.Body = "POINT(" + longitude + " " + latitude + ")"; // WKT is x y order
        }
    }
}
using System;
using System.IO;

namespace Lucene.Net.Benchmarks.ByTask.Feeds
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// HTML Parsing Interface for test purposes.
    /// </summary>
    public interface IHTMLParser
    {
        /// <summary>
        /// Parses the HTML text supplied by <paramref name="reader"/> into the given
        /// <see cref="DocData"/>. The supplied name, title and date take precedence in
        /// the result; any of them that is <c>null</c> is filled in, where possible,
        /// from the parsed content instead.
        /// </summary>
        /// <param name="docData">Result reused.</param>
        /// <param name="name">Name of the result doc data.</param>
        /// <param name="date">Date of the result doc data. If null, attempt to set by parsed data.</param>
        /// <param name="reader">Reader of html text to parse.</param>
        /// <param name="trecSrc">The <see cref="TrecContentSource"/> used to parse dates.</param>
        /// <returns>Parsed doc data.</returns>
        /// <exception cref="IOException">If there is a low-level I/O error.</exception>
        DocData Parse(DocData docData, string name, DateTime? date, TextReader reader, TrecContentSource trecSrc);
    }
}
using Lucene.Net.Benchmarks.ByTask.Tasks;
using Lucene.Net.Benchmarks.ByTask.Utils;
using Lucene.Net.Support;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;

namespace Lucene.Net.Benchmarks.ByTask.Feeds
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// A <see cref="ContentSource"/> reading one line at a time as a
    /// <see cref="Documents.Document"/> from a single file. This saves IO
    /// cost (over DirContentSource) of recursing through a directory and opening a
    /// new file for every document.
    /// </summary>
    /// <remarks>
    /// The expected format of each line is (arguments are separated by &lt;TAB&gt;):
    /// <i>title, date, body</i>. If a line is read in a different format, a
    /// <see cref="Exception"/> will be thrown. In general, you should use this
    /// content source for files that were created with <see cref="WriteLineDocTask"/>.
    /// </remarks>
    public class LineDocSource : ContentSource
    {
        // LUCENENET specific - de-nested LineParser, SimpleLineParser, HeaderLineParser

        private FileInfo file;
        private TextReader reader;
        private int readCount;

        private LineParser docDataLineReader = null;
        private bool skipHeaderLine = false;

        /// <summary>
        /// (Re)opens the configured docs file, disposing any previous reader and
        /// discarding the header line when one was detected earlier.
        /// </summary>
        private void OpenFile()
        {
            try
            {
                if (reader != null)
                {
                    reader.Dispose();
                }
                Stream inputStream = StreamUtils.GetInputStream(file);
                reader = new StreamReader(inputStream, m_encoding);
                if (skipHeaderLine)
                {
                    // skip one line - the header line - already handled that info
                    reader.ReadLine();
                }
            }
            catch (IOException e)
            {
                throw new Exception(e.ToString(), e);
            }
        }

        protected override void Dispose(bool disposing)
        {
            if (disposing && reader != null)
            {
                reader.Dispose();
                reader = null;
            }
        }

        /// <summary>
        /// Reads the next line, lazily picking a <see cref="LineParser"/> on the very
        /// first call, and fills <paramref name="docData"/> from it. Thread-safe:
        /// reading and ID assignment happen under a lock; parsing happens outside it.
        /// </summary>
        public override DocData GetNextDocData(DocData docData)
        {
            string line;
            int assignedId;

            lock (this)
            {
                line = reader.ReadLine();
                if (line == null)
                {
                    if (!m_forever)
                    {
                        throw new NoMoreDataException();
                    }
                    // Reset the file
                    OpenFile();
                    return GetNextDocData(docData);
                }
                if (docDataLineReader == null)
                {
                    // first line ever - one-time initialization of the parser
                    docDataLineReader = CreateDocDataLineReader(line);
                    if (skipHeaderLine)
                    {
                        return GetNextDocData(docData);
                    }
                }
                // increment IDs only once, inside the lock
                assignedId = readCount++;
            }

            // The date String was written in the format of DateTools.dateToString.
            docData.Clear();
            docData.ID = assignedId;
            docDataLineReader.ParseLine(docData, line);
            return docData;
        }

        /// <summary>
        /// Picks the parser for this file: a configured <c>line.parser</c> class if
        /// set, otherwise <see cref="SimpleLineParser"/> for the default field layout
        /// or <see cref="HeaderLineParser"/> when a header line declares the fields.
        /// </summary>
        private LineParser CreateDocDataLineReader(string line)
        {
            string[] header;
            string headIndicator = WriteLineDocTask.FIELDS_HEADER_INDICATOR + WriteLineDocTask.SEP;

            if (line.StartsWith(headIndicator, StringComparison.Ordinal))
            {
                header = line.Substring(headIndicator.Length).Split(new char[] { WriteLineDocTask.SEP }).TrimEnd();
                // mark to skip the header line when input file is reopened
                skipHeaderLine = true;
            }
            else
            {
                header = WriteLineDocTask.DEFAULT_FIELDS;
            }

            // if a specific DocDataLineReader was configured, must respect it
            string docDataLineReaderClassName = Config.Get("line.parser", null);
            if (docDataLineReaderClassName != null)
            {
                try
                {
                    Type clazz = Type.GetType(docDataLineReaderClassName);
                    return (LineParser)Activator.CreateInstance(clazz, (object)header);
                }
                catch (Exception e)
                {
                    throw new Exception("Failed to instantiate " + docDataLineReaderClassName, e);
                }
            }

            // the simple case: default field layout
            if (Arrays.Equals(header, WriteLineDocTask.DEFAULT_FIELDS))
            {
                return new SimpleLineParser(header);
            }
            return new HeaderLineParser(header);
        }

        public override void ResetInputs()
        {
            base.ResetInputs();
            OpenFile();
        }

        public override void SetConfig(Config config)
        {
            base.SetConfig(config);
            string fileName = config.Get("docs.file", null);
            if (fileName == null)
            {
                throw new ArgumentException("docs.file must be set");
            }
            file = new FileInfo(fileName);
            if (m_encoding == null)
            {
                m_encoding = Encoding.UTF8;
            }
        }
    }

    /// <summary>Reader of a single input line into <see cref="DocData"/>.</summary>
    public abstract class LineParser
    {
        protected readonly string[] m_header;

        /// <summary>
        /// Construct with the header
        /// </summary>
        /// <param name="header">header line found in the input file, or <c>null</c> if none.</param>
        public LineParser(string[] header)
        {
            this.m_header = header;
        }

        /// <summary>
        /// parse an input line and fill doc data appropriately
        /// </summary>
        public abstract void ParseLine(DocData docData, string line);
    }

    /// <summary>
    /// <see cref="LineParser"/> which ignores the header passed to its constructor
    /// and assumes simply that field names and their order are the same
    /// as in <see cref="WriteLineDocTask.DEFAULT_FIELDS"/>.
    /// </summary>
    public class SimpleLineParser : LineParser
    {
        public SimpleLineParser(string[] header)
            : base(header)
        {
        }

        /// <summary>
        /// Splits the line at exactly two separators into title, date and body,
        /// rejecting lines with too few or too many separators.
        /// </summary>
        public override void ParseLine(DocData docData, string line)
        {
            int start = 0;
            int sep = line.IndexOf(WriteLineDocTask.SEP, start);
            if (sep < 0)
            {
                throw new Exception("line: [" + line + "] is in an invalid format (missing: separator title::date)!");
            }
            docData.Title = line.Substring(start, sep - start);

            start = sep + 1;
            sep = line.IndexOf(WriteLineDocTask.SEP, start);
            if (sep < 0)
            {
                throw new Exception("line: [" + line + "] is in an invalid format (missing: separator date::body)!");
            }
            docData.SetDate(line.Substring(start, sep - start));

            start = sep + 1;
            sep = line.IndexOf(WriteLineDocTask.SEP, start);
            if (sep >= 0)
            {
                throw new Exception("line: [" + line + "] is in an invalid format (too many separators)!");
            }
            // everything after the second separator is the body
            docData.Body = line.Substring(start);
        }
    }

    /// <summary>
    /// <see cref="LineParser"/> which sets field names and order by
    /// the header - any header - of the lines file.
    /// It is less efficient than <see cref="SimpleLineParser"/> but more powerful.
    /// </summary>
    public class HeaderLineParser : LineParser
    {
        private enum FieldName { NAME, TITLE, DATE, BODY, PROP }

        // maps each column position in the header to the DocData field it feeds
        private readonly FieldName[] posToF;

        public HeaderLineParser(string[] header)
            : base(header)
        {
            posToF = new FieldName[header.Length];
            for (int i = 0; i < header.Length; i++)
            {
                string fieldLabel = header[i];
                if (DocMaker.NAME_FIELD.Equals(fieldLabel, StringComparison.Ordinal))
                {
                    posToF[i] = FieldName.NAME;
                }
                else if (DocMaker.TITLE_FIELD.Equals(fieldLabel, StringComparison.Ordinal))
                {
                    posToF[i] = FieldName.TITLE;
                }
                else if (DocMaker.DATE_FIELD.Equals(fieldLabel, StringComparison.Ordinal))
                {
                    posToF[i] = FieldName.DATE;
                }
                else if (DocMaker.BODY_FIELD.Equals(fieldLabel, StringComparison.Ordinal))
                {
                    posToF[i] = FieldName.BODY;
                }
                else
                {
                    // any unrecognized column becomes a named property
                    posToF[i] = FieldName.PROP;
                }
            }
        }

        /// <summary>
        /// Splits the line at each separator and routes every field to the slot the
        /// header assigned to its position; the field count must match the header.
        /// </summary>
        public override void ParseLine(DocData docData, string line)
        {
            int fieldIndex = 0;
            int start = 0;
            int sep;
            while ((sep = line.IndexOf(WriteLineDocTask.SEP, start)) >= 0)
            {
                if (fieldIndex >= m_header.Length)
                {
                    throw new Exception("input line has invalid format: " + (fieldIndex + 1) + " fields instead of " + m_header.Length + " :: [" + line + "]");
                }
                SetDocDataField(docData, fieldIndex, line.Substring(start, sep - start));
                ++fieldIndex;
                start = sep + 1;
            }
            if (fieldIndex != m_header.Length - 1)
            {
                throw new Exception("input line has invalid format: " + (fieldIndex + 1) + " fields instead of " + m_header.Length + " :: [" + line + "]");
            }
            // the remainder of the line is the last field
            SetDocDataField(docData, fieldIndex, line.Substring(start));
        }

        private void SetDocDataField(DocData docData, int position, string text)
        {
            switch (posToF[position])
            {
                case FieldName.NAME:
                    docData.Name = text;
                    break;
                case FieldName.TITLE:
                    docData.Title = text;
                    break;
                case FieldName.DATE:
                    docData.SetDate(text);
                    break;
                case FieldName.BODY:
                    docData.Body = text;
                    break;
                case FieldName.PROP:
                    var p = docData.Props;
                    if (p == null)
                    {
                        p = new Dictionary<string, string>();
                        docData.Props = p;
                    }
                    p[m_header[position]] = text;
                    break;
            }
        }
    }
}
} + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/LongToEnglishContentSource.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/LongToEnglishContentSource.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/LongToEnglishContentSource.cs new file mode 100644 index 0000000..fadab82 --- /dev/null +++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/LongToEnglishContentSource.cs @@ -0,0 +1,72 @@ +using System; +using System.Globalization; + +namespace Lucene.Net.Benchmarks.ByTask.Feeds +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Creates documents whose content is a <see cref="long"/> number starting from + /// <c><see cref="long.MinValue"/> + 10</c>. + /// </summary> + public class Int64ToEnglishContentSource : ContentSource + { + private long counter = 0; + + protected override void Dispose(bool disposing) + { + } + + // TODO: we could take param to specify locale... 
+ //private readonly RuleBasedNumberFormat rnbf = new RuleBasedNumberFormat(Locale.ROOT, + // RuleBasedNumberFormat.SPELLOUT); + public override DocData GetNextDocData(DocData docData) + { + lock (this) + { + docData.Clear(); + // store the current counter to avoid synchronization later on + long curCounter; + lock (this) + { + curCounter = counter; + if (counter == long.MaxValue) + { + counter = long.MinValue;//loop around + } + else + { + ++counter; + } + } + + // LUCENENET TODO: Rules based number formatting...(from ICU) + docData.Body = curCounter.ToString(); //rnbf.format(curCounter); + docData.Name = "doc_" + curCounter.ToString(CultureInfo.InvariantCulture); + docData.Title = "title_" + curCounter.ToString(CultureInfo.InvariantCulture); + docData.SetDate(new DateTime()); + return docData; + } + } + + public override void ResetInputs() + { + counter = long.MinValue + 10; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/LongToEnglishQueryMaker.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/LongToEnglishQueryMaker.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/LongToEnglishQueryMaker.cs new file mode 100644 index 0000000..f565eb8 --- /dev/null +++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/LongToEnglishQueryMaker.cs @@ -0,0 +1,89 @@ +using Lucene.Net.Analysis; +using Lucene.Net.Analysis.Standard; +using Lucene.Net.Benchmarks.ByTask.Tasks; +using Lucene.Net.Benchmarks.ByTask.Utils; +using Lucene.Net.QueryParsers.Classic; +using Lucene.Net.Search; +using Lucene.Net.Util; +using System; + +namespace Lucene.Net.Benchmarks.ByTask.Feeds +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Creates queries whose content is a spelled-out <see cref="long"/> number + /// starting from <c><see cref="long.MinValue"/> + 10</c>. + /// </summary> + public class Int64ToEnglishQueryMaker : IQueryMaker + { + long counter = long.MinValue + 10; + protected QueryParser m_parser; + + //// TODO: we could take param to specify locale... + //private readonly RuleBasedNumberFormat rnbf = new RuleBasedNumberFormat(Locale.ROOT, + // RuleBasedNumberFormat.SPELLOUT); + + public virtual Query MakeQuery(int size) + { + throw new NotSupportedException(); + } + + public virtual Query MakeQuery() + { + lock (this) + { + // LUCENENET TODO: Rules based number formatter (from ICU) + //return parser.Parse("" + rnbf.format(GetNextCounter()) + ""); + return m_parser.Parse(GetNextCounter().ToString()); + } + } + + private long GetNextCounter() + { + lock (this) + { + if (counter == long.MaxValue) + { + counter = long.MinValue + 10; + } + return counter++; + } + } + + public virtual void SetConfig(Config config) + { + Analyzer anlzr = NewAnalyzerTask.CreateAnalyzer(config.Get("analyzer", typeof(StandardAnalyzer).Name)); + m_parser = new QueryParser( +#pragma warning disable 612, 618 + LuceneVersion.LUCENE_CURRENT, +#pragma warning restore 612, 618 + DocMaker.BODY_FIELD, anlzr); + } + + public virtual void ResetInputs() + { + counter = long.MinValue + 10; + } + + public virtual 
string PrintQueries() + { + return "LongToEnglish: [" + long.MinValue + " TO " + counter + "]"; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/NoMoreDataException.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/NoMoreDataException.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/NoMoreDataException.cs new file mode 100644 index 0000000..a7bfbad --- /dev/null +++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/NoMoreDataException.cs @@ -0,0 +1,50 @@ +using System; +#if FEATURE_SERIALIZABLE +using System.Runtime.Serialization; +#endif + +namespace Lucene.Net.Benchmarks.ByTask.Feeds +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Exception indicating there is no more data. + /// Thrown by Docs Makers if <c>doc.maker.forever</c> is <c>false</c> and docs sources of that maker where exhausted. + /// This is useful for iterating all document of a source, in case we don't know in advance how many docs there are. 
+ /// </summary> +#if FEATURE_SERIALIZABLE + [Serializable] +#endif + public class NoMoreDataException : Exception + { + public NoMoreDataException() + { } + +#if FEATURE_SERIALIZABLE + /// <summary> + /// Initializes a new instance of this class with serialized data. + /// </summary> + /// <param name="info">The <see cref="SerializationInfo"/> that holds the serialized object data about the exception being thrown.</param> + /// <param name="context">The <see cref="StreamingContext"/> that contains contextual information about the source or destination.</param> + public NoMoreDataException(SerializationInfo info, StreamingContext context) + : base(info, context) + { + } +#endif + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/QueryMaker.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/QueryMaker.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/QueryMaker.cs new file mode 100644 index 0000000..2ece812 --- /dev/null +++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/QueryMaker.cs @@ -0,0 +1,48 @@ +using Lucene.Net.Benchmarks.ByTask.Utils; +using Lucene.Net.Search; + +namespace Lucene.Net.Benchmarks.ByTask.Feeds +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Create queries for the test. + /// </summary> + public interface IQueryMaker + { + /// <summary> + /// Create the next query, of the given size. + /// </summary> + /// <param name="size">The size of the query - number of terms, etc.</param> + /// <returns></returns> + /// <exception cref="Exception">If cannot make the query, or if size > 0 was specified but this feature is not supported.</exception> + Query MakeQuery(int size); + + /// <summary>Create the next query</summary> + Query MakeQuery(); + + /// <summary>Set the properties</summary> + void SetConfig(Config config); + + /// <summary>Reset inputs so that the test run would behave, input wise, as if it just started.</summary> + void ResetInputs(); + + /// <summary>Print the queries</summary> + string PrintQueries(); + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/RandomFacetSource.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/RandomFacetSource.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/RandomFacetSource.cs new file mode 100644 index 0000000..b956570 --- /dev/null +++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/RandomFacetSource.cs @@ -0,0 +1,109 @@ +using Lucene.Net.Benchmarks.ByTask.Utils; +using Lucene.Net.Facet; +using System; +using System.Collections.Generic; +using System.Globalization; + +namespace Lucene.Net.Benchmarks.ByTask.Feeds +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Simple implementation of a random facet source. + /// </summary> + /// <remarks> + /// Supports the following parameters: + /// <list type="bullet"> + /// <item><term>rand.seed</term><description>defines the seed to initialize <see cref="Random"/> with (default: <b>13</b>).</description></item> + /// <item><term>max.doc.facets.dims</term><description>Max number of random dimensions to create (default: <b>5</b>); + /// actual number of dimensions would be anything between 1 and that number.</description></item> + /// <item><term>max.doc.facets</term><description>maximal #facets per doc (default: <b>10</b>). + /// Actual number of facets in a certain doc would be anything between 1 and that number. + /// </description></item> + /// <item><term>max.facet.depth</term><description>maximal #components in a facet (default: + /// <b>3</b>). Actual number of components in a certain facet would be anything + /// between 1 and that number. 
+ /// </description></item> + /// </list> + /// </remarks> + public class RandomFacetSource : FacetSource + { + private Random random; + private int maxDocFacets; + private int maxFacetDepth; + private int maxDims; + private int maxValue; // = maxDocFacets * maxFacetDepth; + + public override void GetNextFacets(IList<FacetField> facets) + { + facets.Clear(); + int numFacets = 1 + random.Next(maxDocFacets); // at least one facet to each doc + for (int i = 0; i < numFacets; i++) + { + int depth; + if (maxFacetDepth == 2) + { + depth = 2; + } + else + { + depth = 2 + random.Next(maxFacetDepth - 2); // depth < 2 is not useful + } + + string dim = random.Next(maxDims).ToString(CultureInfo.InvariantCulture); + string[] components = new string[depth - 1]; + for (int k = 0; k < depth - 1; k++) + { + components[k] = random.Next(maxValue).ToString(CultureInfo.InvariantCulture); + AddItem(); + } + FacetField ff = new FacetField(dim, components); + facets.Add(ff); + AddBytes(ff.ToString().Length); // very rough approximation + } + } + + public override void Configure(FacetsConfig config) + { + for (int i = 0; i < maxDims; i++) + { + config.SetHierarchical(i.ToString(CultureInfo.InvariantCulture), true); + config.SetMultiValued(i.ToString(CultureInfo.InvariantCulture), true); + } + } + + protected override void Dispose(bool disposing) + { + // nothing to do here + } + + public override void SetConfig(Config config) + { + base.SetConfig(config); + random = new Random(config.Get("rand.seed", 13)); + maxDocFacets = config.Get("max.doc.facets", 10); + maxDims = config.Get("max.doc.facets.dims", 5); + maxFacetDepth = config.Get("max.facet.depth", 3); + if (maxFacetDepth < 2) + { + throw new ArgumentException("max.facet.depth must be at least 2; got: " + maxFacetDepth); + } + maxValue = maxDocFacets * maxFacetDepth; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/ReutersContentSource.cs 
---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/ReutersContentSource.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/ReutersContentSource.cs new file mode 100644 index 0000000..c61ce2f --- /dev/null +++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/ReutersContentSource.cs @@ -0,0 +1,140 @@ +using Lucene.Net.Benchmarks.ByTask.Utils; +using System; +using System.Collections.Generic; +using System.Globalization; +using System.IO; +using System.Text; + +namespace Lucene.Net.Benchmarks.ByTask.Feeds +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// A <see cref="ContentSource"/> reading from the Reuters collection. 
+ /// <para/> + /// Config properties: + /// <list type="bullet"> + /// <item><term><b>work.dir</b></term><description>path to the root of docs and indexes dirs (default <b>work</b>).</description></item> + /// <item><term><b>docs.dir</b></term><description>path to the docs dir (default <b>reuters-out</b>).</description></item> + /// </list> + /// </summary> + public class ReutersContentSource : ContentSource + { + // LUCENENET specific: DateFormatInfo not used + + private DirectoryInfo dataDir = null; + private List<FileInfo> inputFiles = new List<FileInfo>(); + private int nextFile = 0; + private int iteration = 0; + + public override void SetConfig(Config config) + { + base.SetConfig(config); + DirectoryInfo workDir = new DirectoryInfo(config.Get("work.dir", "work")); + string d = config.Get("docs.dir", "reuters-out"); + dataDir = new DirectoryInfo(d); + inputFiles.Clear(); + CollectFiles(dataDir, inputFiles); + if (inputFiles.Count == 0) + { + throw new Exception("No txt files in dataDir: " + dataDir.FullName); + } + } + + // LUCENENET specific: DateFormatInfo not used + + private DateTime? ParseDate(string dateStr) + { + DateTime temp; + if (DateTime.TryParseExact(dateStr, "dd-MMM-yyyy hh:mm:ss.fff", CultureInfo.InvariantCulture, DateTimeStyles.None, out temp)) + { + return temp; + } + else if (DateTime.TryParse(dateStr, CultureInfo.InvariantCulture, DateTimeStyles.None, out temp)) + { + return temp; + } + + return null; + } + + protected override void Dispose(bool disposing) + { + // TODO implement? + } + + public override DocData GetNextDocData(DocData docData) + { + FileInfo f = null; + string name = null; + lock (this) + { + if (nextFile >= inputFiles.Count) + { + // exhausted files, start a new round, unless forever set to false. 
+ if (!m_forever) + { + throw new NoMoreDataException(); + } + nextFile = 0; + iteration++; + } + f = inputFiles[nextFile++]; + name = f.FullName + "_" + iteration; + } + + using (TextReader reader = new StreamReader(new FileStream(f.FullName, FileMode.Open, FileAccess.Read), Encoding.UTF8)) + { + // First line is the date, 3rd is the title, rest is body + string dateStr = reader.ReadLine(); + reader.ReadLine();// skip an empty line + string title = reader.ReadLine(); + reader.ReadLine();// skip an empty line + StringBuilder bodyBuf = new StringBuilder(1024); + string line = null; + while ((line = reader.ReadLine()) != null) + { + bodyBuf.Append(line).Append(' '); + } + reader.Dispose(); + + + AddBytes(f.Length); + + DateTime? date = ParseDate(dateStr.Trim()); + + docData.Clear(); + docData.Name = name; + docData.Body = bodyBuf.ToString(); + docData.Title = title; + docData.SetDate(date); + return docData; + } + } + + public override void ResetInputs() + { + lock (this) + { + base.ResetInputs(); + nextFile = 0; + iteration = 0; + } + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/ReutersQueryMaker.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/ReutersQueryMaker.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/ReutersQueryMaker.cs new file mode 100644 index 0000000..a53a2ec --- /dev/null +++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/ReutersQueryMaker.cs @@ -0,0 +1,126 @@ +using Lucene.Net.Analysis; +using Lucene.Net.Benchmarks.ByTask.Tasks; +using Lucene.Net.Index; +using Lucene.Net.QueryParsers.Classic; +using Lucene.Net.Search; +using Lucene.Net.Search.Spans; +using Lucene.Net.Support; +using Lucene.Net.Util; +using System; +using System.Collections.Generic; + +namespace Lucene.Net.Benchmarks.ByTask.Feeds +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// A <see cref="IQueryMaker"/> that makes queries devised manually (by Grant Ingersoll) for + /// searching in the Reuters collection. + /// </summary> + public class ReutersQueryMaker : AbstractQueryMaker, IQueryMaker + { + private static string[] STANDARD_QUERIES = { + //Start with some short queries + "Salomon", "Comex", "night trading", "Japan Sony", + //Try some Phrase Queries + "\"Sony Japan\"", "\"food needs\"~3", + "\"World Bank\"^2 AND Nigeria", "\"World Bank\" -Nigeria", + "\"Ford Credit\"~5", + //Try some longer queries + "airline Europe Canada destination", + "Long term pressure by trade " + + "ministers is necessary if the current Uruguay round of talks on " + + "the General Agreement on Trade and Tariffs (GATT) is to " + + "succeed" + }; + + private static Query[] GetPrebuiltQueries(string field) + { + // be wary of unanalyzed text + return new Query[] { + new SpanFirstQuery(new SpanTermQuery(new Term(field, "ford")), 5), + new SpanNearQuery(new SpanQuery[]{new SpanTermQuery(new Term(field, "night")), new SpanTermQuery(new Term(field, "trading"))}, 4, false), + new SpanNearQuery(new SpanQuery[]{new SpanFirstQuery(new SpanTermQuery(new Term(field, "ford")), 10), new SpanTermQuery(new Term(field, "credit"))}, 10, false), + new WildcardQuery(new 
Term(field, "fo*")), + }; + } + + /// <summary> + /// Parse the strings containing Lucene queries. + /// </summary> + /// <param name="qs">array of strings containing query expressions</param> + /// <param name="a">analyzer to use when parsing queries</param> + /// <returns>array of Lucene queries</returns> + private static Query[] CreateQueries(IList<object> qs, Analyzer a) + { + QueryParser qp = new QueryParser( +#pragma warning disable 612, 618 + LuceneVersion.LUCENE_CURRENT, +#pragma warning restore 612, 618 + DocMaker.BODY_FIELD, a); + List<Query> queries = new List<Query>(); + for (int i = 0; i < qs.Count; i++) + { + try + { + + object query = qs[i]; + Query q = null; + if (query is string) + { + q = qp.Parse((string)query); + + } + else if (query is Query) + { + q = (Query)query; + + } + else + { + SystemConsole.Error.WriteLine("Unsupported Query Type: " + query); + } + + if (q != null) + { + queries.Add(q); + } + + } + catch (Exception e) + { + SystemConsole.Error.WriteLine(e.ToString()); + } + } + + return queries.ToArray(); + } + + protected override Query[] PrepareQueries() + { + // analyzer (default is standard analyzer) + Analyzer anlzr = NewAnalyzerTask.CreateAnalyzer(m_config.Get("analyzer", + "Lucene.Net.Analysis.Standard.StandardAnalyzer, Lucene.Net.Analysis.Common")); + + List<object> queryList = new List<object>(20); + queryList.AddRange(STANDARD_QUERIES); + queryList.AddRange(GetPrebuiltQueries(DocMaker.BODY_FIELD)); + return CreateQueries(queryList, anlzr); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/SimpleQueryMaker.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/SimpleQueryMaker.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/SimpleQueryMaker.cs new file mode 100644 index 0000000..4ba2fe3 --- /dev/null +++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/SimpleQueryMaker.cs @@ -0,0 +1,70 @@ +using 
Lucene.Net.Analysis; +using Lucene.Net.Benchmarks.ByTask.Tasks; +using Lucene.Net.Index; +using Lucene.Net.QueryParsers.Classic; +using Lucene.Net.Search; +using Lucene.Net.Util; +using System.Collections.Generic; + +namespace Lucene.Net.Benchmarks.ByTask.Feeds +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// A <see cref="IQueryMaker"/> that makes queries for a collection created + /// using <see cref="SingleDocSource"/>. + /// </summary> + public class SimpleQueryMaker : AbstractQueryMaker, IQueryMaker + { + /// <summary> + /// Prepare the queries for this test. + /// Extending classes can override this method for preparing different queries. 
+ /// </summary> + /// <returns>Prepared queries.</returns> + /// <exception cref="System.Exception">If cannot prepare the queries.</exception> + protected override Query[] PrepareQueries() + { + // analyzer (default is standard analyzer) + Analyzer anlzr = NewAnalyzerTask.CreateAnalyzer(m_config.Get("analyzer", + "Lucene.Net.Analysis.Standard.StandardAnalyzer, Lucene.Net.Analysis.Common")); + + QueryParser qp = new QueryParser( +#pragma warning disable 612, 618 + LuceneVersion.LUCENE_CURRENT, +#pragma warning restore 612, 618 + DocMaker.BODY_FIELD, anlzr); + List<Query> qq = new List<Query>(); + Query q1 = new TermQuery(new Term(DocMaker.ID_FIELD, "doc2")); + qq.Add(q1); + Query q2 = new TermQuery(new Term(DocMaker.BODY_FIELD, "simple")); + qq.Add(q2); + BooleanQuery bq = new BooleanQuery(); + bq.Add(q1, Occur.MUST); + bq.Add(q2, Occur.MUST); + qq.Add(bq); + qq.Add(qp.Parse("synthetic body")); + qq.Add(qp.Parse("\"synthetic body\"")); + qq.Add(qp.Parse("synthetic text")); + qq.Add(qp.Parse("\"synthetic text\"")); + qq.Add(qp.Parse("\"synthetic text\"~3")); + qq.Add(qp.Parse("zoom*")); + qq.Add(qp.Parse("synth*")); + return qq.ToArray(); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/SimpleSloppyPhraseQueryMaker.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/SimpleSloppyPhraseQueryMaker.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/SimpleSloppyPhraseQueryMaker.cs new file mode 100644 index 0000000..7208b25 --- /dev/null +++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/SimpleSloppyPhraseQueryMaker.cs @@ -0,0 +1,88 @@ +using Lucene.Net.Index; +using Lucene.Net.Search; +using Lucene.Net.Support; +using System.Collections.Generic; +using System.Linq; + +namespace Lucene.Net.Benchmarks.ByTask.Feeds +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Create sloppy phrase queries for performance test, in an index created using simple doc maker.
+ /// </summary>
+ public class SimpleSloppyPhraseQueryMaker : SimpleQueryMaker
+ {
+ /// <seealso cref="SimpleQueryMaker.PrepareQueries()"/>
+ // Builds a fixed, deterministic set of sloppy PhraseQuery objects over the first
+ // ~100 whitespace-separated words of SingleDocSource.DOC_TEXT, varying the slop
+ // (0..7) and the phrase length (2..5). For each (slop, qlen, start) combination an
+ // in-order query and a reversed query are produced.
+ protected override Query[] PrepareQueries()
+ {
+ // extract some 100 words from doc text to an array
+ string[] words;
+ List<string> w = new List<string>();
+ StringTokenizer st = new StringTokenizer(SingleDocSource.DOC_TEXT);
+ while (st.HasMoreTokens() && w.Count < 100)
+ {
+ w.Add(st.NextToken());
+ }
+ words = w.ToArray();
+
+ // create queries (that would find stuff) with varying slops
+ IList<Query> queries = new List<Query>();
+ for (int slop = 0; slop < 8; slop++)
+ {
+ for (int qlen = 2; qlen < 6; qlen++)
+ {
+ // upper bound keeps wd + qlen + slop inside the words array
+ for (int wd = 0; wd < words.Length - qlen - slop; wd++)
+ {
+ // ordered
+ // In-order phrase: after each added term, consume one unit of the
+ // available slop by skipping a source word (the extra wind++ below),
+ // creating a gap that the query's slop allows it to bridge.
+ int remainedSlop = slop;
+ PhraseQuery q = new PhraseQuery();
+ q.Slop = slop;
+ int wind = wd;
+ for (int i = 0; i < qlen; i++)
+ {
+ q.Add(new Term(DocMaker.BODY_FIELD, words[wind++]));
+ if (remainedSlop > 0)
+ {
+ // skip a word: the gap is covered by the query slop
+ remainedSlop--;
+ wind++;
+ }
+ }
+ queries.Add(q);
+ // reversed
+ // Reversed phrase: same words added back-to-front, starting from the
+ // last selected word. Extra slop of 2 * qlen is granted so the
+ // out-of-order terms can still match the in-order document text.
+ remainedSlop = slop;
+ q = new PhraseQuery();
+ q.Slop = slop + 2 * qlen;
+ wind = wd + qlen + remainedSlop - 1;
+ for (int i = 0; i < qlen; i++)
+ {
+ q.Add(new Term(DocMaker.BODY_FIELD, words[wind--]));
+ if (remainedSlop > 0)
+ {
+ // mirror of the ordered case: skip a word while slop remains
+ remainedSlop--;
+ wind--;
+ }
+ }
+ queries.Add(q);
+ }
+ }
+ }
+ return queries.ToArray();
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/ByTask/Feeds/SingleDocSource.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/SingleDocSource.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/SingleDocSource.cs
new file mode 100644
index 0000000..6b01faf
--- /dev/null
+++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/SingleDocSource.cs
@@ -0,0 +1,77 @@
+namespace Lucene.Net.Benchmarks.ByTask.Feeds
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Creates the same document each time <see cref="GetNextDocData(DocData)"/> is called.
+ /// </summary>
+ public class SingleDocSource : ContentSource
+ {
+ // next document id; guarded by lock in NewDocID()/ResetInputs()
+ private int docID = 0;
+
+ // Fixed body text shared by every generated document (also consumed by
+ // SimpleSloppyPhraseQueryMaker to build its test queries).
+ internal static readonly string DOC_TEXT =
+ "Well, this is just some plain text we use for creating the " +
+ "test documents. 
It used to be a text from an online collection " +
+ "devoted to first aid, but if there was there an (online) lawyers " +
+ "first aid collection with legal advices, \"it\" might have quite " +
+ "probably advised one not to include \"it\"'s text or the text of " +
+ "any other online collection in one's code, unless one has money " +
+ "that one don't need and one is happy to donate for lawyers " +
+ "charity. Anyhow at some point, rechecking the usage of this text, " +
+ "it became uncertain that this text is free to use, because " +
+ "the web site in the disclaimer of he eBook containing that text " +
+ "was not responding anymore, and at the same time, in projGut, " +
+ "searching for first aid no longer found that eBook as well. " +
+ "So here we are, with a perhaps much less interesting " +
+ "text for the test, but oh much much safer. ";
+
+ // return a new docid
+ // Unless m_forever is set, only one document is ever produced: a second call
+ // finds docID > 0 and throws NoMoreDataException. With m_forever, ids grow
+ // without bound.
+ // NOTE(review): locks on 'this' (a publicly reachable object); a private lock
+ // object would be the usual C# idiom — kept as-is to match the ported Java code.
+ private int NewDocID()
+ {
+ lock (this)
+ {
+ if (docID > 0 && !m_forever)
+ {
+ throw new NoMoreDataException();
+ }
+ return docID++;
+ }
+ }
+
+ // No resources to release: this source is purely in-memory.
+ protected override void Dispose(bool disposing) { }
+
+ // Fills docData with the fixed DOC_TEXT body and a name of the form "doc<N>",
+ // counting DOC_TEXT's length toward the source's byte statistics.
+ public override DocData GetNextDocData(DocData docData)
+ {
+ int id = NewDocID();
+ AddBytes(DOC_TEXT.Length);
+ docData.Clear();
+ docData.Name = "doc" + id;
+ docData.Body = DOC_TEXT;
+ return docData;
+ }
+
+ // Restarts the id sequence so the single document can be produced again.
+ public override void ResetInputs()
+ {
+ lock (this)
+ {
+ base.ResetInputs();
+ docID = 0;
+ }
+ }
+ }
+}
