// File: src/Lucene.Net.Benchmark/Quality/Trec/Trec1MQReader.cs
// (new file, blob 85dceda, lucenenet commit b515271d)
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;

namespace Lucene.Net.Benchmarks.Quality.Trec
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements. See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License. You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Read topics of TREC 1MQ track.
    /// <para/>
    /// Expects this topic format -
    /// <code>
    ///     qnum:qtext
    /// </code>
    /// Comment lines starting with '#' are ignored, and so are blank lines.
    /// <para/>
    /// All topics will have a single name value pair.
    /// </summary>
    public class Trec1MQReader
    {
        private string name;

        /// <summary>
        /// Constructor for Trec's 1MQ TopicsReader
        /// </summary>
        /// <param name="name">Name of name-value pair to set for all queries.</param>
        public Trec1MQReader(string name)
            : base()
        {
            this.name = name;
        }

        /// <summary>
        /// Read quality queries from trec 1MQ format topics file.
        /// The reader is always disposed before this method returns or throws.
        /// </summary>
        /// <param name="reader">where queries are read from.</param>
        /// <returns>the result quality queries, sorted with <see cref="Array.Sort(Array)"/>.</returns>
        /// <exception cref="IOException">if cannot read the queries.</exception>
        public virtual QualityQuery[] ReadQueries(TextReader reader)
        {
            IList<QualityQuery> res = new List<QualityQuery>();
            string line;
            try
            {
                while (null != (line = reader.ReadLine()))
                {
                    line = line.Trim();
                    // Skip blank lines as well as comments: a blank line has no ':'
                    // and would otherwise make Substring(0, -1) throw below.
                    if (line.Length == 0 || line.StartsWith("#", StringComparison.Ordinal))
                    {
                        continue;
                    }
                    // id: everything before the first ':'
                    // (a non-blank line without ':' still throws ArgumentOutOfRangeException here)
                    int k = line.IndexOf(':');
                    string id = line.Substring(0, k).Trim();
                    // qtext: everything after the first ':'
                    string qtext = line.Substring(k + 1).Trim();
                    // we got a topic!
                    IDictionary<string, string> fields = new Dictionary<string, string>();
                    fields[name] = qtext;
                    res.Add(new QualityQuery(id, fields));
                }
            }
            finally
            {
                reader.Dispose();
            }
            // sort result array (by ID)
            QualityQuery[] qq = res.ToArray();
            Array.Sort(qq);
            return qq;
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/Quality/Trec/TrecJudge.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/Quality/Trec/TrecJudge.cs b/src/Lucene.Net.Benchmark/Quality/Trec/TrecJudge.cs new file mode 100644 index 0000000..386b130 --- /dev/null +++ b/src/Lucene.Net.Benchmark/Quality/Trec/TrecJudge.cs @@ -0,0 +1,186 @@ +using Lucene.Net.Support; +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; + +namespace Lucene.Net.Benchmarks.Quality.Trec +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Judge if given document is relevant to given quality query, based on Trec format for judgements. + /// </summary> + public class TrecJudge : IJudge + { + IDictionary<string, QRelJudgement> judgements; + + /// <summary> + /// Constructor from a reader. 
+ /// </summary> + /// <remarks> + /// Expected input format: + /// <code> + /// qnum 0 doc-name is-relevant + /// </code> + /// Two sample lines: + /// <code> + /// 19 0 doc303 1 + /// 19 0 doc7295 0 + /// </code> + /// </remarks> + /// <param name="reader">Where judgments are read from.</param> + /// <exception cref="IOException">If there is a low-level I/O error.</exception> + public TrecJudge(TextReader reader) + { + judgements = new Dictionary<string, QRelJudgement>(); + QRelJudgement curr = null; + string zero = "0"; + string line; + + try + { + while (null != (line = reader.ReadLine())) + { + line = line.Trim(); + if (line.Length == 0 || '#' == line[0]) + { + continue; + } + StringTokenizer st = new StringTokenizer(line); + string queryID = st.NextToken(); + st.NextToken(); + string docName = st.NextToken(); + bool relevant = !zero.Equals(st.NextToken(), StringComparison.Ordinal); + // LUCENENET: don't call st.NextToken() unless the condition fails. + Debug.Assert(!st.HasMoreTokens(), "wrong format: " + line + " next: " + (st.HasMoreTokens() ? st.NextToken() : "")); + if (relevant) + { // only keep relevant docs + if (curr == null || !curr.queryID.Equals(queryID, StringComparison.Ordinal)) + { + if (!judgements.TryGetValue(queryID, out curr) || curr == null) + { + curr = new QRelJudgement(queryID); + judgements[queryID] = curr; + } + } + curr.AddRelevantDoc(docName); + } + } + } + finally + { + reader.Dispose(); + } + } + + // inherit javadocs + public virtual bool IsRelevant(string docName, QualityQuery query) + { + QRelJudgement qrj;// = judgements.get(query.getQueryID()); + judgements.TryGetValue(query.QueryID, out qrj); + return qrj != null && qrj.IsRelevant(docName); + } + + /// <summary> + /// Single Judgement of a trec quality query. 
+ /// </summary> + private class QRelJudgement + { + internal string queryID; + private IDictionary<string, string> relevantDocs; + + internal QRelJudgement(string queryID) + { + this.queryID = queryID; + relevantDocs = new HashMap<string, string>(); + } + + public virtual void AddRelevantDoc(string docName) + { + relevantDocs[docName] = docName; + } + + internal virtual bool IsRelevant(string docName) + { + return relevantDocs.ContainsKey(docName); + } + + public virtual int MaxRecall + { + get { return relevantDocs.Count; } + } + } + + // inherit javadocs + public virtual bool ValidateData(QualityQuery[] qq, TextWriter logger) + { + IDictionary<string, QRelJudgement> missingQueries = new Dictionary<string, QRelJudgement>(judgements); + IList<string> missingJudgements = new List<string>(); + for (int i = 0; i < qq.Length; i++) + { + string id = qq[i].QueryID; + if (missingQueries.ContainsKey(id)) + { + missingQueries.Remove(id); + } + else + { + missingJudgements.Add(id); + } + } + bool isValid = true; + if (missingJudgements.Count > 0) + { + isValid = false; + if (logger != null) + { + logger.WriteLine("WARNING: " + missingJudgements.Count + " queries have no judgments! - "); + for (int i = 0; i < missingJudgements.Count; i++) + { + logger.WriteLine(" " + missingJudgements[i]); + } + } + } + if (missingQueries.Count > 0) + { + isValid = false; + if (logger != null) + { + logger.WriteLine("WARNING: " + missingQueries.Count + " judgments match no query! 
- "); + foreach (string id in missingQueries.Keys) + { + logger.WriteLine(" " + id); + } + } + } + return isValid; + } + + // inherit javadocs + public virtual int MaxRecall(QualityQuery query) + { + QRelJudgement qrj; + if (judgements.TryGetValue(query.QueryID, out qrj) && qrj != null) + { + return qrj.MaxRecall; + } + return 0; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/Quality/Trec/TrecTopicsReader.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/Quality/Trec/TrecTopicsReader.cs b/src/Lucene.Net.Benchmark/Quality/Trec/TrecTopicsReader.cs new file mode 100644 index 0000000..158386f --- /dev/null +++ b/src/Lucene.Net.Benchmark/Quality/Trec/TrecTopicsReader.cs @@ -0,0 +1,154 @@ +using Lucene.Net.Support; +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; + +namespace Lucene.Net.Benchmarks.Quality.Trec +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Read TREC topics. 
+ /// </summary> + /// <remarks> + /// Expects this topic format - + /// <code> + /// <top> + /// <num> Number: nnn + /// + /// <title> title of the topic + /// + /// <desc> Description: + /// description of the topic + /// + /// <narr> Narrative: + /// "story" composed by assessors. + /// + /// </top> + /// </code> + /// Comment lines starting with '#' are ignored. + /// </remarks> + public class TrecTopicsReader + { + private static readonly string newline = Environment.NewLine; + + /// <summary> + /// Constructor for Trec's TopicsReader + /// </summary> + public TrecTopicsReader() + : base() + { + } + + /// <summary> + /// Read quality queries from trec format topics file. + /// </summary> + /// <param name="reader">where queries are read from.</param> + /// <returns>the result quality queries.</returns> + /// <exception cref="IOException">if cannot read the queries.</exception> + public virtual QualityQuery[] ReadQueries(TextReader reader) + { + IList<QualityQuery> res = new List<QualityQuery>(); + StringBuilder sb; + try + { + while (null != (sb = Read(reader, "<top>", null, false, false))) + { + IDictionary<string, string> fields = new Dictionary<string, string>(); + // id + sb = Read(reader, "<num>", null, true, false); + int k = sb.IndexOf(":"); + string id = sb.ToString(k + 1, sb.Length - (k + 1)).Trim(); + // title + sb = Read(reader, "<title>", null, true, false); + k = sb.IndexOf(">"); + string title = sb.ToString(k + 1, sb.Length - (k + 1)).Trim(); + // description + Read(reader, "<desc>", null, false, false); + sb.Length = 0; + string line = null; + while ((line = reader.ReadLine()) != null) + { + if (line.StartsWith("<narr>", StringComparison.Ordinal)) + break; + if (sb.Length > 0) sb.Append(' '); + sb.Append(line); + } + string description = sb.ToString().Trim(); + // narrative + sb.Length = 0; + while ((line = reader.ReadLine()) != null) + { + if (line.StartsWith("</top>", StringComparison.Ordinal)) + break; + if (sb.Length > 0) sb.Append(' '); + 
sb.Append(line); + } + string narrative = sb.ToString().Trim(); + // we got a topic! + fields["title"] = title; + fields["description"] = description; + fields["narrative"] = narrative; + QualityQuery topic = new QualityQuery(id, fields); + res.Add(topic); + } + } + finally + { + reader.Dispose(); + } + // sort result array (by ID) + QualityQuery[] qq = res.ToArray(); + Array.Sort(qq); + return qq; + } + + // read until finding a line that starts with the specified prefix + private StringBuilder Read(TextReader reader, string prefix, StringBuilder sb, bool collectMatchLine, bool collectAll) + { + sb = (sb == null ? new StringBuilder() : sb); + string sep = ""; + while (true) + { + string line = reader.ReadLine(); + if (line == null) + { + return null; + } + if (line.StartsWith(prefix, StringComparison.Ordinal)) + { + if (collectMatchLine) + { + sb.Append(sep + line); + sep = newline; + } + break; + } + if (collectAll) + { + sb.Append(sep + line); + sep = newline; + } + } + //System.out.println("read: "+sb); + return sb; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/Quality/Utils/DocNameExtractor.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/Quality/Utils/DocNameExtractor.cs b/src/Lucene.Net.Benchmark/Quality/Utils/DocNameExtractor.cs new file mode 100644 index 0000000..6e5cc0f --- /dev/null +++ b/src/Lucene.Net.Benchmark/Quality/Utils/DocNameExtractor.cs @@ -0,0 +1,89 @@ +using Lucene.Net.Index; +using Lucene.Net.Search; +using System; +using System.Collections.Generic; +using System.Linq; + +namespace Lucene.Net.Benchmarks.Quality.Utils +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Utility: extract doc names from an index + /// </summary> + public class DocNameExtractor + { + private readonly string docNameField; + + /// <summary> + /// Constructor for <see cref="DocNameExtractor"/>. + /// </summary> + /// <param name="docNameField">name of the stored field containing the doc name.</param> + public DocNameExtractor(string docNameField) + { + this.docNameField = docNameField; + } + + /// <summary> + /// Extract the name of the input doc from the index. 
+ /// </summary> + /// <param name="searcher">access to the index.</param> + /// <param name="docid">ID of doc whose name is needed.</param> + /// <returns>the name of the input doc as extracted from the index.</returns> + /// <exception cref="System.IO.IOException">if cannot extract the doc name from the index.</exception> + public virtual string DocName(IndexSearcher searcher, int docid) + { + IList<string> name = new List<string>(); + searcher.IndexReader.Document(docid, new StoredFieldVisitorAnonymousHelper(this, name)); + + return name.FirstOrDefault(); + } + + private class StoredFieldVisitorAnonymousHelper : StoredFieldVisitor + { + private readonly DocNameExtractor outerInstance; + private readonly IList<string> name; + + public StoredFieldVisitorAnonymousHelper(DocNameExtractor outerInstance, IList<string> name) + { + this.outerInstance = outerInstance; + this.name = name; + } + public override void StringField(FieldInfo fieldInfo, string value) + { + name.Add(value); + } + + public override Status NeedsField(FieldInfo fieldInfo) + { + if (name.Count > 0) + { + return Status.STOP; + } + else if (fieldInfo.Name.Equals(outerInstance.docNameField, StringComparison.Ordinal)) + { + return Status.YES; + } + else + { + return Status.NO; + } + } + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/Quality/Utils/QualityQueriesFinder.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/Quality/Utils/QualityQueriesFinder.cs b/src/Lucene.Net.Benchmark/Quality/Utils/QualityQueriesFinder.cs new file mode 100644 index 0000000..062263a --- /dev/null +++ b/src/Lucene.Net.Benchmark/Quality/Utils/QualityQueriesFinder.cs @@ -0,0 +1,152 @@ +using Lucene.Net.Index; +using Lucene.Net.Store; +using Lucene.Net.Support; +using System; +using System.IO; + +namespace Lucene.Net.Benchmarks.Quality.Utils +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more 
+ * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Suggest Quality queries based on an index contents. + /// Utility class, used for making quality test benchmarks. + /// </summary> + public class QualityQueriesFinder + { + private static readonly string newline = Environment.NewLine; + private Store.Directory dir; + + /// <summary> + /// Constructor over a directory containing the index. 
+ /// </summary> + /// <param name="dir">Directory containing the index we search for the quality test.</param> + private QualityQueriesFinder(Store.Directory dir) + { + this.dir = dir; + } + + /// <summary> + /// + /// </summary> + /// <param name="args">{index-dir}</param> + /// <exception cref="IOException">if cannot access the index.</exception> + public static void Main(string[] args) + { + if (args.Length < 1) + { + SystemConsole.Error.WriteLine("Usage: java QualityQueriesFinder <index-dir>"); + Environment.Exit(1); + } + QualityQueriesFinder qqf = new QualityQueriesFinder(FSDirectory.Open(new DirectoryInfo(args[0]))); + string[] q = qqf.BestQueries("body", 20); + for (int i = 0; i < q.Length; i++) + { + SystemConsole.WriteLine(newline + FormatQueryAsTrecTopic(i, q[i], null, null)); + } + } + + private string[] BestQueries(string field, int numQueries) + { + string[] words = BestTerms("body", 4 * numQueries); + int n = words.Length; + int m = n / 4; + string[] res = new string[m]; + for (int i = 0; i < res.Length; i++) + { + res[i] = words[i] + " " + words[m + i] + " " + words[n - 1 - m - i] + " " + words[n - 1 - i]; + //System.out.println("query["+i+"]: "+res[i]); + } + return res; + } + + private static string FormatQueryAsTrecTopic(int qnum, string title, string description, string narrative) + { + return + "<top>" + newline + + "<num> Number: " + qnum + newline + newline + + "<title> " + (title == null ? "" : title) + newline + newline + + "<desc> Description:" + newline + + (description == null ? "" : description) + newline + newline + + "<narr> Narrative:" + newline + + (narrative == null ? "" : narrative) + newline + newline + + "</top>"; + } + + private string[] BestTerms(string field, int numTerms) + { + Util.PriorityQueue<TermDf> pq = new TermsDfQueue(numTerms); + IndexReader ir = DirectoryReader.Open(dir); + try + { + int threshold = ir.MaxDoc / 10; // ignore words too common. 
+ Terms terms = MultiFields.GetTerms(ir, field); + if (terms != null) + { + TermsEnum termsEnum = terms.GetIterator(null); + while (termsEnum.Next() != null) + { + int df = termsEnum.DocFreq; + if (df < threshold) + { + string ttxt = termsEnum.Term.Utf8ToString(); + pq.InsertWithOverflow(new TermDf(ttxt, df)); + } + } + } + } + finally + { + ir.Dispose(); + } + string[] res = new string[pq.Count]; + int i = 0; + while (pq.Count > 0) + { + TermDf tdf = pq.Pop(); + res[i++] = tdf.word; + SystemConsole.WriteLine(i + ". word: " + tdf.df + " " + tdf.word); + } + return res; + } + + private class TermDf + { + internal string word; + internal int df; + internal TermDf(string word, int freq) + { + this.word = word; + this.df = freq; + } + } + + private class TermsDfQueue : Util.PriorityQueue<TermDf> + { + internal TermsDfQueue(int maxSize) + : base(maxSize) + { + } + + protected override bool LessThan(TermDf tf1, TermDf tf2) + { + return tf1.df < tf2.df; + } + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/Quality/Utils/SimpleQQParser.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/Quality/Utils/SimpleQQParser.cs b/src/Lucene.Net.Benchmark/Quality/Utils/SimpleQQParser.cs new file mode 100644 index 0000000..0711e86 --- /dev/null +++ b/src/Lucene.Net.Benchmark/Quality/Utils/SimpleQQParser.cs @@ -0,0 +1,76 @@ +using Lucene.Net.Analysis.Standard; +using Lucene.Net.QueryParsers.Classic; +using Lucene.Net.Search; +using Lucene.Net.Util; +using System.Threading; + +namespace Lucene.Net.Benchmarks.Quality.Utils +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Simplistic quality query parser. A Lucene query is created by passing + /// the value of the specified <see cref="QualityQuery"/> name-value pair(s) into + /// a Lucene's <see cref="QueryParser"/> using <see cref="StandardAnalyzer"/>. + /// </summary> + public class SimpleQQParser : IQualityQueryParser + { + private string[] qqNames; + private string indexField; + ThreadLocal<QueryParser> queryParser = new ThreadLocal<QueryParser>(); + + /// <summary> + /// Constructor of a simple qq parser. + /// </summary> + /// <param name="qqNames">Name-value pairs of quality query to use for creating the query.</param> + /// <param name="indexField">Corresponding index field.</param> + public SimpleQQParser(string[] qqNames, string indexField) + { + this.qqNames = qqNames; + this.indexField = indexField; + } + + /// <summary> + /// Constructor of a simple qq parser. 
+ /// </summary> + /// <param name="qqName">Name-value pair of quality query to use for creating the query.</param> + /// <param name="indexField">Corresponding index field.</param> + public SimpleQQParser(string qqName, string indexField) + : this(new string[] { qqName }, indexField) + { + } + + /// <seealso cref="IQualityQueryParser.Parse(QualityQuery)"/> + public virtual Query Parse(QualityQuery qq) + { + QueryParser qp = queryParser.Value; + if (qp == null) + { +#pragma warning disable 612, 618 + qp = new QueryParser(LuceneVersion.LUCENE_CURRENT, indexField, new StandardAnalyzer(LuceneVersion.LUCENE_CURRENT)); +#pragma warning restore 612, 618 + queryParser.Value = qp; + } + BooleanQuery bq = new BooleanQuery(); + for (int i = 0; i < qqNames.Length; i++) + bq.Add(qp.Parse(QueryParserBase.Escape(qq.GetValue(qqNames[i]))), Occur.SHOULD); + + return bq; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/Quality/Utils/SubmissionReport.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/Quality/Utils/SubmissionReport.cs b/src/Lucene.Net.Benchmark/Quality/Utils/SubmissionReport.cs new file mode 100644 index 0000000..c31eddc --- /dev/null +++ b/src/Lucene.Net.Benchmark/Quality/Utils/SubmissionReport.cs @@ -0,0 +1,98 @@ +using Lucene.Net.Search; +using System; +using System.Globalization; +using System.IO; + +namespace Lucene.Net.Benchmarks.Quality.Utils +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Create a log ready for submission. + /// Extend this class and override + /// <see cref="Report(QualityQuery, TopDocs, string, IndexSearcher)"/> + /// to create different reports. + /// </summary> + public class SubmissionReport + { + //private NumberFormat nf; + private string nf; + private TextWriter logger; + private string name; + + /// <summary> + /// Constructor for <see cref="SubmissionReport"/>. + /// </summary> + /// <param name="logger">If <c>null</c>, no submission data is created.</param> + /// <param name="name">Name of this run.</param> + public SubmissionReport(TextWriter logger, string name) + { + this.logger = logger; + this.name = name; + nf = "{0:F4}"; + } + + /// <summary> + /// Report a search result for a certain quality query. 
+ /// </summary> + /// <param name="qq">quality query for which the results are reported.</param> + /// <param name="td">search results for the query.</param> + /// <param name="docNameField">stored field used for fetching the result doc name.</param> + /// <param name="searcher">index access for fetching doc name.</param> + /// <see cref="IOException">in case of a problem.</see> + public virtual void Report(QualityQuery qq, TopDocs td, string docNameField, IndexSearcher searcher) + { + if (logger == null) + { + return; + } + ScoreDoc[] sd = td.ScoreDocs; + string sep = " \t "; + DocNameExtractor xt = new DocNameExtractor(docNameField); + for (int i = 0; i < sd.Length; i++) + { + string docName = xt.DocName(searcher, sd[i].Doc); + logger.WriteLine( + qq.QueryID + sep + + "Q0" + sep + + Format(docName, 20) + sep + + Format("" + i, 7) + sep + + //nf.format(sd[i].score) + sep + + string.Format(nf, sd[i].Score, CultureInfo.InvariantCulture) + sep + + name + ); + } + } + + public virtual void Flush() + { + if (logger != null) + { + logger.Flush(); + } + } + + private static string padd = " "; + private string Format(string s, int minLen) + { + s = (s == null ? "" : s); + int n = Math.Max(minLen, s.Length); + return (s + padd).Substring(0, n - 0); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/Utils/ExtractReuters.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/Utils/ExtractReuters.cs b/src/Lucene.Net.Benchmark/Utils/ExtractReuters.cs new file mode 100644 index 0000000..8727fa0 --- /dev/null +++ b/src/Lucene.Net.Benchmark/Utils/ExtractReuters.cs @@ -0,0 +1,167 @@ +using Lucene.Net.Support; +using System; +using System.IO; +using System.Text; +using System.Text.RegularExpressions; + +namespace Lucene.Net.Benchmarks.Utils +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +

    /// <summary>
    /// Split the Reuters SGML documents into Simple Text files containing: Title, Date, Dateline, Body
    /// </summary>
    public class ExtractReuters
    {
        private DirectoryInfo reutersDir;
        private DirectoryInfo outputDir;
        private static readonly string LINE_SEPARATOR = Environment.NewLine;

        /// <summary>
        /// Creates an extractor and deletes every file directly inside <paramref name="outputDir"/>.
        /// </summary>
        /// <param name="reutersDir">directory scanned for <c>*.sgm</c> files.</param>
        /// <param name="outputDir">existing directory that receives the extracted text files.</param>
        public ExtractReuters(DirectoryInfo reutersDir, DirectoryInfo outputDir)
        {
            this.reutersDir = reutersDir;
            this.outputDir = outputDir;
            SystemConsole.WriteLine("Deleting all files in " + outputDir);
            // Snapshot the listing first so deletion does not run against a live enumeration.
            foreach (FileInfo f in outputDir.GetFiles())
            {
                f.Delete();
            }
        }

        /// <summary>
        /// Extracts every <c>*.sgm</c> file in the Reuters directory, or reports on
        /// standard error that none were found.
        /// </summary>
        public virtual void Extract()
        {
            FileInfo[] sgmFiles = reutersDir.GetFiles("*.sgm");
            if (sgmFiles == null || sgmFiles.Length == 0)
            {
                SystemConsole.Error.WriteLine("No .sgm files in " + reutersDir);
                return;
            }
            foreach (FileInfo sgmFile in sgmFiles)
            {
                ExtractFile(sgmFile);
            }
        }

        internal Regex EXTRACTION_PATTERN = new Regex("<TITLE>(.*?)</TITLE>|<DATE>(.*?)</DATE>|<BODY>(.*?)</BODY>", RegexOptions.Compiled);

        private static string[] META_CHARS = { "&", "<", ">", "\"", "'" };

        // Entity spellings that are mapped back to META_CHARS, index by index.
        private static string[] META_CHARS_SERIALIZATIONS = { "&amp;", "&lt;",
            "&gt;", "&quot;", "&apos;" };

        /// <summary>
        /// Override if you wish to change what is extracted
        /// </summary>
        /// <remarks>
        /// Lines are accumulated until a line containing <c>&lt;/REUTERS</c> is seen; the
        /// accumulated text is then matched against <see cref="EXTRACTION_PATTERN"/> and the
        /// captured pieces are written to <c>{sgmFile.Name}-{n}.txt</c> in the output directory.
        /// </remarks>
        /// <param name="sgmFile">the SGML file to split.</param>
        /// <exception cref="Exception">wraps any <see cref="IOException"/> raised while reading or writing.</exception>
        protected virtual void ExtractFile(FileInfo sgmFile)
        {
            try
            {
                using (TextReader reader = new StreamReader(new FileStream(sgmFile.FullName, FileMode.Open, FileAccess.Read), Encoding.UTF8))
                {
                    StringBuilder buffer = new StringBuilder(1024);
                    StringBuilder outBuffer = new StringBuilder(1024);

                    string line;
                    int docNumber = 0;
                    while ((line = reader.ReadLine()) != null)
                    {
                        // Until the closing reuters tag shows up, just accumulate the text.
                        if (line.IndexOf("</REUTERS", StringComparison.Ordinal) == -1)
                        {
                            buffer.Append(line).Append(' ');
                            continue;
                        }

                        // Extract the relevant pieces and write to a file in the output dir
                        for (Match matcher = EXTRACTION_PATTERN.Match(buffer.ToString()); matcher.Success; matcher = matcher.NextMatch())
                        {
                            // Groups[0] is the whole match and Groups.Count includes it,
                            // so the capture groups are 1 .. Count - 1.
                            for (int i = 1; i < matcher.Groups.Count; i++)
                            {
                                if (matcher.Groups[i].Success)
                                {
                                    outBuffer.Append(matcher.Groups[i].Value);
                                }
                            }
                            outBuffer.Append(LINE_SEPARATOR).Append(LINE_SEPARATOR);
                        }

                        // Replace the SGM escape sequences
                        string @out = outBuffer.ToString();
                        for (int i = 0; i < META_CHARS_SERIALIZATIONS.Length; i++)
                        {
                            @out = @out.Replace(META_CHARS_SERIALIZATIONS[i], META_CHARS[i]);
                        }

                        string outFile = System.IO.Path.Combine(outputDir.FullName, sgmFile.Name + "-"
                            + (docNumber++) + ".txt");
                        // 'using' releases the file handle even when Write throws.
                        using (StreamWriter writer = new StreamWriter(new FileStream(outFile, FileMode.Create, FileAccess.Write), Encoding.UTF8))
                        {
                            writer.Write(@out);
                        }
                        outBuffer.Length = 0;
                        buffer.Length = 0;
                    }
                }
            }
            catch (IOException e)
            {
                throw new Exception(e.ToString(), e);
            }
        }

        /// <summary>
        /// Command-line entry point: <c>args[0]</c> is the Reuters SGM directory,
        /// <c>args[1]</c> the requested output directory.
        /// </summary>
        public static void Main(string[] args)
        {
            if (args.Length != 2)
            {
                Usage("Wrong number of arguments (" + args.Length + ")");
                return;
            }
            DirectoryInfo reutersDir = new DirectoryInfo(args[0]);
            if (!reutersDir.Exists)
            {
                Usage("Cannot find Path to Reuters SGM files (" + reutersDir + ")");
                return;
            }

            // First, extract to a tmp directory and only if everything succeeds, rename
            // to output directory.
            DirectoryInfo outputDir = new DirectoryInfo(args[1]);
            outputDir = new DirectoryInfo(outputDir.FullName + "-tmp");
            outputDir.Create();
            ExtractReuters extractor = new ExtractReuters(reutersDir, outputDir);
            extractor.Extract();
            // Now rename to requested output dir
            outputDir.MoveTo(args[1]);
        }

        /// <summary>
        /// Writes <paramref name="msg"/> followed by the command-line synopsis to standard error.
        /// </summary>
        private static void Usage(string msg)
        {
            SystemConsole.Error.WriteLine("Usage: " + msg + " :: java -cp <...> org.apache.lucene.benchmark.utils.ExtractReuters <Path to Reuters SGM files> <Output Path>");
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/Utils/ExtractWikipedia.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/Utils/ExtractWikipedia.cs b/src/Lucene.Net.Benchmark/Utils/ExtractWikipedia.cs new file mode 100644 index 0000000..b61fbc5 --- /dev/null +++ b/src/Lucene.Net.Benchmark/Utils/ExtractWikipedia.cs @@ -0,0 +1,178 @@ +using Lucene.Net.Benchmarks.ByTask.Feeds; +using Lucene.Net.Benchmarks.ByTask.Utils; +using Lucene.Net.Documents; +using Lucene.Net.Support; +using System; +using System.Collections.Generic; +using System.Globalization; +using System.IO; +using System.Text; + +namespace Lucene.Net.Benchmarks.Utils +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +

    /// <summary>
    /// Extract the downloaded Wikipedia dump into separate files for indexing.
    /// </summary>
    public class ExtractWikipedia
    {
        private DirectoryInfo outputDir;

        public static int count = 0;

        internal static readonly int BASE = 10;
        protected DocMaker m_docMaker;

        /// <summary>
        /// Creates an extractor and deletes every file directly inside <paramref name="outputDir"/>.
        /// </summary>
        /// <param name="docMaker">source of the documents to write out.</param>
        /// <param name="outputDir">existing directory that receives the extracted files.</param>
        public ExtractWikipedia(DocMaker docMaker, DirectoryInfo outputDir)
        {
            this.outputDir = outputDir;
            this.m_docMaker = docMaker;
            SystemConsole.WriteLine("Deleting all files in " + outputDir);
            FileInfo[] files = outputDir.GetFiles();
            for (int i = 0; i < files.Length; i++)
            {
                files[i].Delete();
            }
        }

        /// <summary>
        /// Maps a running document number to a nested sub-directory so that no single
        /// directory collects all files. Numbers below <see cref="BASE"/> stay in
        /// <paramref name="directory"/>; larger ones descend two levels per recursion step.
        /// </summary>
        /// <param name="count">document number.</param>
        /// <param name="directory">starting directory, or <c>null</c> for the output directory.</param>
        /// <returns>the directory the document belongs in (not created here).</returns>
        public virtual DirectoryInfo Directory(int count, DirectoryInfo directory)
        {
            if (directory == null)
            {
                directory = outputDir;
            }
            int @base = BASE;
            while (@base <= count)
            {
                @base *= BASE;
            }
            if (count < BASE)
            {
                return directory;
            }
            // Largest power of BASE that does not exceed count.
            int magnitude = @base / BASE;
            directory = new DirectoryInfo(System.IO.Path.Combine(directory.FullName, magnitude.ToString(CultureInfo.InvariantCulture)));
            directory = new DirectoryInfo(System.IO.Path.Combine(directory.FullName, (count / magnitude).ToString(CultureInfo.InvariantCulture)));
            return Directory(count % magnitude, directory);
        }

        /// <summary>
        /// Writes one document to <c>{id}.txt</c> as time, title and body separated by blank lines.
        /// Increments the static <see cref="count"/>.
        /// </summary>
        /// <exception cref="Exception">wraps any <see cref="IOException"/> raised while writing.</exception>
        public virtual void Create(string id, string title, string time, string body)
        {
            DirectoryInfo d = Directory(count++, null);
            d.Create();
            FileInfo f = new FileInfo(System.IO.Path.Combine(d.FullName, id + ".txt"));

            StringBuilder contents = new StringBuilder();

            contents.Append(time);
            contents.Append("\n\n");
            contents.Append(title);
            contents.Append("\n\n");
            contents.Append(body);
            contents.Append("\n");

            try
            {
                using (TextWriter writer = new StreamWriter(new FileStream(f.FullName, FileMode.Create, FileAccess.Write), Encoding.UTF8))
                {
                    writer.Write(contents.ToString());
                }
            }
            catch (IOException ioe)
            {
                throw new Exception(ioe.ToString(), ioe);
            }
        }

        /// <summary>
        /// Pulls documents from the doc maker until it returns <c>null</c> or throws
        /// <see cref="NoMoreDataException"/>, writing each one with <see cref="Create"/>.
        /// </summary>
        public virtual void Extract()
        {
            Document doc = null;
            SystemConsole.WriteLine("Starting Extraction");
            long start = Support.Time.CurrentTimeMilliseconds();
            try
            {
                while ((doc = m_docMaker.MakeDocument()) != null)
                {
                    Create(doc.Get(DocMaker.ID_FIELD), doc.Get(DocMaker.TITLE_FIELD), doc
                        .Get(DocMaker.DATE_FIELD), doc.Get(DocMaker.BODY_FIELD));
                }
            }
            catch (NoMoreDataException /*e*/)
            {
                // end of input: fall through to the timing report
            }
            long finish = Support.Time.CurrentTimeMilliseconds();
            SystemConsole.WriteLine("Extraction took " + (finish - start) + " ms");
        }

        /// <summary>
        /// Command-line entry point. Prints the usage text and returns when the input
        /// option is missing, has no value, or names a file that does not exist.
        /// </summary>
        public static void Main(string[] args)
        {
            FileInfo wikipedia = null;
            DirectoryInfo outputDir = new DirectoryInfo("./enwiki");
            bool keepImageOnlyDocs = true;
            for (int i = 0; i < args.Length; i++)
            {
                string arg = args[i];
                if (arg.Equals("--input", StringComparison.Ordinal) || arg.Equals("-i", StringComparison.Ordinal))
                {
                    // A trailing flag without a value must not index past the array.
                    if (i + 1 >= args.Length)
                    {
                        PrintUsage();
                        return;
                    }
                    wikipedia = new FileInfo(args[i + 1]);
                    i++;
                }
                else if (arg.Equals("--output", StringComparison.Ordinal) || arg.Equals("-o", StringComparison.Ordinal))
                {
                    if (i + 1 >= args.Length)
                    {
                        PrintUsage();
                        return;
                    }
                    outputDir = new DirectoryInfo(args[i + 1]);
                    i++;
                }
                else if (arg.Equals("--discardImageOnlyDocs", StringComparison.Ordinal) || arg.Equals("-d", StringComparison.Ordinal))
                {
                    keepImageOnlyDocs = false;
                }
            }

            // Check the input before it is dereferenced or handed to the content source.
            if (wikipedia == null || !wikipedia.Exists)
            {
                PrintUsage();
                return;
            }

            IDictionary<string, string> properties = new Dictionary<string, string>();
            properties["docs.file"] = wikipedia.FullName;
            properties["content.source.forever"] = "false";
            properties["keep.image.only.docs"] = keepImageOnlyDocs.ToString();
            Config config = new Config(properties);

            ContentSource source = new EnwikiContentSource();
            source.SetConfig(config);

            DocMaker docMaker = new DocMaker();
            docMaker.SetConfig(config, source);
            docMaker.ResetInputs();

            SystemConsole.WriteLine("Extracting Wikipedia to: " + outputDir + " using EnwikiContentSource");
            outputDir.Create();
            ExtractWikipedia extractor = new ExtractWikipedia(docMaker, outputDir);
            extractor.Extract();
        }

        /// <summary>
        /// Writes the command-line synopsis to standard error.
        /// </summary>
        private static void PrintUsage()
        {
            SystemConsole.Error.WriteLine("Usage: java -cp <...> org.apache.lucene.benchmark.utils.ExtractWikipedia --input|-i <Path to Wikipedia XML file> " +
                "[--output|-o <Output Path>] [--discardImageOnlyDocs|-d]");
            SystemConsole.Error.WriteLine("--discardImageOnlyDocs tells the extractor to skip Wiki docs that contain only images");
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Benchmark/project.json ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Benchmark/project.json b/src/Lucene.Net.Benchmark/project.json new file mode 100644 index 0000000..adac6d5 --- /dev/null +++ b/src/Lucene.Net.Benchmark/project.json @@ -0,0 +1,53 @@ +{ + "version": "4.8.0", + "title": "Lucene.Net.Benchmark", + "description": "System for benchmarking the Lucene.Net full-text search engine library from The Apache Software Foundation.", + "authors": [ "The Apache Software Foundation" ], + "packOptions": { + "projectUrl": "http://lucenenet.apache.org/", + "licenseUrl": "https://github.com/apache/lucenenet/blob/master/LICENSE.txt", + "iconUrl": "https://github.com/apache/lucenenet/blob/master/branding/logo/lucene-net-icon-128x128.png?raw=true", + "owners": [ "The Apache Software Foundation" ], + "repository": { "url": "https://github.com/apache/lucenenet" }, + "tags": [ "lucene.net", "core", "text", "search", "information", "retrieval", "lucene", "apache", "analysis", "index", "query" ] + }, + "buildOptions": { + "compile": { + "includeFiles": [
"../CommonAssemblyInfo.cs" ] + }, + "nowarn": [ "1591", "1573" ] + }, + "dependencies": { + "icu.net": "54.1.1-alpha", + "Lucene.Net": "4.8.0", + "Lucene.Net.Analysis.Common": "4.8.0", + "Lucene.Net.Facet": "4.8.0", + "Lucene.Net.Highlighter": "4.8.0", + "Lucene.Net.ICU": "4.8.0", + "Lucene.Net.Queries": "4.8.0", + "Lucene.Net.QueryParser": "4.8.0", + "Lucene.Net.Spatial": "4.8.0", + "Sax.Net": "2.0.2", + "SharpZipLib": "0.86.0", + "Spatial4n.Core": "0.4.1-beta00003", + "TagSoup.Net": "1.2.1.1" + }, + "frameworks": { + "netstandard1.5": { + "imports": "dnxcore50", + "buildOptions": { + "debugType": "portable", + "define": [ "NETSTANDARD" ] + }, + "dependencies": { + "NETStandard.Library": "1.6.0" + } + }, + "net451": { + "buildOptions": { + "debugType": "full", + "define": [ "FEATURE_SERIALIZABLE" ] + } + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.TestFramework/Util/TestUtil.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.TestFramework/Util/TestUtil.cs b/src/Lucene.Net.TestFramework/Util/TestUtil.cs index e7eb247..bfc73dd 100644 --- a/src/Lucene.Net.TestFramework/Util/TestUtil.cs +++ b/src/Lucene.Net.TestFramework/Util/TestUtil.cs @@ -148,11 +148,20 @@ namespace Lucene.Net.Util { foreach (var entry in zip.Entries) { + // Ignore internal folders - these are tacked onto the FullName anyway + if (entry.FullName.EndsWith("/", StringComparison.Ordinal) || entry.FullName.EndsWith("\\", StringComparison.Ordinal)) + { + continue; + } using (Stream input = entry.Open()) { - FileInfo targetFile = new FileInfo(Path.Combine(destDir.FullName, entry.FullName)); + FileInfo targetFile = new FileInfo(CorrectPath(Path.Combine(destDir.FullName, entry.FullName))); + if (!targetFile.Directory.Exists) + { + targetFile.Directory.Create(); + } - using (Stream output = new FileStream(targetFile.FullName, FileMode.OpenOrCreate, FileAccess.Write)) + using (Stream 
output = new FileStream(targetFile.FullName, FileMode.Create, FileAccess.Write)) { input.CopyTo(output); } @@ -161,6 +170,15 @@ namespace Lucene.Net.Util } } + private static string CorrectPath(string input) + { + if (Path.DirectorySeparatorChar.Equals('/')) + { + return input.Replace('\\', '/'); + } + return input.Replace('/', '\\'); + } + public static void SyncConcurrentMerges(IndexWriter writer) { SyncConcurrentMerges(writer.Config.MergeScheduler); http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Tests.Benchmark/BenchmarkTestCase.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Benchmark/BenchmarkTestCase.cs b/src/Lucene.Net.Tests.Benchmark/BenchmarkTestCase.cs new file mode 100644 index 0000000..8981ee0 --- /dev/null +++ b/src/Lucene.Net.Tests.Benchmark/BenchmarkTestCase.cs @@ -0,0 +1,129 @@ +using Lucene.Net.Benchmarks.ByTask; +using Lucene.Net.Util; +using System; +using System.IO; +using System.Text; + +namespace Lucene.Net.Benchmarks +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Base class for all Benchmark unit tests. 
+ /// </summary> + public abstract class BenchmarkTestCase : LuceneTestCase + { + private static DirectoryInfo WORKDIR; + + public override void BeforeClass() + { + base.BeforeClass(); + WORKDIR = CreateTempDir("benchmark"); + // LUCENENET: Our directory numbers are sequential. Doing a delete + // here will make threads collide. + //WORKDIR.Delete(); + //WORKDIR.Create(); + + propLines = new string[] { + "work.dir=" + getWorkDirPath(), + "directory=RAMDirectory", + "print.props=false", + }; + } + + public override void AfterClass() + { + WORKDIR = null; + base.AfterClass(); + } + + + public DirectoryInfo getWorkDir() + { + return WORKDIR; + } + + /** Copy a resource into the workdir */ + public void copyToWorkDir(string resourceName) + { + Stream resource = GetType().getResourceAsStream(resourceName); + Stream dest = new FileStream(System.IO.Path.Combine(getWorkDir().FullName, resourceName), FileMode.Create, FileAccess.Write); + byte[] buffer = new byte[8192]; + int len; + + while ((len = resource.Read(buffer, 0, buffer.Length)) > 0) + { + dest.Write(buffer, 0, len); + } + + resource.Dispose(); + dest.Dispose(); + } + + /** Return a path, suitable for a .alg config file, for a resource in the workdir */ + public String getWorkDirResourcePath(String resourceName) + { + return System.IO.Path.Combine(getWorkDir().FullName, resourceName).Replace("\\", "/"); + } + + /** Return a path, suitable for a .alg config file, for the workdir */ + public String getWorkDirPath() + { + return getWorkDir().FullName.Replace("\\", "/"); + } + + // create the benchmark and execute it. 
+ public Benchmark execBenchmark(String[] algLines) + { + String algText = algLinesToText(algLines); + logTstLogic(algText); + Benchmark benchmark = new Benchmark(new StringReader(algText)); + benchmark.Execute(); + return benchmark; + } + + // properties in effect in all tests here + String[] propLines; + + static readonly String NEW_LINE = Environment.NewLine; + + // catenate alg lines to make the alg text + private String algLinesToText(String[] algLines) + { + String indent = " "; + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < propLines.Length; i++) + { + sb.append(indent).append(propLines[i]).append(NEW_LINE); + } + for (int i = 0; i < algLines.Length; i++) + { + sb.append(indent).append(algLines[i]).append(NEW_LINE); + } + return sb.toString(); + } + + private static void logTstLogic(String txt) + { + if (!VERBOSE) + return; + Console.WriteLine("Test logic of:"); + Console.WriteLine(txt); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Tests.Benchmark/ByTask/Feeds/DocMakerTest.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Benchmark/ByTask/Feeds/DocMakerTest.cs b/src/Lucene.Net.Tests.Benchmark/ByTask/Feeds/DocMakerTest.cs new file mode 100644 index 0000000..301c807 --- /dev/null +++ b/src/Lucene.Net.Tests.Benchmark/ByTask/Feeds/DocMakerTest.cs @@ -0,0 +1,193 @@ +using Lucene.Net.Analysis.Core; +using Lucene.Net.Benchmarks.ByTask.Tasks; +using Lucene.Net.Benchmarks.ByTask.Utils; +using Lucene.Net.Documents; +using Lucene.Net.Index; +using Lucene.Net.Search; +using Lucene.Net.Support; +using NUnit.Framework; +using System.Collections.Generic; +using System.Globalization; +using System.IO; +using System.Text; + +namespace Lucene.Net.Benchmarks.ByTask.Feeds +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +

    /// <summary>
    /// Tests the functionality of <see cref="DocMaker"/>.
    /// </summary>
    public class DocMakerTest : BenchmarkTestCase
    {
        /// <summary>
        /// Content source that yields exactly one document and then throws
        /// <see cref="NoMoreDataException"/>.
        /// </summary>
        public sealed class OneDocSource : ContentSource
        {
            private bool finish = false;

            protected override void Dispose(bool disposing)
            {
            }

            public override DocData GetNextDocData(DocData docData)
            {
                if (finish)
                {
                    throw new NoMoreDataException();
                }

                docData.Body = ("body");
                docData.SetDate("date");
                docData.Title = ("title");
                Dictionary<string, string> props = new Dictionary<string, string>();
                props["key"] = "value";
                docData.Props = props;
                finish = true;

                return docData;
            }
        }

        /// <summary>
        /// Indexes the single document of <see cref="OneDocSource"/> and checks how many
        /// hits a search for the property term key:value returns.
        /// </summary>
        /// <param name="setIndexProps">whether doc.index.props is set at all.</param>
        /// <param name="indexPropsVal">value used for doc.index.props when it is set.</param>
        /// <param name="numExpectedResults">expected total hit count.</param>
        private void doTestIndexProperties(bool setIndexProps,
            bool indexPropsVal, int numExpectedResults)
        {
            Dictionary<string, string> props = new Dictionary<string, string>();

            // Indexing configuration.
            props["analyzer"] = typeof(WhitespaceAnalyzer).AssemblyQualifiedName;
            props["content.source"] = typeof(OneDocSource).AssemblyQualifiedName;
            props["directory"] = "RAMDirectory";
            if (setIndexProps)
            {
                props["doc.index.props"] = indexPropsVal.ToString();
            }

            // Create PerfRunData
            Config config = new Config(props);
            PerfRunData runData = new PerfRunData(config);

            TaskSequence tasks = new TaskSequence(runData, TestName, null, false);
            tasks.AddTask(new CreateIndexTask(runData));
            tasks.AddTask(new AddDocTask(runData));
            tasks.AddTask(new CloseIndexTask(runData));
            tasks.DoLogic();

            IndexReader reader = DirectoryReader.Open(runData.Directory);
            try
            {
                IndexSearcher searcher = NewSearcher(reader);
                TopDocs td = searcher.Search(new TermQuery(new Term("key", "value")), 10);
                assertEquals(numExpectedResults, td.TotalHits);
            }
            finally
            {
                // Release the reader even when the assertion above fails.
                reader.Dispose();
            }
        }

        /// <summary>
        /// Builds one document with the norms-related properties set as requested.
        /// </summary>
        private Document createTestNormsDocument(bool setNormsProp,
            bool normsPropVal, bool setBodyNormsProp, bool bodyNormsVal)
        {
            Dictionary<string, string> props = new Dictionary<string, string>();

            // Indexing configuration.
            props["analyzer"] = typeof(WhitespaceAnalyzer).AssemblyQualifiedName;
            props["directory"] = "RAMDirectory";
            if (setNormsProp)
            {
                props["doc.tokenized.norms"] = normsPropVal.ToString(CultureInfo.InvariantCulture);
            }
            if (setBodyNormsProp)
            {
                props["doc.body.tokenized.norms"] = bodyNormsVal.ToString(CultureInfo.InvariantCulture);
            }

            // Create PerfRunData
            Config config = new Config(props);

            DocMaker dm = new DocMaker();
            dm.SetConfig(config, new OneDocSource());
            return dm.MakeDocument();
        }

        /* Tests doc.index.props property. */
        [Test]
        public void TestIndexProperties()
        {
            // default is to not index properties.
            doTestIndexProperties(false, false, 0);

            // set doc.index.props to false.
            doTestIndexProperties(true, false, 0);

            // set doc.index.props to true.
            doTestIndexProperties(true, true, 1);
        }

        /* Tests doc.tokenized.norms and doc.body.tokenized.norms properties. */
        [Test]
        public void TestNorms()
        {
            Document doc;

            // Don't set anything, use the defaults
            doc = createTestNormsDocument(false, false, false, false);
            assertTrue(doc.GetField(DocMaker.TITLE_FIELD).FieldType.OmitNorms);
            assertFalse(doc.GetField(DocMaker.BODY_FIELD).FieldType.OmitNorms);

            // Set norms to false
            doc = createTestNormsDocument(true, false, false, false);
            assertTrue(doc.GetField(DocMaker.TITLE_FIELD).FieldType.OmitNorms);
            assertFalse(doc.GetField(DocMaker.BODY_FIELD).FieldType.OmitNorms);

            // Set norms to true
            doc = createTestNormsDocument(true, true, false, false);
            assertFalse(doc.GetField(DocMaker.TITLE_FIELD).FieldType.OmitNorms);
            assertFalse(doc.GetField(DocMaker.BODY_FIELD).FieldType.OmitNorms);

            // Set body norms to false
            doc = createTestNormsDocument(false, false, true, false);
            assertTrue(doc.GetField(DocMaker.TITLE_FIELD).FieldType.OmitNorms);
            assertTrue(doc.GetField(DocMaker.BODY_FIELD).FieldType.OmitNorms);

            // Set body norms to true
            doc = createTestNormsDocument(false, false, true, true);
            assertTrue(doc.GetField(DocMaker.TITLE_FIELD).FieldType.OmitNorms);
            assertFalse(doc.GetField(DocMaker.BODY_FIELD).FieldType.OmitNorms);
        }

        [Test]
        public void TestDocMakerLeak()
        {
            // DocMaker did not close its ContentSource if resetInputs was called twice,
            // leading to a file handle leak.
            FileInfo f = new FileInfo(Path.Combine(getWorkDir().FullName, "docMakerLeak.txt"));
            using (TextWriter ps = new StreamWriter(new FileStream(f.FullName, FileMode.Create, FileAccess.Write), Encoding.UTF8))
            {
                ps.WriteLine("one title\t" + Time.CurrentTimeMilliseconds() + "\tsome content");
            }

            Dictionary<string, string> props = new Dictionary<string, string>();
            props["docs.file"] = f.FullName;
            props["content.source.forever"] = "false";
            Config config = new Config(props);

            ContentSource source = new LineDocSource();
            source.SetConfig(config);

            DocMaker dm = new DocMaker();
            dm.SetConfig(config, source);
            dm.ResetInputs();
            dm.ResetInputs();
            dm.Dispose();
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Tests.Benchmark/ByTask/Feeds/EnwikiContentSourceTest.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Benchmark/ByTask/Feeds/EnwikiContentSourceTest.cs b/src/Lucene.Net.Tests.Benchmark/ByTask/Feeds/EnwikiContentSourceTest.cs new file mode 100644 index 0000000..95ded38 --- /dev/null +++ b/src/Lucene.Net.Tests.Benchmark/ByTask/Feeds/EnwikiContentSourceTest.cs @@ -0,0 +1,194 @@ +using Lucene.Net.Benchmarks.ByTask.Utils; +using Lucene.Net.Util; +using NUnit.Framework; +using System; +using System.Collections.Generic; +using System.Globalization; +using System.IO; +using System.Text; + +namespace Lucene.Net.Benchmarks.ByTask.Feeds +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +

    [Ignore("LUCENENET TODO: Never finishes")]
    public class EnwikiContentSourceTest : LuceneTestCase
    {
        /// <summary>
        /// An EnwikiContentSource which reads its XML from a string instead of a file.
        /// </summary>
        private class StringableEnwikiSource : EnwikiContentSource
        {
            private readonly String docs;

            public StringableEnwikiSource(String docs)
            {
                this.docs = docs;
            }

            protected override Stream OpenInputStream()
            {
                byte[] utf8 = Encoding.UTF8.GetBytes(docs);
                return new MemoryStream(utf8);
            }
        }

        /// <summary>
        /// Checks name, title, body and date of a parsed document, in that order.
        /// </summary>
        private void assertDocData(DocData dd, String expName, String expTitle, String expBody, String expDate)
        {
            assertNotNull(dd);
            assertEquals(expName, dd.Name);
            assertEquals(expTitle, dd.Title);
            assertEquals(expBody, dd.Body);
            assertEquals(expDate, dd.Date);
        }

        /// <summary>
        /// Fails unless the next read from the source throws <see cref="NoMoreDataException"/>.
        /// </summary>
        private void assertNoMoreDataException(EnwikiContentSource stdm)
        {
            try
            {
                stdm.GetNextDocData(null);
                fail("Expecting NoMoreDataException");
            }
            catch (NoMoreDataException)
            {
                // expected
            }
        }

        private readonly String PAGE1 =
            " <page>\r\n" +
            " <title>Title1</title>\r\n" +
            " <ns>0</ns>\r\n" +
            " <id>1</id>\r\n" +
            " <revision>\r\n" +
            " <id>11</id>\r\n" +
            " <parentid>111</parentid>\r\n" +
            " <timestamp>2011-09-14T11:35:09Z</timestamp>\r\n" +
            " <contributor>\r\n" +
            " <username>Mister1111</username>\r\n" +
            " <id>1111</id>\r\n" +
            " </contributor>\r\n" +
            " <minor />\r\n" +
            " <comment>/* Never mind */</comment>\r\n" +
            " <text>Some text 1 here</text>\r\n" +
            " </revision>\r\n" +
            " </page>\r\n";

        private readonly String PAGE2 =
            " <page>\r\n" +
            " <title>Title2</title>\r\n" +
            " <ns>0</ns>\r\n" +
            " <id>2</id>\r\n" +
            " <revision>\r\n" +
            " <id>22</id>\r\n" +
            " <parentid>222</parentid>\r\n" +
            " <timestamp>2022-09-14T22:35:09Z</timestamp>\r\n" +
            " <contributor>\r\n" +
            " <username>Mister2222</username>\r\n" +
            " <id>2222</id>\r\n" +
            " </contributor>\r\n" +
            " <minor />\r\n" +
            " <comment>/* Never mind */</comment>\r\n" +
            " <text>Some text 2 here</text>\r\n" +
            " </revision>\r\n" +
            " </page>\r\n";

        /// <summary>
        /// Wraps the given page fragments in a mediawiki root element.
        /// </summary>
        private static String wrapPages(params String[] pages)
        {
            return "<mediawiki>\r\n" + String.Concat(pages) + "</mediawiki>";
        }

        [Test]
        public void TestOneDocument()
        {
            EnwikiContentSource source = createContentSource(wrapPages(PAGE1), false);

            DocData first = source.GetNextDocData(new DocData());
            assertDocData(first, "1", "Title1", "Some text 1 here", "14-SEP-2011 11:35:09.000");

            assertNoMoreDataException(source);
        }

        /// <summary>
        /// Builds a string-backed source and initialises its inputs through a throw-away DocMaker.
        /// </summary>
        private EnwikiContentSource createContentSource(String docs, bool forever)
        {
            Dictionary<string, string> settings = new Dictionary<string, string>();
            settings["print.props"] = "false";
            settings["content.source.forever"] = forever.ToString(CultureInfo.InvariantCulture);
            Config config = new Config(settings);

            EnwikiContentSource source = new StringableEnwikiSource(docs);
            source.SetConfig(config);

            // doc-maker just for initiating content source inputs
            DocMaker initializer = new DocMaker();
            initializer.SetConfig(config, source);
            initializer.ResetInputs();
            return source;
        }

        [Test]
        public void TestTwoDocuments()
        {
            EnwikiContentSource source = createContentSource(wrapPages(PAGE1, PAGE2), false);

            DocData first = source.GetNextDocData(new DocData());
            assertDocData(first, "1", "Title1", "Some text 1 here", "14-SEP-2011 11:35:09.000");

            DocData second = source.GetNextDocData(new DocData());
            assertDocData(second, "2", "Title2", "Some text 2 here", "14-SEP-2022 22:35:09.000");

            assertNoMoreDataException(source);
        }

        [Test]
        public void TestForever()
        {
            EnwikiContentSource source = createContentSource(wrapPages(PAGE1, PAGE2), true);

            // same documents several times
            int round = 0;
            while (round < 3)
            {
                DocData first = source.GetNextDocData(new DocData());
                assertDocData(first, "1", "Title1", "Some text 1 here", "14-SEP-2011 11:35:09.000");

                DocData second = source.GetNextDocData(new DocData());
                assertDocData(second, "2", "Title2", "Some text 2 here", "14-SEP-2022 22:35:09.000");
                // Don't test that NoMoreDataException is thrown, since the forever flag is turned on.
                round++;
            }

            source.Dispose();
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b515271d/src/Lucene.Net.Tests.Benchmark/ByTask/Feeds/LineDocSourceTest.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Benchmark/ByTask/Feeds/LineDocSourceTest.cs b/src/Lucene.Net.Tests.Benchmark/ByTask/Feeds/LineDocSourceTest.cs new file mode 100644 index 0000000..7cd27f1 --- /dev/null +++ b/src/Lucene.Net.Tests.Benchmark/ByTask/Feeds/LineDocSourceTest.cs @@ -0,0 +1,271 @@ +using ICSharpCode.SharpZipLib.BZip2; +using Lucene.Net.Analysis.Core; +using Lucene.Net.Benchmarks.ByTask.Tasks; +using Lucene.Net.Benchmarks.ByTask.Utils; +using Lucene.Net.Index; +using Lucene.Net.Search; +using Lucene.Net.Util; +using NUnit.Framework; +using System; +using System.Collections.Generic; +using System.IO; +using System.Text; + +namespace Lucene.Net.Benchmarks.ByTask.Feeds +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Tests the functionality of <see cref="LineDocSource"/>.
    /// </summary>
    public class LineDocSourceTest : BenchmarkTestCase
    {
        //private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();

        /// <summary>
        /// Creates (or overwrites) <paramref name="file"/> as a bzip2-compressed line file
        /// containing the content produced by <see cref="writeDocsToFile"/>.
        /// </summary>
        /// <param name="file">Target file.</param>
        /// <param name="addHeader">Whether the fields header line is written before the doc line.</param>
        private void createBZ2LineFile(FileInfo file, bool addHeader)
        {
            Stream @out = new FileStream(file.FullName, FileMode.Create, FileAccess.Write);
            @out = new BZip2OutputStream(@out); // csFactory.createCompressorOutputStream("bzip2", @out);
            // 'using' guarantees the writer (and the streams it wraps) is released even if writing throws.
            using (TextWriter writer = new StreamWriter(@out, Encoding.UTF8))
            {
                writeDocsToFile(writer, addHeader, null);
            }
        }

        /// <summary>
        /// Writes an optional header line followed by exactly one document line.
        /// </summary>
        /// <param name="writer">Destination writer; it is not disposed by this method.</param>
        /// <param name="addHeader">
        /// When <c>true</c>, a header line is written first: the header indicator, then the
        /// title, date and body field names, then the keys of <paramref name="otherFields"/>,
        /// all separated by <c>WriteLineDocTask.SEP</c>.
        /// </param>
        /// <param name="otherFields">
        /// Optional extra fields. Keys are appended to the header line and values to the
        /// document line. May be <c>null</c>.
        /// </param>
        private void writeDocsToFile(TextWriter writer, bool addHeader, IDictionary<string, string> otherFields)
        {
            if (addHeader)
            {
                writer.Write(WriteLineDocTask.FIELDS_HEADER_INDICATOR);
                writer.Write(WriteLineDocTask.SEP);
                writer.Write(DocMaker.TITLE_FIELD);
                writer.Write(WriteLineDocTask.SEP);
                writer.Write(DocMaker.DATE_FIELD);
                writer.Write(WriteLineDocTask.SEP);
                writer.Write(DocMaker.BODY_FIELD);
                if (otherFields != null)
                {
                    // additional field names in the header
                    foreach (string fieldName in otherFields.Keys)
                    {
                        writer.Write(WriteLineDocTask.SEP);
                        writer.Write(fieldName);
                    }
                }
                writer.WriteLine();
            }

            StringBuilder doc = new StringBuilder();
            doc.Append("title")
               .Append(WriteLineDocTask.SEP)
               .Append("date")
               .Append(WriteLineDocTask.SEP)
               .Append(DocMaker.BODY_FIELD);
            if (otherFields != null)
            {
                // additional field values in the doc line
                foreach (string fieldValue in otherFields.Values)
                {
                    doc.Append(WriteLineDocTask.SEP).Append(fieldValue);
                }
            }
            writer.Write(doc.ToString());
            writer.WriteLine();
        }

        /// <summary>
        /// Creates (or overwrites) <paramref name="file"/> as an uncompressed line file
        /// containing the content produced by <see cref="writeDocsToFile"/>.
        /// </summary>
        /// <param name="file">Target file.</param>
        /// <param name="addHeader">Whether the fields header line is written before the doc line.</param>
        private void createRegularLineFile(FileInfo file, bool addHeader)
        {
            Stream @out = new FileStream(file.FullName, FileMode.Create, FileAccess.Write);
            using (TextWriter writer = new StreamWriter(@out, Encoding.UTF8))
            {
                writeDocsToFile(writer, addHeader, null);
            }
        }

        /// <summary>
        /// Creates (or overwrites) <paramref name="file"/> as an uncompressed line file that
        /// always has a header and carries the given extra fields. Each extra field is
        /// written with a value equal to its own name.
        /// </summary>
        /// <param name="file">Target file.</param>
        /// <param name="extraFields">Names of the additional fields.</param>
        private void createRegularLineFileWithMoreFields(FileInfo file, params string[] extraFields)
        {
            Dictionary<string, string> p = new Dictionary<string, string>();
            foreach (string f in extraFields)
            {
                p[f] = f;
            }

            Stream @out = new FileStream(file.FullName, FileMode.Create, FileAccess.Write);
            using (TextWriter writer = new StreamWriter(@out, Encoding.UTF8))
            {
                writeDocsToFile(writer, true, p);
            }
        }

        /// <summary>
        /// Runs <see cref="doIndexAndSearchTestWithRepeats"/> with 1, 2 and 4 add-document tasks.
        /// </summary>
        /// <param name="file">Line file to index.</param>
        /// <param name="lineParserClass">Line parser type to configure, or <c>null</c> to leave it unset.</param>
        /// <param name="storedField">Stored field to verify, or <c>null</c> to verify the body field.</param>
        private void doIndexAndSearchTest(FileInfo file, Type lineParserClass, string storedField)
        {
            doIndexAndSearchTestWithRepeats(file, lineParserClass, 1, storedField); // no extra repetitions
            doIndexAndSearchTestWithRepeats(file, lineParserClass, 2, storedField); // 1 extra repetition
            doIndexAndSearchTestWithRepeats(file, lineParserClass, 4, storedField); // 3 extra repetitions
        }

        /// <summary>
        /// Indexes <paramref name="file"/> through a <see cref="LineDocSource"/> using
        /// <paramref name="numAdds"/> add-document tasks, then searches the resulting index
        /// and checks the hit count and the value of one stored field.
        /// </summary>
        /// <param name="file">Line file to index.</param>
        /// <param name="lineParserClass">Line parser type to configure, or <c>null</c> to leave it unset.</param>
        /// <param name="numAdds">Number of add-document tasks; also the expected number of hits.</param>
        /// <param name="storedField">Stored field to verify, or <c>null</c> to verify the body field.</param>
        private void doIndexAndSearchTestWithRepeats(FileInfo file,
            Type lineParserClass, int numAdds, string storedField)
        {
            IndexReader reader = null;
            PerfRunData runData = null;
            try
            {
                Dictionary<string, string> props = new Dictionary<string, string>();

                // LineDocSource specific settings.
                props["docs.file"] = file.FullName;
                if (lineParserClass != null)
                {
                    props["line.parser"] = lineParserClass.AssemblyQualifiedName;
                }

                // Indexing configuration.
                props["analyzer"] = typeof(WhitespaceAnalyzer).AssemblyQualifiedName;
                props["content.source"] = typeof(LineDocSource).AssemblyQualifiedName;
                props["directory"] = "RAMDirectory";
                props["doc.stored"] = "true";
                props["doc.index.props"] = "true";

                // Create PerfRunData
                Config config = new Config(props);
                runData = new PerfRunData(config);

                TaskSequence tasks = new TaskSequence(runData, "testBzip2", null, false);
                tasks.AddTask(new CreateIndexTask(runData));
                for (int i = 0; i < numAdds; i++)
                {
                    tasks.AddTask(new AddDocTask(runData));
                }
                tasks.AddTask(new CloseIndexTask(runData));
                try
                {
                    tasks.DoLogic();
                }
                finally
                {
                    tasks.Dispose();
                }

                reader = DirectoryReader.Open(runData.Directory);
                IndexSearcher searcher = NewSearcher(reader);
                TopDocs td = searcher.Search(new TermQuery(new Term("body", "body")), 10);
                assertEquals(numAdds, td.TotalHits);
                assertNotNull(td.ScoreDocs[0]);

                if (storedField == null)
                {
                    storedField = DocMaker.BODY_FIELD; // added to all docs and satisfies field-name == value
                }
                assertEquals("Wrong field value", storedField, searcher.Doc(0).Get(storedField));
            }
            finally
            {
                IOUtils.Dispose(reader, runData);
            }
        }

        /// <summary>
        /// Tests LineDocSource with a bzip2 input stream.
        /// </summary>
        [Test]
        public void TestBZip2()
        {
            FileInfo file = new FileInfo(Path.Combine(getWorkDir().FullName, "one-line.bz2"));
            createBZ2LineFile(file, true);
            doIndexAndSearchTest(file, null, null);
        }

        [Test]
        public void TestBZip2NoHeaderLine()
        {
            FileInfo file = new FileInfo(Path.Combine(getWorkDir().FullName, "one-line.bz2"));
            createBZ2LineFile(file, false);
            doIndexAndSearchTest(file, null, null);
        }

        [Test]
        public void TestRegularFile()
        {
            FileInfo file = new FileInfo(Path.Combine(getWorkDir().FullName, "one-line"));
            createRegularLineFile(file, true);
            doIndexAndSearchTest(file, null, null);
        }

        [Test]
        public void TestRegularFileSpecialHeader()
        {
            FileInfo file = new FileInfo(Path.Combine(getWorkDir().FullName, "one-line"));
            createRegularLineFile(file, true);
            doIndexAndSearchTest(file, typeof(HeaderLineParser), null);
        }

        [Test]
        public void TestRegularFileNoHeaderLine()
        {
            FileInfo file = new FileInfo(Path.Combine(getWorkDir().FullName, "one-line"));
            createRegularLineFile(file, false);
            doIndexAndSearchTest(file, null, null);
        }

        /// <summary>
        /// Verifies that indexing a malformed line file raises an exception.
        /// </summary>
        [Test]
        public void TestInvalidFormat()
        {
            string[] testCases = new string[] {
                "", // empty line
                "title", // just title
                "title" + WriteLineDocTask.SEP, // title + SEP
                "title" + WriteLineDocTask.SEP + "body", // title + SEP + body
                // note that title + SEP + body + SEP is a valid line, which results in an
                // empty body
            };

            foreach (string testCase in testCases)
            {
                FileInfo file = new FileInfo(Path.Combine(getWorkDir().FullName, "one-line"));
                using (TextWriter writer = new StreamWriter(new FileStream(file.FullName, FileMode.Create, FileAccess.Write), Encoding.UTF8))
                {
                    writer.Write(testCase);
                    writer.WriteLine();
                }

                bool exceptionThrown = false;
                try
                {
                    doIndexAndSearchTest(file, null, null);
                }
                catch (Exception)
                {
                    // expected.
                    exceptionThrown = true;
                }

                // The failure is reported outside the try block: in .NET the exception raised
                // by fail() derives from Exception, so the catch above must not be able to
                // swallow it.
                if (!exceptionThrown)
                {
                    fail("Some exception should have been thrown for: [" + testCase + "]");
                }
            }
        }

        /// <summary>
        /// Doc Name is not part of the default header.
        /// </summary>
        [Test]
        public void TestWithDocsName()
        {
            FileInfo file = new FileInfo(Path.Combine(getWorkDir().FullName, "one-line"));
            createRegularLineFileWithMoreFields(file, DocMaker.NAME_FIELD);
            doIndexAndSearchTest(file, null, DocMaker.NAME_FIELD);
        }

        /// <summary>
        /// Use fields names that are not defined in Docmaker and so will go to Properties.
        /// </summary>
        [Test]
        public void TestWithProperties()
        {
            FileInfo file = new FileInfo(Path.Combine(getWorkDir().FullName, "one-line"));
            string specialField = "mySpecialField";
            createRegularLineFileWithMoreFields(file, specialField);
            doIndexAndSearchTest(file, null, specialField);
        }
    }
}
