This is an automated email from the ASF dual-hosted git repository.

siddteotia pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new cb62dfc  Prune stop words for text index (#5297)
cb62dfc is described below

commit cb62dfcc92f8bd82287fd934a37076fbd59bfc75
Author: Sidd <[email protected]>
AuthorDate: Mon Apr 27 16:35:04 2020 -0700

    Prune stop words for text index (#5297)
    
    Co-authored-by: Siddharth Teotia <[email protected]>
---
 .../creator/impl/inv/text/LuceneTextIndexCreator.java | 19 ++++++++++++++++++-
 .../index/readers/text/LuceneTextIndexReader.java     |  2 +-
 .../apache/pinot/queries/TextSearchQueriesTest.java   | 15 +++++++++++++++
 .../test/resources/data/text_search_data/skills.txt   |  3 ++-
 4 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/inv/text/LuceneTextIndexCreator.java b/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/inv/text/LuceneTextIndexCreator.java
index 2d7ecf9..32786d7 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/inv/text/LuceneTextIndexCreator.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/inv/text/LuceneTextIndexCreator.java
@@ -20,7 +20,15 @@ package org.apache.pinot.core.segment.creator.impl.inv.text;
 
 import java.io.File;
 import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
+import org.apache.lucene.analysis.core.StopFilterFactory;
+import org.apache.lucene.analysis.custom.CustomAnalyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.StoredField;
@@ -50,6 +58,15 @@ public class LuceneTextIndexCreator implements InvertedIndexCreator {
   private final Directory _indexDirectory;
   private final IndexWriter _indexWriter;
 
+  public static final CharArraySet ENGLISH_STOP_WORDS_SET =
+      new CharArraySet(Arrays.asList(
+          "a", "an", "and", "are", "as", "at", "be", "but", "by",
+          "for", "if", "in", "into", "is", "it",
+          "no", "not", "of", "on", "or", "such",
+          "that", "the", "their", "then", "than", "there", "these",
+          "they", "this", "to", "was", "will", "with", "those"
+      ), true);
+
   /**
    * Called by {@link org.apache.pinot.core.segment.creator.impl.SegmentColumnarIndexCreator}
    * when building an offline segment. Similar to how it creates per column
@@ -81,7 +98,7 @@ public class LuceneTextIndexCreator implements InvertedIndexCreator {
       // to V3 if segmentVersion is set to V3 in SegmentGeneratorConfig.
       File indexFile = getV1TextIndexFile(segmentIndexDir);
       _indexDirectory = FSDirectory.open(indexFile.toPath());
-      StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
+      StandardAnalyzer standardAnalyzer = new StandardAnalyzer(ENGLISH_STOP_WORDS_SET);
       IndexWriterConfig indexWriterConfig = new IndexWriterConfig(standardAnalyzer);
       indexWriterConfig.setRAMBufferSizeMB(LUCENE_INDEX_MAX_BUFFER_SIZE_MB);
       indexWriterConfig.setCommitOnClose(commit);
diff --git a/pinot-core/src/main/java/org/apache/pinot/core/segment/index/readers/text/LuceneTextIndexReader.java b/pinot-core/src/main/java/org/apache/pinot/core/segment/index/readers/text/LuceneTextIndexReader.java
index c5157c5..fa48b0d 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/segment/index/readers/text/LuceneTextIndexReader.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/segment/index/readers/text/LuceneTextIndexReader.java
@@ -85,7 +85,7 @@ public class LuceneTextIndexReader implements InvertedIndexReader<MutableRoaring
       // TODO: consider using a threshold of num docs per segment to decide between building
       // mapping file upfront on segment load v/s on-the-fly during query processing
       _docIdTranslator = new DocIdTranslator(indexDir, _column, numDocs, _indexSearcher);
-      _standardAnalyzer = new StandardAnalyzer();
+      _standardAnalyzer = new StandardAnalyzer(LuceneTextIndexCreator.ENGLISH_STOP_WORDS_SET);
     } catch (Exception e) {
       LOGGER
           .error("Failed to instantiate Lucene text index reader for column {}, exception {}", column, e.getMessage());
diff --git a/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
index d071be0..b2d1d2d 100644
--- a/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
+++ b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
@@ -1163,6 +1163,21 @@ public class TextSearchQueriesTest extends BaseQueriesTest {
     expected.add(new Serializable[]{1020, "Databases, columnar query processing, Apache Arrow, distributed systems, Machine learning, cluster management, docker image building and distribution"});
     expected.add(new Serializable[]{1020, "Databases, columnar query processing, Apache Arrow, distributed systems, Machine learning, cluster management, docker image building and distribution"});
     testInterSegmentSelectionQueryHelper(query, expected);
+
+    // query with only stop-words. they should not be indexed
+    query = "SELECT count(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, 'a and or in the are')";
+    testInterSegmentAggregationQueryHelper(query, 0);
+    // analyzer should prune/ignore the stop words from search expression and consider everything else for a match
+    query = "SELECT count(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"learned a lot\"')";
+    testInterSegmentAggregationQueryHelper(query, 4);
+    query = "SELECT count(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"indexing and transaction processing\"')";
+    testInterSegmentAggregationQueryHelper(query, 12);
+    query = "SELECT count(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"docker image building and distribution\"')";
+    testInterSegmentAggregationQueryHelper(query, 8);
+    query = "SELECT count(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"distributed query engines for analytics and data warehouses\"')";
+    testInterSegmentAggregationQueryHelper(query, 8);
+    query = "SELECT count(*) FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '\"worked in NGO\"')";
+    testInterSegmentAggregationQueryHelper(query, 4);
   }
 
   private void testInterSegmentAggregationQueryHelper(String query, long expectedCount) {
diff --git a/pinot-core/src/test/resources/data/text_search_data/skills.txt b/pinot-core/src/test/resources/data/text_search_data/skills.txt
index 995f70f..3629274 100644
--- a/pinot-core/src/test/resources/data/text_search_data/skills.txt
+++ b/pinot-core/src/test/resources/data/text_search_data/skills.txt
@@ -20,4 +20,5 @@ Realtime stream processing, publish subscribe, columnar processing for data ware
 C++, Java, Python, realtime streaming systems, Machine learning, spark, Kubernetes, transaction processing, distributed storage, concurrency, multi-threading, apache airflow
 Databases, columnar query processing, Apache Arrow, distributed systems, Machine learning, cluster management, docker image building and distribution
 Database engine, OLAP systems, OLTP transaction processing at large scale, concurrency, multi-threading, GO, building large scale systems
-GET /administrator/ HTTP/1.1 200 4263 - Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20100101 Firefox/34.0 - NullPointerException
\ No newline at end of file
+GET /administrator/ HTTP/1.1 200 4263 - Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20100101 Firefox/34.0 - NullPointerException
+Foo worked in a lot of places and learned a lot of things
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to