Re: Boolean Query when indexing each line as a document.

Ankit Murarka Wed, 14 Aug 2013 07:32:13 -0700

Hello. The problem  is as follows:

I have a document containing information in lines, so I am indexing all files line by line.

So If I say in my document I have,
             INSIDE POST OF SERVER\
and in my index file created I have,
             INSIDE POST OF SERVER\

and I fire a boolean query with INSIDE and POST as MUST/MUST, I am getting no HIT.

I am providing the complete CODE I am using to create the INDEX and TO SEARCH. Both are drawn from sample code present online.


/*INDEX CODE:
*/
package org.RunAllQueriesWithLineByLinePhrases;

public class CreateIndex {
  public static void main(String[] args) {

String indexPath = "D:\\INDEXFORQUERY"; //Place where indexes willbe created

    String docsPath="Indexed";    //Place where the files are kept.
    boolean create=true;
   final File docDir = new File(docsPath);
   if (!docDir.exists() || !docDir.canRead()) {
       System.exit(1);
    }
   try {
     Directory dir = FSDirectory.open(new File(indexPath));

Analyzer analyzer=newCustomAnalyzerForCaseSensitive(Version.LUCENE_44);IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44,analyzer);

      if (create) {
        iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
      } else {

System.out.println("Trying to set IWC mode to UPDATE...NOTDESIRED..");

     }
      IndexWriter writer = new IndexWriter(dir, iwc);
      indexDocs(writer, docDir);
      writer.close();
    } catch (IOException e) {
      System.out.println(" caught a " + e.getClass() +
       "\n with message: " + e.getMessage());
    }
 }
  static void indexDocs(IndexWriter writer, File file)
    throws IOException {
   if (file.canRead())
   {
      if (file.isDirectory()) {
       String[] files = file.list();
        if (files != null) {
          for (int i = 0; i < files.length; i++) {
              if(files[i]!=null)
            indexDocs(writer, new File(file, files[i]));
          }
        }
     } else {
        try {
          Document doc = new Document();

Field pathField = new StringField("path", file.getPath(),Field.Store.YES);

          doc.add(pathField);

doc.add(new LongField("modified", file.lastModified(),Field.Store.NO));

          LineNumberReader lnr=new LineNumberReader(new FileReader(file));
         String line=null;
          while( null != (line = lnr.readLine()) ){
              doc.add(new StringField("contents",line,Field.Store.YES));
          }
          if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            writer.addDocument(doc);
          } else {
            writer.updateDocument(new Term("path", file.getPath()), doc);
          }
        } finally {
        }
      }
    }
  } }

/*SEARCHING CODE:-*/

package org.RunAllQueriesWithLineByLinePhrases;

public class SearchFORALLQUERIES {
  public static void main(String[] args) throws Exception {

    String[] argument=new String[20];
    argument[0]="-index";
    argument[1]="D:\\INDEXFORQUERY";
    argument[2]="-field";
    argument[3]="contents";  //field value
    argument[4]="-repeat";
    argument[5]="2";   //repeat value
    argument[6]="-raw";
    argument[7]="-paging";
    argument[8]="300";   //paging value

    String index = "index";
    String field = "contents";
    String queries = null;
    int repeat = 0;
    boolean raw = false;
    String queryString = null;
    int hitsPerPage = 10;

    for(int i = 0;i < argument.length;i++) {
      if ("-index".equals(argument[i])) {
        index = argument[i+1];
        i++;
      } else if ("-field".equals(argument[i])) {
        field = argument[i+1];
        i++;
      } else if ("-queries".equals(argument[i])) {
        queries = argument[i+1];
        i++;
      } else if ("-query".equals(argument[i])) {
        queryString = argument[i+1];
        i++;
      } else if ("-repeat".equals(argument[i])) {
        repeat = Integer.parseInt(argument[i+1]);
        i++;
      } else if ("-raw".equals(argument[i])) {

raw = true; //set it true to just display the count. If falsethen it also display file name.

      } else if ("-paging".equals(argument[i])) {
        hitsPerPage = Integer.parseInt(argument[i+1]);
        if (hitsPerPage <= 0) {
          System.err.println("There must be at least 1 hit per page.");
          System.exit(1);
       }
       i++;
     }
   }
    System.out.println("processing input");

IndexReader reader = DirectoryReader.open(FSDirectory.open(newFile(index))); //location where indexes are.

   IndexSearcher searcher = new IndexSearcher(reader);
   BufferedReader in = null;
   if (queries != null) {

in = new BufferedReader(new InputStreamReader(newFileInputStream(queries), "UTF-8")); //provide query as input

   } else {
     in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
   }
   while (true) {

if (queries == null && queryString == null){ // prompt the userSystem.out.println("Enter query: "); //if query is notpresent, prompt the user to enter query.

     }
     String line = queryString != null ? queryString : in.readLine();

     if (line == null || line.length() == -1) {
       break;
     }
     line = line.trim();
     if (line.length() == 0) {
       break;
     }
String[] str=line.split(" ");
 System.out.println("queries are "  + str[0] + " and is  "  + str[1]);
  Query query1 = new TermQuery(new Term(field, str[0]));
  Query query2=new TermQuery(new Term(field,str[1]));
      BooleanQuery booleanQuery = new BooleanQuery();
    booleanQuery.add(query1, BooleanClause.Occur.MUST);
    booleanQuery.add(query2, BooleanClause.Occur.MUST);

if (repeat > 0) { //repeat=2 // repeat& time as benchmark

       Date start = new Date();
        for (int i = 0; i < repeat; i++) {
          searcher.search(booleanQuery, null, 100);
        }
        Date end = new Date();
        System.out.println("Time: "+(end.getTime()-start.getTime())+"ms");
      }

doPagingSearch(in, searcher, booleanQuery, hitsPerPage, raw,queries == null && queryString == null);

      if (queryString != null) {
        break;
      }
    }
    reader.close();
  }

public static void doPagingSearch(BufferedReader in, IndexSearchersearcher, Query query,int hitsPerPage, boolean raw,boolean interactive) throws IOException {

    TopDocs results = searcher.search(query, 5 * hitsPerPage);
    ScoreDoc[] hits = results.scoreDocs;
    int numTotalHits = results.totalHits;
    System.out.println(numTotalHits + " total matching documents");
    int start = 0;
    int end = Math.min(numTotalHits, hitsPerPage);
    while (true) {
      if (end > hits.length) {

System.out.println("Only results 1 - " + hits.length +" of " +numTotalHits + " total matching documents collected.");

        System.out.println("Collect more (y/n) ?");
        String line = in.readLine();
        if (line.length() == 0 || line.charAt(0) == 'n') {
          break;
        }
        hits = searcher.search(query, numTotalHits).scoreDocs;
      }
      end = Math.min(hits.length, start + hitsPerPage);   //3 and 5.
      for (int i = start; i < end; i++) {  //0 to 3.
        if (raw) {

          System.out.println("doc="+hits[i].doc+" score="+hits[i].score);
        }
        Document doc = searcher.doc(hits[i].doc);
        List<IndexableField> filed=doc.getFields();
        filed.size();
        String path = doc.get("path");
        if (path != null) {
          System.out.println((i+1) + ". " + path);
          String title = doc.get("title");
          if (title != null) {
            System.out.println("   Title: " + doc.get("title"));
          }
        } else {
          System.out.println((i+1) + ". " + "No path for this document");
        }
      }
      if (!interactive || end == 0) {
        break;
      }
      if (numTotalHits >= end) {
        boolean quit = false;
        while (true) {
          System.out.print("Press ");
          if (start - hitsPerPage >= 0) {
            System.out.print("(p)revious page, ");
          }
          if (start + hitsPerPage < numTotalHits) {
            System.out.print("(n)ext page, ");
          }
          System.out.println("(q)uit or enter number to jump to a page.");
          String line = in.readLine();
          if (line.length() == 0 || line.charAt(0)=='q') {
            quit = true;
            break;
          }
          if (line.charAt(0) == 'p') {
            start = Math.max(0, start - hitsPerPage);
            break;
          } else if (line.charAt(0) == 'n') {
            if (start + hitsPerPage < numTotalHits) {
              start+=hitsPerPage;
            }
            break;
          } else {
            int page = Integer.parseInt(line);
            if ((page - 1) * hitsPerPage < numTotalHits) {
              start = (page - 1) * hitsPerPage;
              break;
            } else {
              System.out.println("No such page");
            }
          }
        }
        if (quit) break;
        end = Math.min(numTotalHits, start + hitsPerPage);
      }
    }
  }
}

/*CUSTOM ANALYZER CODE:*/

package com.rancore.demo;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;

public class CustomAnalyzerForCaseSensitive extends StopwordAnalyzerBase {

      public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
      private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

public static final CharArraySet STOP_WORDS_SET =StopAnalyzer.ENGLISH_STOP_WORDS_SET;public CustomAnalyzerForCaseSensitive(Version matchVersion,CharArraySet stopWords) {

        super(matchVersion, stopWords);
      }
      public CustomAnalyzerForCaseSensitive(Version matchVersion) {
        this(matchVersion, STOP_WORDS_SET);
      }

public CustomAnalyzerForCaseSensitive(Version matchVersion,Reader stopwords) throws IOException {

            this(matchVersion, loadStopwordSet(stopwords, matchVersion));
          }
      public void setMaxTokenLength(int length) {
            maxTokenLength = length;
          }
          /**
           * @see #setMaxTokenLength
           */
          public int getMaxTokenLength() {
            return maxTokenLength;
          }
    @Override

protected TokenStreamComponents createComponents(final StringfieldName, final Reader reader) {final StandardTokenizer src = newStandardTokenizer(matchVersion, reader);

            src.setMaxTokenLength(maxTokenLength);
            TokenStream tok = new StandardFilter(matchVersion, src);
           // tok = new LowerCaseFilter(matchVersion, tok);
            tok = new StopFilter(matchVersion, tok, stopwords);
            return new TokenStreamComponents(src, tok) {
              @Override

protected void setReader(final Reader reader) throwsIOException {src.setMaxTokenLength(CustomAnalyzerForCaseSensitive.this.maxTokenLength);

                super.setReader(reader);
              }
            };
    }
}



I HOPE I HAVE GIVEN THE COMPLETE CODE SAMPLE FOR PEOPLE TO WORK ON..

PLEASE GUIDE ME NOW: In case any further information is required, please let me know.



On 8/14/2013 7:43 PM, Ian Lea wrote:

Well, you have supplied a bit more info - good - but I still can't
spot the problem.  Unless someone else can I suggest you post a very
small self-contained program that demonstrates the problem.


--
Ian.


On Wed, Aug 14, 2013 at 2:50 PM, Ankit Murarka
<[email protected]>  wrote:

Hello.
         The problem does not seem to be getting solved.

As mentioned, I am indexing each line of each file.
The sample text present inside LUKE is

<am name="notification" value="10"/>\
<type="DE">\
java.lang.Thread.run(Thread.java:619)

Size of list  array::0\

at java.lang.reflect.Method.invoke(Method.java:597)
org.com.dummy,INFO,<<  Still figuring out how to run

,SERVER,100.100.100.100:8080,EXCEPTION,10613349

INSIDE POST OF Listener\

In my Luke, I can see the text as "INSIDE POST OF Listener" .. This is
present in many files.

/*Query is +contents:INSIDE contents:POST */              --/The field name
is contents. Same analyzer is being used. This is a boolean query./

To test, I indexed only 20 files. In 19 files, this is present.

The boolean query should give me a hit for this document.

BUT IT IS RETURNING ME NO HIT..

If I index the same files WITHOUT line by line then, it gives me proper
hits..

But for me it should work on Indexes created by Line by Line parsing also.

Please guide.





On 8/13/2013 4:41 PM, Ian Lea wrote:

remedialaction != "remedial action"?

Show us your query.  Show a small self-contained sample program or
test case that demonstrates the problem.  You need to give us
something more to go on.


--
Ian.


On Tue, Aug 13, 2013 at 11:13 AM, Ankit Murarka
<[email protected]>   wrote:

Hello,
          I am aware of that link and I have been through that link many
number of times.

Problem I have is:

1. Each line is indexed. So indexed line looks something like "<attribute
name="remedial action" value="Checking"/>\"
2. I am easily firing a phrase query on this line. It suggest me the
possible values. No problem,.
3. If I fire a Boolean Query with "remedialaction" and "Checking" as a
must/must , then it is not providing me this document as a hit.
4. I am using StandardAnalyzer both during the indexing and searching
time.


On 8/13/2013 2:31 PM, Ian Lea wrote:

Should be straightforward enough.  Work through the tips in the FAQ
entry at

http://wiki.apache.org/lucene-java/LuceneFAQ#Why_am_I_getting_no_hits_.2F_incorrect_hits.3F
and post back if that doesn't help, with details of how you are
analyzing the data and how you are searching.


--
Ian.


On Tue, Aug 13, 2013 at 8:56 AM, Ankit Murarka
<[email protected]>    wrote:

Hello All,
                  I have 2 different usecases.
I am trying to provide both boolean query and phrase search query in
the
application.

In every line of the document which I am indexing I have content like :

<attribute name="remedial action" value="Checking"/>\

Due to the phrase search requirement, I am indexing each line of the
file
as
a new document.

Now when I am trying to do a phrase query (Did you Mean, Infix Analyzer
etc,
or phrase suggest) this seems to work fine and provide me with desired
suggestions.

Problem is :

How do I invoke boolean query for this. I mean when I verified the
indexes
in Luke, I saw the whole line as expected is indexed.

So, if user wish to perform a boolean query say suppose containing
"remedialaction" and "Checking" how do I get this document as a hit. I
believe since I am indexing each line, this seems to be bit tricky.

Please guide.

--
Regards

Ankit


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



--
Regards

Ankit Murarka

"What lies behind us and what lies before us are tiny matters compared
with
what lies within us"


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



--
Regards

Ankit Murarka

"What lies behind us and what lies before us are tiny matters compared with
what lies within us"

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]



--
Regards

Ankit Murarka

"What lies behind us and what lies before us are tiny matters compared with what 
lies within us"

Re: Boolean Query when indexing each line as a document.

Reply via email to