Hello. The problem is as follows:
I have a document containing information in lines. So I am
indexing all
files line by line.
For example, my document contains the line:
INSIDE POST OF SERVER\
and the corresponding indexed document contains the same line:
INSIDE POST OF SERVER\
Yet when I fire a boolean query combining the terms INSIDE and POST,
both with Occur.MUST, I get no hits.
I am providing the complete code I am using to create the index and to
search. Both are adapted from sample code available online.
/*INDEX CODE:
*/
package org.RunAllQueriesWithLineByLinePhrases;
public class CreateIndex {
public static void main(String[] args) {
String indexPath = "D:\\INDEXFORQUERY"; //Place where
indexes will
be
created
String docsPath="Indexed"; //Place where the files are kept.
boolean create=true;
final File docDir = new File(docsPath);
if (!docDir.exists() || !docDir.canRead()) {
System.exit(1);
}
try {
Directory dir = FSDirectory.open(new File(indexPath));
Analyzer analyzer=new
CustomAnalyzerForCaseSensitive(Version.LUCENE_44);
IndexWriterConfig iwc = new
IndexWriterConfig(Version.LUCENE_44,
analyzer);
if (create) {
iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
} else {
System.out.println("Trying to set IWC mode to
UPDATE...NOT
DESIRED..");
}
IndexWriter writer = new IndexWriter(dir, iwc);
indexDocs(writer, docDir);
writer.close();
} catch (IOException e) {
System.out.println(" caught a " + e.getClass() +
"\n with message: " + e.getMessage());
}
}
static void indexDocs(IndexWriter writer, File file)
throws IOException {
if (file.canRead())
{
if (file.isDirectory()) {
String[] files = file.list();
if (files != null) {
for (int i = 0; i< files.length; i++) {
if(files[i]!=null)
indexDocs(writer, new File(file, files[i]));
}
}
} else {
try {
Document doc = new Document();
Field pathField = new StringField("path", file.getPath(),
Field.Store.YES);
doc.add(pathField);
doc.add(new LongField("modified", file.lastModified(),
Field.Store.NO));
LineNumberReader lnr=new LineNumberReader(new
FileReader(file));
String line=null;
while( null != (line = lnr.readLine()) ){
doc.add(new
StringField("contents",line,Field.Store.YES));
}
if (writer.getConfig().getOpenMode() ==
OpenMode.CREATE) {
writer.addDocument(doc);
} else {
writer.updateDocument(new Term("path", file.getPath()),
doc);
}
} finally {
}
}
}
} }
/*SEARCHING CODE:-*/
package org.RunAllQueriesWithLineByLinePhrases;
public class SearchFORALLQUERIES {
public static void main(String[] args) throws Exception {
String[] argument=new String[20];
argument[0]="-index";
argument[1]="D:\\INDEXFORQUERY";
argument[2]="-field";
argument[3]="contents"; //field value
argument[4]="-repeat";
argument[5]="2"; //repeat value
argument[6]="-raw";
argument[7]="-paging";
argument[8]="300"; //paging value
String index = "index";
String field = "contents";
String queries = null;
int repeat = 0;
boolean raw = false;
String queryString = null;
int hitsPerPage = 10;
for(int i = 0;i< argument.length;i++) {
if ("-index".equals(argument[i])) {
index = argument[i+1];
i++;
} else if ("-field".equals(argument[i])) {
field = argument[i+1];
i++;
} else if ("-queries".equals(argument[i])) {
queries = argument[i+1];
i++;
} else if ("-query".equals(argument[i])) {
queryString = argument[i+1];
i++;
} else if ("-repeat".equals(argument[i])) {
repeat = Integer.parseInt(argument[i+1]);
i++;
} else if ("-raw".equals(argument[i])) {
raw = true; //set it true to just display the count.
If false
then
it also display file name.
} else if ("-paging".equals(argument[i])) {
hitsPerPage = Integer.parseInt(argument[i+1]);
if (hitsPerPage<= 0) {
System.err.println("There must be at least 1 hit per
page.");
System.exit(1);
}
i++;
}
}
System.out.println("processing input");
IndexReader reader = DirectoryReader.open(FSDirectory.open(new
File(index))); //location where indexes are.
IndexSearcher searcher = new IndexSearcher(reader);
BufferedReader in = null;
if (queries != null) {
in = new BufferedReader(new InputStreamReader(new
FileInputStream(queries), "UTF-8")); //provide query as input
} else {
in = new BufferedReader(new InputStreamReader(System.in,
"UTF-8"));
}
while (true) {
if (queries == null&& queryString == null) {
//
prompt the user
System.out.println("Enter query: "); //if query is not
present,
prompt the user to enter query.
}
String line = queryString != null ? queryString :
in.readLine();
if (line == null || line.length() == -1) {
break;
}
line = line.trim();
if (line.length() == 0) {
break;
}
String[] str=line.split(" ");
System.out.println("queries are " + str[0] + " and is " +
str[1]);
Query query1 = new TermQuery(new Term(field, str[0]));
Query query2=new TermQuery(new Term(field,str[1]));
BooleanQuery booleanQuery = new BooleanQuery();
booleanQuery.add(query1, BooleanClause.Occur.MUST);
booleanQuery.add(query2, BooleanClause.Occur.MUST);
if (repeat> 0) { //repeat=2 //
repeat&
time as benchmark
Date start = new Date();
for (int i = 0; i< repeat; i++) {
searcher.search(booleanQuery, null, 100);
}
Date end = new Date();
System.out.println("Time:
"+(end.getTime()-start.getTime())+"ms");
}
doPagingSearch(in, searcher, booleanQuery, hitsPerPage, raw,
queries
== null&& queryString == null);
if (queryString != null) {
break;
}
}
reader.close();
}
public static void doPagingSearch(BufferedReader in,
IndexSearcher
searcher, Query query,
int hitsPerPage, boolean raw,
boolean
interactive) throws IOException {
TopDocs results = searcher.search(query, 5 * hitsPerPage);
ScoreDoc[] hits = results.scoreDocs;
int numTotalHits = results.totalHits;
System.out.println(numTotalHits + " total matching documents");
int start = 0;
int end = Math.min(numTotalHits, hitsPerPage);
while (true) {
if (end> hits.length) {
System.out.println("Only results 1 - " + hits.length +"
of " +
numTotalHits + " total matching documents collected.");
System.out.println("Collect more (y/n) ?");
String line = in.readLine();
if (line.length() == 0 || line.charAt(0) == 'n') {
break;
}
hits = searcher.search(query, numTotalHits).scoreDocs;
}
end = Math.min(hits.length, start + hitsPerPage); //3
and 5.
for (int i = start; i< end; i++) { //0 to 3.
if (raw) {
System.out.println("doc="+hits[i].doc+"
score="+hits[i].score);
}
Document doc = searcher.doc(hits[i].doc);
List<IndexableField> filed=doc.getFields();
filed.size();
String path = doc.get("path");
if (path != null) {
System.out.println((i+1) + ". " + path);
String title = doc.get("title");
if (title != null) {
System.out.println(" Title: " + doc.get("title"));
}
} else {
System.out.println((i+1) + ". " + "No path for this
document");
}
}
if (!interactive || end == 0) {
break;
}
if (numTotalHits>= end) {
boolean quit = false;
while (true) {
System.out.print("Press ");
if (start - hitsPerPage>= 0) {
System.out.print("(p)revious page, ");
}
if (start + hitsPerPage< numTotalHits) {
System.out.print("(n)ext page, ");
}
System.out.println("(q)uit or enter number to jump to a
page.");
String line = in.readLine();
if (line.length() == 0 || line.charAt(0)=='q') {
quit = true;
break;
}
if (line.charAt(0) == 'p') {
start = Math.max(0, start - hitsPerPage);
break;
} else if (line.charAt(0) == 'n') {
if (start + hitsPerPage< numTotalHits) {
start+=hitsPerPage;
}
break;
} else {
int page = Integer.parseInt(line);
if ((page - 1) * hitsPerPage< numTotalHits) {
start = (page - 1) * hitsPerPage;
break;
} else {
System.out.println("No such page");
}
}
}
if (quit) break;
end = Math.min(numTotalHits, start + hitsPerPage);
}
}
}
}
/*CUSTOM ANALYZER CODE:*/
package com.rancore.demo;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;
public class CustomAnalyzerForCaseSensitive extends StopwordAnalyzerBase {

    /** Default maximum token length (matches StandardAnalyzer's default). */
    public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;

    private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

    /** English stop-word set borrowed from StopAnalyzer. */
    public static final CharArraySet STOP_WORDS_SET =
            StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    /** Builds the analyzer with an explicit stop-word set. */
    public CustomAnalyzerForCaseSensitive(Version matchVersion, CharArraySet stopWords) {
        super(matchVersion, stopWords);
    }

    /** Builds the analyzer with the default English stop words. */
    public CustomAnalyzerForCaseSensitive(Version matchVersion) {
        this(matchVersion, STOP_WORDS_SET);
    }

    /** Builds the analyzer, loading the stop-word set from the given reader. */
    public CustomAnalyzerForCaseSensitive(Version matchVersion, Reader stopwords)
            throws IOException {
        this(matchVersion, loadStopwordSet(stopwords, matchVersion));
    }

    /** Sets the longest token (in chars) the tokenizer will emit. */
    public void setMaxTokenLength(int length) {
        maxTokenLength = length;
    }

    /** @see #setMaxTokenLength */
    public int getMaxTokenLength() {
        return maxTokenLength;
    }

    /**
     * Token chain: StandardTokenizer -> StandardFilter -> StopFilter.
     * Unlike StandardAnalyzer there is deliberately NO LowerCaseFilter,
     * which is what makes this analyzer case-sensitive.
     */
    @Override
    protected TokenStreamComponents createComponents(final String fieldName,
            final Reader reader) {
        final StandardTokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
        tokenizer.setMaxTokenLength(maxTokenLength);

        TokenStream chain = new StandardFilter(matchVersion, tokenizer);
        chain = new StopFilter(matchVersion, chain, stopwords);

        return new TokenStreamComponents(tokenizer, chain) {
            @Override
            protected void setReader(final Reader reader) throws IOException {
                // Re-apply the current limit each time this component is reused.
                tokenizer.setMaxTokenLength(
                        CustomAnalyzerForCaseSensitive.this.maxTokenLength);
                super.setReader(reader);
            }
        };
    }
}
I HOPE I HAVE GIVEN THE COMPLETE CODE SAMPLE FOR PEOPLE TO WORK ON..
PLEASE GUIDE ME NOW: IN case any further information is required
please
let
me know.
On 8/14/2013 7:43 PM, Ian Lea wrote:
Well, you have supplied a bit more info - good - but I still can't
spot the problem. Unless someone else can I suggest you post a very
small self-contained program that demonstrates the problem.
--
Ian.
On Wed, Aug 14, 2013 at 2:50 PM, Ankit Murarka
<ankit.mura...@rancoretech.com> wrote:
Hello.
The problem does not seem to be getting solved.
As mentioned, I am indexing each line of each file.
The sample text present inside LUKE is
<am name="notification" value="10"/>\
<type="DE">\
java.lang.Thread.run(Thread.java:619)
Size of list array::0\
at java.lang.reflect.Method.invoke(Method.java:597)
org.com.dummy,INFO,<< Still figuring out how to run
,SERVER,100.100.100.100:8080,EXCEPTION,10613349
INSIDE POST OF Listener\
In my Luke, I can see the text as "INSIDE POST OF Listener" ..
This is
present in many files.
The query being fired is: +contents:INSIDE +contents:POST
The field name is "contents", the same analyzer is used for indexing and
searching, and this is a boolean query with both clauses set to MUST.
To test, I indexed only 20 files. In 19 files, this is present.
The boolean query should give me a hit for this document.
BUT IT IS RETURNING ME NO HIT..
If I index the same files WITHOUT splitting them line by line, the query
returns the expected hits. However, I need it to work on the index
created by line-by-line parsing as well.
Please guide.
On 8/13/2013 4:41 PM, Ian Lea wrote:
remedialaction != "remedial action"?
Show us your query. Show a small self-contained sample program or
test case that demonstrates the problem. You need to give us
something more to go on.
--
Ian.
On Tue, Aug 13, 2013 at 11:13 AM, Ankit Murarka
<ankit.mura...@rancoretech.com> wrote:
Hello,
I am aware of that link and I have been through
that link
many
number of times.
Problem I have is:
1. Each line is indexed. So indexed line looks something like
"<attribute
name="remedial action" value="Checking"/>\"
2. I am easily firing a phrase query on this line. It suggest
me the
possible values. No problem,.
3. If I fire a Boolean Query with "remedialaction" and
"Checking" as
a
must/must , then it is not providing me this document as a hit.
4. I am using StandardAnalyzer both during the indexing and
searching
time.
On 8/13/2013 2:31 PM, Ian Lea wrote:
Should be straightforward enough. Work through the tips in
the FAQ
entry at
http://wiki.apache.org/lucene-java/LuceneFAQ#Why_am_I_getting_no_hits_.2F_incorrect_hits.3F
and post back if that doesn't help, with details of how you are
analyzing the data and how you are searching.
--
Ian.
On Tue, Aug 13, 2013 at 8:56 AM, Ankit Murarka
<ankit.mura...@rancoretech.com> wrote:
Hello All,
I have 2 different usecases.
I am trying to provide both boolean query and phrase search
query
in
the
application.
In every line of the document which I am indexing I have
content
like
:
<attribute name="remedial action" value="Checking"/>\
Due to the phrase search requirement, I am indexing each
line of
the
file
as
a new document.
Now when I am trying to do a phrase query (Did you Mean, Infix
Analyzer
etc,
or phrase suggest) this seems to work fine and provide me with
desired
suggestions.
Problem is :
How do I invoke boolean query for this. I mean when I
verified the
indexes
in Luke, I saw the whole line as expected is indexed.
So, if user wish to perform a boolean query say suppose
containing
"remedialaction" and "Checking" how do I get this document as a
hit.
I
believe since I am indexing each line, this seems to be bit
tricky.
Please guide.
--
Regards
Ankit
---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscr...@lucene.apache.org
For additional commands, e-mail:
java-user-h...@lucene.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscr...@lucene.apache.org
For additional commands, e-mail:
java-user-h...@lucene.apache.org
--
Regards
Ankit Murarka
"What lies behind us and what lies before us are tiny matters
compared
with
what lies within us"
---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscr...@lucene.apache.org
For additional commands, e-mail: java-user-h...@lucene.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscr...@lucene.apache.org
For additional commands, e-mail: java-user-h...@lucene.apache.org
--
Regards
Ankit Murarka
"What lies behind us and what lies before us are tiny matters
compared
with
what lies within us"
---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscr...@lucene.apache.org
For additional commands, e-mail: java-user-h...@lucene.apache.org
--
Regards
Ankit Murarka
"What lies behind us and what lies before us are tiny matters
compared
with
what lies within us"