Problem with indexing/merging indices - documents not indexed.

[EMAIL PROTECTED] Mon, 06 Dec 2004 14:54:02 -0800

Hello all

After reading the list for more than a year, I've finally decided (got courage) to post my first question. I'm not an expert in Lucene or Java, but I can find my way around it and right now I'm having a problem that I hope this list could help me out with.

I'm using MySQL to store document records (9 document types, about 30-40 fields per document type, of various lengths and purposes), which are also indexed with Lucene upon creation and after updates. This is done by calling a Java class I wrote from PHP, passing along the parameters needed for indexing. But for whatever reason, documents are not being indexed, nor are indices being merged for that matter.

Attached is a test.java source that I wrote based on my existing indexing code, as a command-line test class that indexes a single document based on two arguments: doctype and fileno. I have tried indexing directly into the existing index, and also indexing into a RAMDirectory and then merging that index with the existing index. In both cases the document does not get merged into the existing FS index: the count of documents does not increase in the existing index, and in the case of a new index the count is 0 and the only file in the index directory is "segments".

I have read Otis's article "Advanced Text Indexing with Lucene" and that's where I got the idea to try to first index into RAM and then merge indices instead of directly indexing, just for a sanity test, and while Otis's code works fine, my alteration does not.

I would appreciate any feedback on my code and whether I'm doing something in a wrong way, because I'm at a total loss right now as to why documents are not being indexed at all.


thanks in advance,

-pedja

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.DateField;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.*;


import java.sql.*;
import java.io.*;

import java.util.Date;
import java.lang.Long;
import java.lang.Exception;

public class test {

  public static void addDoc(IndexWriter writer, String doctype, String 
crofileno) throws IOException {

    String cs             = "jdbc:mysql://localhost/db?user=user&pass=pass";
    String filePath       = "/home/httpd/files/documents";
    String docdir         = filePath + "/" + doctype.substring(0,4) + "/";
    StringBuffer mySpath   = new StringBuffer(docdir);
    StringBuffer myFpath  = new StringBuffer(docdir);
    String sql            = "SELECT * FROM " + doctype + "form WHERE 
crofileno=\"" + crofileno + "\"";

    // make a new, empty document
    Document doc = new Document();

    // establish a connection to MySQL database
    try {
        Class.forName("com.mysql.jdbc.Driver").newInstance();
    } catch (Exception e) {
        System.out.println("Lucene: ERROR: Unable to load driver");
        e.printStackTrace();
    }

    // get the record data...
    try {

       Connection conn = DriverManager.getConnection(cs);
       Statement Stmt = conn.createStatement();
       ResultSet RS = Stmt.executeQuery(sql);

       if(RS.next()) {
          String docstatus = RS.getString(2);
          String publish = RS.getString(3);
          String aol = RS.getString(4);
          String aols = RS.getString(5);
          String court = RS.getString(6);
          String courtfileno = RS.getString(7);
          String casename = RS.getString(8);
          String apj = RS.getString(9);
          String significance = RS.getString(10);
          String keywords = RS.getString(11);
          String medcond = RS.getString(12);

          String summaryFile = RS.getString(14);
          String summarypublish = RS.getString(17);
          String textFile = RS.getString(18);
          String textpublish = RS.getString(20);

          String rellegi = RS.getString(22);
          String intusecom = RS.getString(23);
          String extusecom = RS.getString(24);
          String reldocint = RS.getString(25);
          String relcomint = RS.getString(26);
          String reldocext = RS.getString(27);
          String relcomext = RS.getString(28);

          String dyndate1 = RS.getString(30);
          if (dyndate1.length() > 5)
            dyndate1 = dyndate1.substring(0,10).replaceAll("-","");
          else
            dyndate1 = "";

          String dyndate2 = RS.getString(31);
          if (dyndate2.length() > 5)
            dyndate2 = dyndate2.substring(0,10).replaceAll("-","");
          else
            dyndate2 = "";

          String dyndate3 = RS.getString(32);
          if (dyndate3.length() > 5)
            dyndate3 = dyndate3.substring(0,10).replaceAll("-","");
          else
            dyndate3 = "";

          java.sql.Date filestamp = RS.getDate(33);

          ///remove the -s and replace with spaces
          aol =  aol.replace(',',' ').trim();
          aols =  aols.replace('-',' ').trim();
          court = court.replace('-',' ').trim();
          keywords = keywords.replace('-',' ').trim();
          medcond = medcond.replace('-',' ').trim();

          // add the first group of fields
          //
          doc.add(Field.Keyword("crofileno", crofileno));
          doc.add(Field.Keyword("doctype", doctype));
          doc.add(Field.Keyword("docstatus", docstatus));
          doc.add(Field.Keyword("publish", publish));
          doc.add(Field.Text("aol", aol));
          doc.add(Field.Text("aols", aols));
          doc.add(Field.Text("court", court));
          doc.add(Field.Text("courtfileno", courtfileno));
          doc.add(Field.Text("casename", casename));
          doc.add(Field.Text("apj", apj));
          doc.add(Field.Keyword("significance", significance));
          doc.add(Field.Text("keywords", keywords));
          doc.add(Field.Text("medcond", medcond));


          // start dealing with the files...
          //
          StringBuffer mydate = new StringBuffer(filestamp.toString());
          String myyear = mydate.substring(0,4);
          String mymonth = mydate.substring(5,7);

          if (!summaryFile.equalsIgnoreCase("")) {
                summaryFile =  mySpath.append(myyear + "/").append(mymonth + 
"/").append(crofileno).append("s.doc").toString();
          }
          
          if (!textFile.equalsIgnoreCase("")) {
                textFile =  myFpath.append(myyear + "/").append(mymonth + 
"/").append(crofileno).append("t.doc").toString();
          }

          File summaryF = new File(summaryFile);
          File textF = new File(textFile);

          // Add the contents of the file a field named "summaryfile".  Use a 
Text
          // field, specifying a Reader, so that the text of the file is 
tokenized.

          if (summaryF.exists()) {
             try {
                String[] cmd = { "/usr/local/bin/wvWare", "-1", "-x 
wvText.xml", summaryFile, "2>&1" };
                StringBuffer sb = new StringBuffer();
                Process proc = Runtime.getRuntime().exec(cmd);
                InputStream istr = proc.getInputStream();
                BufferedReader br = new BufferedReader(new 
InputStreamReader(istr));
                String str;
                while ((str = br.readLine()) != null)
                        sb.append(str);
                String s = sb.toString();
                doc.add(Field.Text("summary", s));
                System.out.println("Lucene Conversion: summary<br>");
             }     
             catch (IOException err) {
                err.printStackTrace();
             }
          }

          doc.add(Field.UnIndexed("summarypublish", summarypublish));

          // Add the contents of the file a field named "textfile".  Use a Text
          // field, specifying a Reader, so that the text of the file is 
tokenized.
          if (textF.exists()) {
             try {
                String[] cmd = { "/usr/local/bin/wvWare", "-1", "-x 
wvText.xml", textFile, "2>&1" };
                StringBuffer sb = new StringBuffer();
                Process proc = Runtime.getRuntime().exec(cmd);
                InputStream istr = proc.getInputStream();
                BufferedReader br = new BufferedReader(new 
InputStreamReader(istr));
                String str;
                while ((str = br.readLine()) != null)
                        sb.append(str);
                String s = sb.toString();
                doc.add(Field.UnStored("text", s));
                System.out.println("Lucene Conversion: fulltext<br>");
             }
             catch (Exception err) {
                err.printStackTrace();
             }  
          }

          doc.add(Field.UnIndexed("textpublish", textpublish));


          // add the remaining fields
          //
          doc.add(Field.Text("rellegi", rellegi));
          doc.add(Field.Text("intusecom", intusecom));
          doc.add(Field.Text("extusecom", extusecom));
          doc.add(Field.Text("reldocint", reldocint));
          doc.add(Field.Text("relcomint", relcomint));
          doc.add(Field.Text("reldocext", reldocext));
          doc.add(Field.Text("relcomext", relcomext));

          // add the dates
          //
          doc.add(Field.Keyword("dyndate1", dyndate1));
          doc.add(Field.Keyword("dyndate2", dyndate2));
          doc.add(Field.Keyword("dyndate3", dyndate3));

          System.out.println(writer);
          // add the document
          writer.addDocument(doc);
       }

       RS.close();
       Stmt.close();
       conn.close();

    } catch(SQLException e) {
       System.out.println("Lucene: ERROR: SQLException: " + e);
       System.out.println("Lucene: ERROR: SQLState:     " + e);
       System.out.println("Lucene: ERROR: VendorError:  " + e);
    }
  }


  public static void main(String[] args) throws Exception {
        String indexDir =
            System.getProperty("java.io.tmpdir", "tmp") +
            System.getProperty("file.separator") + "index";

    try{

      Date start                = new Date();
      Analyzer analyzer         = new StandardAnalyzer();
      String doctype            = args[0];
      String crofileno          = args[1];

        /*
      IndexReader reader = IndexReader.open(indexDir);;
      int deleted = reader.delete(new Term("crofileno", crofileno));
      System.out.println("Lucene deleted records: " + deleted + "<br>");
      reader.close();
        */

      // let's make two writers, RAM and FS so that we index to RAM first then 
merge at the end..
      //
      RAMDirectory ramDir       = new RAMDirectory();
      IndexWriter ramWriter        = new IndexWriter(ramDir, analyzer, true);
      addDoc(ramWriter, doctype, crofileno);
      System.out.println("Docs In the RAM index: " + ramWriter.docCount());

      IndexWriter fsWriter      = new IndexWriter(indexDir, analyzer, true);
      //fsWriter.setUseCompoundFile(false);
      //fsWriter.mergeFactor  = 1000;
      //fsWriter.maxMergeDocs = 100000;
      fsWriter.addIndexes(new Directory[] { ramDir });
      //fsWriter.optimize();
      System.out.println("Docs in the FS index: " + fsWriter.docCount());
      ramWriter.close();
      fsWriter.close();

      Date end = new Date();
      System.out.println("Lucene Added OK: " + Long.toString(end.getTime() - 
start.getTime()) + " total milliseconds<br>");

    } catch (IOException e) {
        throw new Exception("Something bad happened: " + e.getClass() + " with 
message: " + e.getMessage());
    } catch (Exception e) {
        throw new Exception(" caught a " + e.getClass() + "\n with message: " + 
e.getMessage());
    }
  }
}

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Problem with indexing/merging indices - documents not indexed.

Reply via email to