RE: TVD, TVX and TVF files

Luis Paiva Mon, 02 Apr 2012 10:29:18 -0700

Thank you for your help. 
I still haven't found a solution. I'm copying all my code below.


BTW, I'm working with lucene version 3.5.0

@Mike: Yes, I do close it :) Some files are created, namely: .fdt, .fdx,
.fnm, .frq, .nrm, .prx, .tii, .tis.

I don't know why the T* files are not created. 

@Uwe: I think I'm not getting any compound files. Only those above. 

Does anyone have the same issue? 



CODE --------------------------- xx -------------------------------


package lucene;

import java.io.*;
import java.util.ArrayList;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * This terminal application creates an Apache Lucene index in a folder and
adds files into this index
 * based on the input of the user.
 */
public class TextFileIndexer {

  private IndexWriter writer;
  private ArrayList<File> queue = new ArrayList<File>();

  public static void main(String[] args) throws IOException {
    System.out.println("Enter the path where the index will be created: ");

    BufferedReader br = new BufferedReader(
            new InputStreamReader(System.in));
    String s = br.readLine();

    TextFileIndexer indexer = null;
    try {
      indexer = new TextFileIndexer(s);
    } catch (Exception ex) {
      System.out.println("Cannot create index..." + ex.getMessage());
      System.exit(-1);
    }

    //===================================================
    //read input from user until he enters q for quit
    //===================================================
    while (!s.equalsIgnoreCase("q")) {
      try {
        System.out.println("Enter the file or folder name to add into the
index (q=quit):");
        System.out.println("[Acceptable file types: .xml, .html, .html,
.txt]");
        s = br.readLine();
        if (s.equalsIgnoreCase("q")) {
          break;
        }

        //try to add file into the index
        indexer.indexFileOrDirectory(s);
      } catch (Exception e) {
        System.out.println("Error indexing " + s + " : " + e.getMessage());
      }
    }

    //===================================================
    //after adding, we always have to call the
    //closeIndex, otherwise the index is not created    
    //===================================================
    indexer.closeIndex();
  }

  /**
   * Constructor
   * @param indexDir the name of the folder in which the index should be
created
   * @throws java.io.IOException
   */
  TextFileIndexer(String indexDir) throws IOException {
    // the boolean true parameter means to create a new index everytime, 
    // potentially overwriting any existing files there.
    FSDirectory dir = FSDirectory.open(new File(indexDir));

    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_34);

    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_34,
analyzer);

    writer = new IndexWriter(dir, config);
  }

  /**
   * Indexes a file or directory
   * @param fileName the name of a text file or a folder we wish to add to
the index
   * @throws java.io.IOException
   */
  public void indexFileOrDirectory(String fileName) throws IOException {
    //===================================================
    //gets the list of files in a folder (if user has submitted
    //the name of a folder) or gets a single file name (is user
    //has submitted only the file name) 
    //===================================================
    addFiles(new File(fileName));
    
    int originalNumDocs = writer.numDocs();
    for (File f : queue) {
      FileReader fr = null;
      try {
        Document doc = new Document();

        //===================================================
        // add contents of file
        //===================================================
        fr = new FileReader(f);
        doc.add(new Field("contents", fr));
        
        

        //===================================================
        //adding second field which contains the path of the file
        //===================================================
        doc.add(new Field("path", fileName,
                Field.Store.YES,
                Field.Index.NOT_ANALYZED));

        writer.addDocument(doc);
        System.out.println("Added: " + f);
      } catch (Exception e) {
        System.out.println("Could not add: " + f);
      } finally {
        fr.close();
      }
    }
    
    int newNumDocs = writer.numDocs();
    System.out.println("");
    System.out.println("************************");
    System.out.println((newNumDocs - originalNumDocs) + " documents
added.");
    System.out.println("************************");

    queue.clear();
  }

  private void addFiles(File file) {

    if (!file.exists()) {
      System.out.println(file + " does not exist.");
    }
    if (file.isDirectory()) {
      for (File f : file.listFiles()) {
        addFiles(f);
      }
    } else {
      String filename = file.getName().toLowerCase();
      //===================================================
      // Only index text files
      //===================================================
      if (filename.endsWith(".htm") || filename.endsWith(".html") || 
              filename.endsWith(".xml") || filename.endsWith(".txt")) {
        queue.add(file);
      } else {
        System.out.println("Skipped " + filename);
      }
    }
  }

  /**
   * Close the index.
   * @throws java.io.IOException
   */
  public void closeIndex() throws IOException {
    writer.close();
  }
}

END OF CODE --------------------------- xx -------------------------------


-----Mensagem original-----
De: Uwe Schindler [mailto:u...@thetaphi.de] 
Enviada: terça-feira, 27 de Março de 2012 19:19
Para: java-user@lucene.apache.org
Assunto: RE: TVD, TVX and TVF files

Maybe you only see CFS files? If this is the case, your index is in compound
file format. In that case (the default), to get the raw files, disable
compound files in the merge policy!

-----
Uwe Schindler
H.-H.-Meier-Allee 63, D-28213 Bremen
http://www.thetaphi.de
eMail: u...@thetaphi.de

> -----Original Message-----
> From: Michael McCandless [mailto:luc...@mikemccandless.com]
> Sent: Tuesday, March 27, 2012 8:13 PM
> To: java-user@lucene.apache.org
> Subject: Re: TVD, TVX and TVF files
> 
> The code seems OK on quick glance...
> 
> Are you closing the writer?
> 
> Are you hitting any exceptions?
> 
> Mike McCandless
> 
> http://blog.mikemccandless.com
> 
> On Tue, Mar 27, 2012 at 12:19 PM, Luis Paiva <luismpa...@mail.telepac.pt>
> wrote:
> > Hey all,
> >
> > i'm in my first steps in Lucene.
> > I was trying to index some txt files, and my program doesn't construct
> > the term vector files. I would need these files. (.tvd, .tvx, .tvf)
> >
> > I'm attaching my code so anyone can help me.
> > Thank you all in advance!
> >
> > Sorry if i'm repeating the question, but i couldn't find the answer to
it.
> >
> >
> > public void indexFileOrDirectory(String fileName) throws IOException {
> >
> >    addFiles(new File(fileName));
> >
> >    int originalNumDocs = writer.numDocs();
> >    for (File f : queue) {
> >      FileReader fr = null;
> >      try {
> >        Document doc = new Document();
> >
> >        fr = new FileReader(f);
> >        doc.add(new Field("contents", fr));
> >
> >        doc.add(new Field("path", fileName, Field.Store.YES,
> > Field.Index.NOT_ANALYZED));
> >
> >        String xpto = "xpto1 xpto2 xpto3";
> >        doc.add(new Field("contents2", xpto, Field.Store.YES,
> > Field.Index.ANALYZED, Field.TermVector.YES));
> >
> >        writer.addDocument(doc);
> >        System.out.println("Added: " + f);
> >      } catch (Exception e) {
> >        System.out.println("Could not add: " + f);
> >      } finally {
> >        fr.close();
> >      }
> >
> >
> > ---------------------------------------------------------------------
> > To unsubscribe, e-mail: java-user-unsubscr...@lucene.apache.org
> > For additional commands, e-mail: java-user-h...@lucene.apache.org
> >
> 
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-user-unsubscr...@lucene.apache.org
> For additional commands, e-mail: java-user-h...@lucene.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscr...@lucene.apache.org
For additional commands, e-mail: java-user-h...@lucene.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscr...@lucene.apache.org
For additional commands, e-mail: java-user-h...@lucene.apache.org

RE: TVD, TVX and TVF files

Reply via email to