Update of /cvsroot/nutch/playground/src/java/net/nutch/indexer
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv10313/src/java/net/nutch/indexer

Added Files:
        NutchSimilarity.java DeleteDuplicates.java IndexSegment.java 
        package.html IndexOptimizer.java IndexMerger.java 
        HighFreqTerms.java 
Log Message:
initial commit

--- NEW FILE: NutchSimilarity.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.indexer;

import org.apache.lucene.search.DefaultSimilarity;

/** Similarity implementation used by Nutch indexing and search. */
public class NutchSimilarity extends DefaultSimilarity  {
  private static final int MIN_CONTENT_LENGTH = 1000;

  /** Normalize field by length. */
  public float lengthNorm(String fieldName, int numTokens) {
    if ("url".equals(fieldName)) {                // URL: prefer short
      return 1.0f / numTokens;                    // use linear normalization
      
    } else if ("content".equals(fieldName)) {     // Content: penalize short
      return super.lengthNorm(fieldName,          // treat short as longer
                              Math.max(numTokens, MIN_CONTENT_LENGTH));

    } else {                                      // Anchor: use default
      return super.lengthNorm(fieldName, numTokens);
    }
  }
}
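
A minimal sketch of the norms these rules produce, assuming the stock DefaultSimilarity.lengthNorm(field, n) is 1/sqrt(n) (the NormSketch class is only for illustration):

import net.nutch.indexer.NutchSimilarity;

public class NormSketch {
  public static void main(String[] args) {
    NutchSimilarity sim = new NutchSimilarity();
    System.out.println(sim.lengthNorm("url", 4));        // 0.25: linear, so short URLs are preferred
    System.out.println(sim.lengthNorm("content", 50));   // ~0.0316, same as a 1000-token page
    System.out.println(sim.lengthNorm("content", 4000)); // ~0.0158 = 1/sqrt(4000)
    System.out.println(sim.lengthNorm("anchor", 3));     // ~0.577, the unmodified default
  }
}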

--- NEW FILE: DeleteDuplicates.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.indexer;

import net.nutch.io.*;
import net.nutch.util.LogFormatter;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.document.Document;

import java.io.*;
import java.util.logging.Logger;
import java.security.MessageDigest;

/** Deletes duplicate documents in a set of Lucene indexes.
 * Duplicates have either the same contents (via MD5 hash) or the same URL.
 */
public class DeleteDuplicates {
  private static final Logger LOG =
    LogFormatter.getLogger("net.nutch.indexer.DeleteDuplicates");

  /** The key used in sorting for duplicates. */
  public static class IndexedDoc implements WritableComparable {
    private MD5Hash hash = new MD5Hash();
    private float score;
    private int index;                            // the segment index
    private int doc;                              // within the index
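    // serialized layout (see write below): 16-byte MD5 hash, 4-byte float score,
    // 4-byte index, 4-byte doc -- the raw-byte comparators rely on these offsets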

    public void write(DataOutput out) throws IOException {
      hash.write(out);
      out.writeFloat(score);
      out.writeInt(index);
      out.writeInt(doc);
    }

    public void readFields(DataInput in) throws IOException {
      hash.readFields(in);
      this.score = in.readFloat();
      this.index = in.readInt();
      this.doc = in.readInt();
    }

    public int compareTo(Object o) {
      throw new RuntimeException("this is never used");
    }

    /** Order equal hashes by decreasing score. */
    public static class ByHashScore extends WritableComparator {
      public ByHashScore() { super(IndexedDoc.class); }
      
      public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2){
        int c = compareBytes(b1, s1, MD5Hash.MD5_LEN, b2, s2, MD5Hash.MD5_LEN);
        if (c != 0)
          return c;

        float thisScore = readFloat(b1, s1+MD5Hash.MD5_LEN);
        float thatScore = readFloat(b2, s2+MD5Hash.MD5_LEN);
        return (thisScore<thatScore ? 1 : (thisScore==thatScore ? 0 : -1));
      }
    }

    /** Order equal hashes by decreasing index and document. */
    public static class ByHashDoc extends WritableComparator {
      public ByHashDoc() { super(IndexedDoc.class); }
      
      public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2){
        int c = compareBytes(b1, s1, MD5Hash.MD5_LEN, b2, s2, MD5Hash.MD5_LEN);
        if (c != 0)
          return c;

        int thisIndex = readInt(b1, s1+MD5Hash.MD5_LEN+4);
        int thatIndex = readInt(b2, s2+MD5Hash.MD5_LEN+4);

        if (thisIndex != thatIndex)
          return thatIndex - thisIndex;

        int thisDoc = readInt(b1, s1+MD5Hash.MD5_LEN+8);
        int thatDoc = readInt(b2, s2+MD5Hash.MD5_LEN+8);

        return thatDoc - thisDoc;
      }
    }
  }

  private interface Hasher {
    void updateHash(MD5Hash hash, Document doc);
  }

  private IndexReader[] readers;
  private String tempFile;

  /** Constructs a duplicate detector for the provided indexes. */
  public DeleteDuplicates(IndexReader[] readers, String tempFile) {
    this.readers = readers;
    this.tempFile = tempFile;
  }

  /** Closes the indexes, saving changes. */
  public void close() throws IOException {
    for (int i = 0; i < readers.length; i++)
      readers[i].close();
  }

  /** Delete pages with duplicate content hashes.  Of those with the same
   * content hash, keep the page with the highest score. */
  public void deleteContentDuplicates() throws IOException {
    LOG.info("Reading content hashes...");
    computeHashes(new Hasher() {
        public void updateHash(MD5Hash hash, Document doc) {
          hash.setDigest(doc.get("digest"));
        }
      });

    LOG.info("Sorting content hashes...");
    SequenceFile.Sorter byHashScoreSorter =
      new SequenceFile.Sorter(new IndexedDoc.ByHashScore(),NullWritable.class);
    byHashScoreSorter.sort(tempFile, tempFile + ".sorted");
    
    LOG.info("Deleting content duplicates...");
    int duplicateCount = deleteDuplicates();
    LOG.info("Deleted " + duplicateCount + " content duplicates.");
  }

  /** Delete pages with duplicate URLs.  Of those with the same
   * URL, keep the most recently fetched page. */
  public void deleteUrlDuplicates() throws IOException {
    final MessageDigest digest;
    try {
      digest = MessageDigest.getInstance("MD5");
    } catch (Exception e) {
      throw new RuntimeException(e.toString());
    }

    LOG.info("Reading url hashes...");
    computeHashes(new Hasher() {
        public void updateHash(MD5Hash hash, Document doc) {
          try {
            digest.update(UTF8.getBytes(doc.get("url")));
            digest.digest(hash.getDigest(), 0, MD5Hash.MD5_LEN);
          } catch (Exception e) {
            throw new RuntimeException(e.toString());
          }
        }
      });

    LOG.info("Sorting url hashes...");
    SequenceFile.Sorter byHashDocSorter =
      new SequenceFile.Sorter(new IndexedDoc.ByHashDoc(), NullWritable.class);
    byHashDocSorter.sort(tempFile, tempFile + ".sorted");
    
    LOG.info("Deleting url duplicates...");
    int duplicateCount = deleteDuplicates();
    LOG.info("Deleted " + duplicateCount + " url duplicates.");
  }

  private void computeHashes(Hasher hasher) throws IOException {
    IndexedDoc indexedDoc = new IndexedDoc();

    SequenceFile.Writer writer =
      new SequenceFile.Writer(tempFile, IndexedDoc.class, NullWritable.class);
    try {
      for (int index = 0; index < readers.length; index++) {
        IndexReader reader = readers[index];
        int readerMax = reader.maxDoc();
        indexedDoc.index = index;
        for (int doc = 0; doc < readerMax; doc++) {
          if (!reader.isDeleted(doc)) {
            Document document = reader.document(doc);
            hasher.updateHash(indexedDoc.hash, document);
            indexedDoc.score = Float.parseFloat(document.get("boost"));
            indexedDoc.doc = doc;
            writer.append(indexedDoc, NullWritable.get());
          }
        }
      }
    } finally {
      writer.close();
    }
  }

  private int deleteDuplicates() throws IOException {
    if (new File(tempFile).exists())
      new File(tempFile).delete();
    if (!new File(tempFile + ".sorted").renameTo(new File(tempFile)))
      throw new IOException("Couldn't rename!");

    IndexedDoc indexedDoc = new IndexedDoc();
    SequenceFile.Reader reader = new SequenceFile.Reader(tempFile);
    try {
      int duplicateCount = 0;
      MD5Hash prev = null;                        // previous hash
      while (reader.next(indexedDoc, NullWritable.get())) {
        if (prev == null) {                       // initialize prev
          prev = new MD5Hash();
          prev.set(indexedDoc.hash);
          continue;
        }
        if (indexedDoc.hash.equals(prev)) {        // found a duplicate
          readers[indexedDoc.index].delete(indexedDoc.doc); // delete it
          duplicateCount++;
        } else {
          prev.set(indexedDoc.hash);               // reset prev
        }
      }
      return duplicateCount;
    } finally {
      reader.close();
      new File(tempFile).delete();
    }
  }

  /** Delete duplicates in the indexes in the named directory. */
  public static void main(String[] args) throws Exception {
    String usage = "DeleteDuplicates <segmentsDir> <tempFile>";

    if (args.length != 2) {
      System.err.println("Usage: " + usage);
      return;
    } 

    String segmentsDir = args[0];
    String tempFile = args[1];

    File[] directories = new File(segmentsDir).listFiles();
    IndexReader[] readers = new IndexReader[directories.length];
    int maxDoc = 0;
    for (int i = 0; i < directories.length; i++) {
      File indexDir = new File(directories[i], "index");
      IndexReader reader = IndexReader.open(indexDir);
      if (reader.hasDeletions()) {
        LOG.info("Clearing old deletions in " + indexDir);
        reader.undeleteAll();
      }
      maxDoc += reader.maxDoc();
      readers[i] = reader;
    }

    DeleteDuplicates dd = new DeleteDuplicates(readers, tempFile);
    dd.deleteUrlDuplicates();
    dd.deleteContentDuplicates();
    dd.close();
  }
}
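
A minimal sketch of driving both passes programmatically, equivalent to what main() does (the segments path and temp file below are hypothetical):

import java.io.File;
import org.apache.lucene.index.IndexReader;
import net.nutch.indexer.DeleteDuplicates;

public class DedupSketch {
  public static void main(String[] args) throws Exception {
    File[] segments = new File("segments").listFiles();        // hypothetical segments dir
    IndexReader[] readers = new IndexReader[segments.length];
    for (int i = 0; i < segments.length; i++)
      readers[i] = IndexReader.open(new File(segments[i], "index"));
    DeleteDuplicates dd = new DeleteDuplicates(readers, "/tmp/dedup");  // hypothetical temp file
    dd.deleteUrlDuplicates();       // same URL: keep the most recently fetched page
    dd.deleteContentDuplicates();   // same content hash: keep the highest-scoring page
    dd.close();                     // close the readers, committing the deletions
  }
}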

--- NEW FILE: IndexSegment.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.indexer;

import net.nutch.pagedb.*;
import net.nutch.linkdb.*;
import net.nutch.fetcher.*;
import net.nutch.analysis.NutchDocumentAnalyzer;
import net.nutch.db.*;
import net.nutch.io.*;
import net.nutch.util.*;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import java.util.logging.Logger;
import java.util.Date;
import java.io.File;
import java.io.EOFException;

/** Creates an index for the output corresponding to a single fetcher run. */
public class IndexSegment {
  public static final String DONE_NAME = "index.done";
  public static final Logger LOG =
    LogFormatter.getLogger("net.nutch.index.IndexSegment");

  private float scorePower = NutchConf.getFloat("indexer.score.power", 0.5f);

  private int maxTitleLength =
    NutchConf.getInt("indexer.max.title.length", 100);

  private File directory = null;
  private int maxDocs = Integer.MAX_VALUE;

  /** Determines the power of link analysis scores.  Each page's boost is
   * set to <i>score<sup>scorePower</sup></i> where <i>score</i> is its link
   * analysis score and <i>scorePower</i> is the value passed to this method.
   */
  public void setScorePower(float power) { scorePower = power; }

  private void indexPages() throws Exception {
    IndexWriter writer
      = new IndexWriter(new File(directory, "index"),
                        new NutchDocumentAnalyzer(), true);
    writer.mergeFactor = 50;
    writer.infoStream = System.out;
    writer.setSimilarity(new NutchSimilarity());

    ArrayFile.Reader fetcher =
      new ArrayFile.Reader(new File(directory, FetcherOutput.DIR_NAME).toString());
    ArrayFile.Reader text =
      new ArrayFile.Reader(new File(directory,FetcherText.DIR_NAME).toString());

    int count = 0;
    try {
      String segmentName = directory.getCanonicalFile().getName();
      FetcherOutput fetcherOutput = new FetcherOutput();
      FetcherText fetcherText = new FetcherText();

      while (fetcher.next(fetcherOutput) != null && count++ < maxDocs) {
        text.next(fetcherText);
        
        if (!fetcherOutput.getSuccess())          // if the fetch failed
          continue;                               // don't index the page

        Document doc = makeDocument(segmentName, fetcher.key(),
                                    fetcherOutput, fetcherText);
        writer.addDocument(doc);
      }
    } catch (EOFException e) {
      LOG.warning("Unexpected EOF in: " + directory +
                  " at entry #" + count + ".  Ignoring.");
    } finally {
      fetcher.close();
      text.close();
    }
    System.out.println("Optimizing index...");
    writer.optimize();
    writer.close();
  }

  private Document makeDocument(String segmentName, long docNo,
                                FetcherOutput fetcherOutput,
                                FetcherText fetcherText)
    throws Exception {

    FetchListEntry fle = fetcherOutput.getFetchListEntry();
    String url = fle.getPage().getURL().toString();
    String title = fetcherOutput.getTitle();

    if (title.length() > maxTitleLength) {        // truncate title if needed
      title = title.substring(0, maxTitleLength);
    }

    Document doc = new Document();

    // url is both stored and indexed, so it's both searchable and returned
    doc.add(Field.Text("url", url));

    // un-indexed fields: not searchable, but in hits and/or used by dedup
    doc.add(Field.UnIndexed("title", title));
    doc.add(Field.UnIndexed("digest", fetcherOutput.getMD5Hash().toString()));
    doc.add(Field.UnIndexed("docNo", Long.toString(docNo, 16)));
    doc.add(Field.UnIndexed("segment", segmentName));

    // content is indexed, so that it's searchable, but not stored in index
    doc.add(Field.UnStored("content", fetcherText.getText()));
    
    // anchors are indexed, so they're searchable, but not stored in index
    String[] anchors = fle.getAnchors();
    for (int i = 0; i < anchors.length; i++) {
      doc.add(Field.UnStored("anchor", anchors[i]));
    }

    // add title as anchor so it's searchable.  doesn't warrant its own field.
    doc.add(Field.UnStored("anchor", title));

    // compute boost
    float boost = (float)Math.pow(fle.getPage().getScore(), scorePower);
    // apply boost to all indexed fields
    doc.setBoost(boost);

    // store boost for use by explain and dedup
    doc.add(Field.UnIndexed("boost", Float.toString(boost)));
    
    return doc;
  }


  /** Create an index for the input files in the named directory. */
  public static void main(String[] args) throws Exception {
      
    String usage = "IndexSegment <segment_directory>";

    if (args.length == 0) {
      System.err.println("Usage: " + usage);
      return;
    }

    IndexSegment indexer = new IndexSegment();

    for (int i = 0; i < args.length; i++) {
      if (args[i].equals("-max")) {        // parse -max option
        indexer.maxDocs = Integer.parseInt(args[++i]);
      } else if (i != args.length-1) {
        System.err.println("Usage: " + usage);
        return;
      } else {
        indexer.directory = new File(args[i]);
      }
    }

    File fetcherDone = new File(indexer.directory, FetcherOutput.DONE_NAME);
    if (!fetcherDone.exists())                    // check fetcher done file
      throw new RuntimeException("can't index--not yet fetched: " +
                                 fetcherDone + " does not exist");

    File doneFile = new File(indexer.directory, DONE_NAME);
    if (doneFile.exists())                        // check index done file
      throw new RuntimeException("already indexed: " + doneFile + " exists");

    Date start = new Date();

    indexer.indexPages();

    Date end = new Date();

    System.out.print(end.getTime() - start.getTime());
    System.out.println(" total milliseconds");

    doneFile.createNewFile();                     // create the done file
  }

}
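
A small sketch of the boost computation performed in makeDocument (0.5 is the indexer.score.power default; the link-analysis score of 4.0 is hypothetical):

public class BoostSketch {
  public static void main(String[] args) {
    float scorePower = 0.5f;                        // default of indexer.score.power
    float linkAnalysisScore = 4.0f;                 // hypothetical page score
    float boost = (float) Math.pow(linkAnalysisScore, scorePower);
    System.out.println(boost);                      // 2.0, applied via doc.setBoost()
  }
}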

--- NEW FILE: package.html ---
<html>
<body>
Maintain Lucene full-text indexes.
</body>
</html>

--- NEW FILE: IndexOptimizer.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.indexer;

import java.util.*;
import java.io.*;

import org.apache.lucene.util.*;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;

/** Prunes an index, keeping only high-frequency terms and, for each term,
 * only its highest-scoring postings. */
public class IndexOptimizer {
  public static final String DONE_NAME = "optimize.done";

  private static final float IDF_THRESHOLD = 6.0f;
  private static final float FRACTION = 0.1f;

  private static class FilterTermDocs implements TermDocs {
    protected TermDocs in;

    public FilterTermDocs(TermDocs in) { this.in = in; }

    public void seek(Term term) throws IOException { in.seek(term); }
    public void seek(TermEnum enum) throws IOException { in.seek(enum); }
    public int doc() { return in.doc(); }
    public int freq() { return in.freq(); }
    public boolean next() throws IOException { return in.next(); }
    public int read(int[] docs, int[] freqs) throws IOException {
      return in.read(docs, freqs);
    }
    public boolean skipTo(int i) throws IOException { return in.skipTo(i); }
    public void close() throws IOException { in.close(); } 
  }

  private static class FilterTermPositions
     extends FilterTermDocs implements TermPositions {

    public FilterTermPositions(TermPositions in) { super(in); }

    public int nextPosition() throws IOException {
      return ((TermPositions)in).nextPosition();
    }
  }

  private static class FilterTermEnum extends TermEnum {
    protected TermEnum in;

    public FilterTermEnum(TermEnum in) { this.in = in; }

    public boolean next() throws IOException { return in.next(); }
    public Term term() { return in.term(); }
    public int docFreq() { return in.docFreq(); }
    public void close() throws IOException { in.close(); }
  }

  private static class OptimizingTermEnum extends FilterTermEnum {
    private IndexReader reader;
    private Similarity similarity;

    public OptimizingTermEnum(IndexReader reader, Similarity similarity)
      throws IOException {
      super(reader.terms());
      this.reader = reader;
      this.similarity = similarity;
    }

    public boolean next() throws IOException {
      while (in.next()) {
        float idf = similarity.idf(in.docFreq(), reader.maxDoc());

        if (idf <= IDF_THRESHOLD)
          return true;
      }
      return false;
    }
  }
    
  private static class ScoreDocQueue extends PriorityQueue {
    ScoreDocQueue(int size) {
      initialize(size);
    }
    
    protected final boolean lessThan(Object a, Object b) {
      ScoreDoc hitA = (ScoreDoc)a;
      ScoreDoc hitB = (ScoreDoc)b;
      if (hitA.score == hitB.score)
        return hitA.doc > hitB.doc; 
      else
        return hitA.score < hitB.score;
    }
  }

  private static class OptimizingTermPositions extends FilterTermPositions {
    private IndexReader reader;
    private TermDocs termDocs;
    private int docFreq;
    private ScoreDocQueue sdq;
    private BitSet docs;
    private Similarity similarity;

    public OptimizingTermPositions(IndexReader reader, Similarity similarity)
      throws IOException {
      super(reader.termPositions());
      this.reader = reader;
      this.termDocs = reader.termDocs();
      this.similarity = similarity;
      this.sdq = new ScoreDocQueue((int)Math.ceil(reader.maxDoc() * FRACTION));
      this.docs = new BitSet(reader.maxDoc());
    }

    public void seek(TermEnum enum) throws IOException {
      super.seek(enum);
      termDocs.seek(enum);

      byte[] norms = reader.norms(enum.term().field());

      sdq.clear();
      float minScore = 0.0f;
      int count = (int)Math.ceil(enum.docFreq() * FRACTION);
      System.out.println("Optimizing " + enum.term()
                         + " from " + enum.docFreq() 
                         + " to " + count); 
      while (termDocs.next()) {
        int doc = termDocs.doc();
        float score =
          similarity.tf(termDocs.freq()) * similarity.decodeNorm(norms[doc]);

        if (score > minScore) {
          sdq.put(new ScoreDoc(doc, score));
          if (sdq.size() > count) {               // if sdq overfull
            sdq.pop();                            // remove lowest in sdq
            minScore = ((ScoreDoc)sdq.top()).score; // reset minScore
          }
        }
      }

      docs.clear();
      while (sdq.size() != 0) {
        docs.set(((ScoreDoc)sdq.pop()).doc);
      }

    }        
        
    public boolean next() throws IOException {
      while (in.next()) {
        if (docs.get(in.doc()))
          return true;
      }
      return false;
    }
      
  }

  private static class OptimizingReader extends FilterIndexReader {
    private Similarity similarity = new NutchSimilarity();

    
    public OptimizingReader(IndexReader reader) {
      super(reader);
    }

    // don't copy any per-document data
    public int numDocs() { return 0; }
    public int maxDoc() { return 0; }

    // filter out low frequency terms
    public TermEnum terms() throws IOException {
      return new OptimizingTermEnum(in, similarity);
    }

    // filter out low-scoring postings
    public TermPositions termPositions() throws IOException {
      return new OptimizingTermPositions(in, similarity);
    }

    public boolean hasDeletions() { return false; }
  }


  private File directory;

  public IndexOptimizer(File directory) {
    this.directory = directory;
  }

  public void optimize() throws IOException {
    IndexReader reader = IndexReader.open(new File(directory, "index"));
    OptimizingReader optimizer = new OptimizingReader(reader);
    IndexWriter writer = new IndexWriter(new File(directory, "index-opt"),
                                         null, true);
    writer.addIndexes(new IndexReader[] { optimizer });
    writer.close();
  }

  /** Optimize the index under the named directory. */
  public static void main(String[] args) throws Exception {
    File directory;
      
    String usage = "IndexOptimizer directory";

    if (args.length < 1) {
      System.err.println("Usage: " + usage);
      return;
    }

    directory = new File(args[0]);

    IndexOptimizer optimizer = new IndexOptimizer(directory);

    Date start = new Date();

    optimizer.optimize();

    Date end = new Date();

    System.out.print(end.getTime() - start.getTime());
    System.out.println(" total milliseconds");
  }

}
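
A small sketch of the pruning rule: a term survives only if its idf is at most IDF_THRESHOLD (i.e. it is a common term), and for each surviving term only the top FRACTION of postings by tf*norm are kept. The maxDoc and docFreq values below are hypothetical, and the idf formula is assumed to be Lucene's default, log(numDocs/(docFreq+1)) + 1:

public class PruneSketch {
  public static void main(String[] args) {
    float IDF_THRESHOLD = 6.0f;                     // constants from IndexOptimizer
    float FRACTION = 0.1f;
    int maxDoc = 1000000;                           // hypothetical index size
    int docFreq = 50000;                            // hypothetical term document frequency
    double idf = Math.log(maxDoc / (double) (docFreq + 1)) + 1.0;  // assumed default idf
    boolean termKept = idf <= IDF_THRESHOLD;                       // ~4.0 <= 6.0, kept
    int postingsKept = (int) Math.ceil(docFreq * FRACTION);        // 5000 top-scoring postings
    System.out.println(termKept + " " + postingsKept);
  }
}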

--- NEW FILE: IndexMerger.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.indexer;

import java.util.Date;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.index.IndexWriter;

/** Merges the indexes of several segments into a single index. */
public class IndexMerger {
  public static final String DONE_NAME = "merge.done";

  private File indexDirectory;
  private File[] segments;

  public IndexMerger(File indexDirectory, File[] segments) {
    this.indexDirectory = indexDirectory;
    this.segments = segments;
  }

  private void merge() throws IOException {
    Directory[] dirs = new Directory[segments.length];
    for (int i = 0; i < segments.length; i++)
      dirs[i] = FSDirectory.getDirectory(new File(segments[i],"index"), false);

    String name =
      segments[0].getName() + "_" + segments[segments.length-1].getName();
    IndexWriter writer =
      new IndexWriter(new File(indexDirectory, name), null, true);
    writer.mergeFactor = 50;
    writer.infoStream = System.out;

    writer.addIndexes(dirs);
    writer.close();
  }


  /** Merge the indexes of the named segments into the given directory. */
  public static void main(String[] args) throws Exception {
    File indexDirectory;
      
    String usage = "IndexMerger indexDirectory segments...";

    if (args.length < 2) {
      System.err.println("Usage: " + usage);
      return;
    }

    indexDirectory = new File(args[0]);

    File[] segments = new File[args.length - 1];
    for (int i = 1; i < args.length; i++) {
      segments[i-1] = new File(args[i]);
    }

    IndexMerger merger = new IndexMerger(indexDirectory, segments);

    Date start = new Date();

    merger.merge();

    Date end = new Date();

    System.out.print(end.getTime() - start.getTime());
    System.out.println(" total milliseconds");
  }

}
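
A minimal sketch of what a merge produces: given segment directories 20031201 and 20031215 (hypothetical names), each containing an "index" subdirectory, the result is a single index in <indexDirectory>/20031201_20031215. Driven through main():

public class MergeSketch {
  public static void main(String[] args) throws Exception {
    net.nutch.indexer.IndexMerger.main(new String[] {
      "index",                   // destination indexDirectory
      "segments/20031201",       // hypothetical segment directories,
      "segments/20031215"        // each holding an "index" subdirectory
    });
    // writes the merged Lucene index to index/20031201_20031215
  }
}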

--- NEW FILE: HighFreqTerms.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.indexer;

import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;

import java.io.OutputStreamWriter;

/** Lists the most frequent terms in an index. */
public class HighFreqTerms {
  public static int numTerms = 100;

  private static class TermFreq {
    TermFreq(Term t, int df) {
      term = t;
      docFreq = df;
    }
    int docFreq;
    Term term;
  }

  private static class TermFreqQueue extends PriorityQueue {
    TermFreqQueue(int size) {
      initialize(size);
    }

    protected final boolean lessThan(Object a, Object b) {
      TermFreq termInfoA = (TermFreq)a;
      TermFreq termInfoB = (TermFreq)b;
      return termInfoA.docFreq < termInfoB.docFreq;
    }
  }

  public static void main(String[] args) throws Exception {
    IndexReader reader = null;
    boolean noFreqs = false;
    int count = 100;
    String usage = "HighFreqTerms [-count <n>] [-nofreqs] <index dir>";

    if (args.length == 0) {
      System.err.println(usage);
      System.exit(-1);
    }

    for (int i = 0; i < args.length; i++) {       // parse command line
      if (args[i].equals("-count")) {             // found -count option
        count = Integer.parseInt(args[++i]);
      } else if (args[i].equals("-nofreqs")) {    // found -nofreqs option
        noFreqs = true;
      } else {
        reader = IndexReader.open(args[i]);
      }
    }

    TermFreqQueue tiq = new TermFreqQueue(count);
    TermEnum terms = reader.terms();
      
    int minFreq = 0;
    while (terms.next()) {
      if (terms.docFreq() > minFreq) {
        tiq.put(new TermFreq(terms.term(), terms.docFreq()));
        if (tiq.size() > count) {                 // if tiq overfull
          tiq.pop();                              // remove lowest in tiq
          minFreq = ((TermFreq)tiq.top()).docFreq; // reset minFreq
        }
      }
    }

    OutputStreamWriter out = new OutputStreamWriter(System.out, "UTF-8");
    while (tiq.size() != 0) {
      TermFreq termInfo = (TermFreq)tiq.pop();
      out.write(termInfo.term.toString());
      if (!noFreqs) {
        out.write(" ");
        out.write(Integer.toString(termInfo.docFreq));
      }
      out.write("\n");
    }

    out.flush();
    reader.close();
  }

}
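
The loop above is the usual bounded top-k pattern: insert, and once the queue is overfull pop the lowest and raise the admission threshold. A minimal sketch of the same idea with java.util.PriorityQueue, using made-up document frequencies:

import java.util.PriorityQueue;

public class TopKSketch {
  public static void main(String[] args) {
    int count = 3;                                  // keep the 3 largest
    int[] docFreqs = { 7, 2, 9, 4, 11, 5 };         // hypothetical per-term frequencies
    PriorityQueue<Integer> queue = new PriorityQueue<Integer>();  // min-heap: lowest at head
    int minFreq = 0;
    for (int df : docFreqs) {
      if (df > minFreq) {
        queue.add(df);
        if (queue.size() > count) {                 // queue overfull
          queue.poll();                             // drop the current lowest
          minFreq = queue.peek();                   // raise the admission threshold
        }
      }
    }
    System.out.println(queue);                      // contains 7, 9 and 11
  }
}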



