Juho Mäkinen wrote:
I did some research and I traced the problem to be somewhere inside
HttpRequest of protocol-httpclient.

If you enabled the PDF parser, the version of PDFBox that is currently in SVN is known to be broken - for some PDFs a bug in CMap handling can cause an endless loop. Please download the latest binary from http://www.pdfbox.org/dist , and try again.

I haven't committed the latest PDFBox because it hasn't been released yet. As soon as there is a new release, I'll update the copy in our SVN. Until then, you need to follow the above procedure.

I have also attached a simple tool to create fetchlists from a list of arbitrary URLs. This comes in handy if you want to test various parts of Nutch with arbitrary URLs that do not come from the DB.


--
Best regards,
Andrzej Bialecki     <><
 ___. ___ ___ ___ _ _   __________________________________
[__ || __|__/|__||\/|  Information Retrieval, Semantic Web
___|||__||  \|  ||  |  Embedded Unix, System Integration
http://www.sigram.com  Contact: info at sigram dot com

package org.apache.nutch.tools;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;

import org.apache.nutch.db.Page;
import org.apache.nutch.fs.NutchFileSystem;
import org.apache.nutch.io.ArrayFile;
import org.apache.nutch.pagedb.FetchListEntry;
import org.apache.nutch.segment.SegmentWriter;

/**
 * This simple tool creates a fetchlist out of plain-text URL listing.
 * 
 * @author Andrzej Bialecki &lt;[EMAIL PROTECTED]&gt;
 */
public class FreeFetchlistTool {

  /** Filesystem on which the output directories and the fetchlist are created. */
  private final NutchFileSystem nfs;
  /** Name of the local plain-text file holding one URL per line. */
  private final String list;
  /** Output directory under which the 'fetchlist' subdir is created. */
  private final String outdir;
  /** Whether to create an intermediate segment-named subdirectory first. */
  private final boolean inSegment;
  
  /**
   * Create the tool instance.
   * 
   * @param nfs filesystem for the target fetchlist
   * @param list filename of the URL list, containing one URL per line
   * @param outdir output directory, where the 'fetchlist' subdir will be created
   * @param inSegment if true, a subdirectory named like a normal segment name will
   * be created first inside the outdir, and then a 'fetchlist' subdirectory will be
   * created there. This creates the same structure as {@link FetchListTool} does.
   */
  public FreeFetchlistTool(NutchFileSystem nfs, String list, String outdir, boolean inSegment) {
    this.nfs = nfs;
    this.list = list;
    this.outdir = outdir;
    this.inSegment = inSegment;
  }
  
  /**
   * Write out the fetchlist.
   * <p>
   * The URL list is read as UTF-8 from the local file named by <code>list</code>.
   * Each line is trimmed; lines that are empty after trimming are skipped. Every
   * remaining line becomes one {@link FetchListEntry} wrapping a {@link Page}
   * created with a score of 1.0 and no anchors.
   * 
   * @throws Exception if the URL list cannot be read, or the output directories
   * or the fetchlist cannot be created or written
   */
  public void run() throws Exception {
    BufferedReader br = null;
    ArrayFile.Writer listWriter = null;
    try {
      // Open the input first, so that a missing or unreadable URL list fails
      // before any output directory or fetchlist file has been created.
      br = new BufferedReader(new InputStreamReader(new FileInputStream(list), "UTF-8"));

      File out = new File(outdir);
      nfs.mkdirs(out);
      if (inSegment) {
        out = new File(out, SegmentWriter.getNewSegmentName());
        nfs.mkdirs(out);
      }
      out = new File(out, FetchListEntry.DIR_NAME);
      listWriter = new ArrayFile.Writer(nfs, out.toString(), FetchListEntry.class);

      String line = null;
      while ((line = br.readLine()) != null) {
        String url = line.trim();
        // A blank line carries no URL; writing an entry for it would only
        // produce a useless (empty) fetchlist record.
        if (url.length() == 0) continue;
        FetchListEntry fle = new FetchListEntry(true,
                new Page(url, 1.0f), new String[0]);
        listWriter.append(fle);
      }
    } finally {
      // Release both resources even if reading or writing failed half-way.
      try {
        if (listWriter != null) listWriter.close();
      } finally {
        if (br != null) br.close();
      }
    }
  }
  
  /**
   * Command-line entry point. See {@link #usage()} for the accepted arguments.
   * Prints the usage message and exits with status -1 if the arguments are
   * incomplete or not recognized.
   * 
   * @param args command-line arguments
   * @throws Exception if creating the fetchlist fails
   */
  public static void main(String[] args) throws Exception {
    if (args.length < 4) {
      usage();
      System.exit(-1);
    }
    NutchFileSystem nfs = NutchFileSystem.parseArgs(args, 0);
    String outdir = null;
    String list = null;
    boolean segment = false;
    for (int i = 0; i < args.length; i++) {
      // NOTE(review): null slots are presumably arguments already consumed by
      // NutchFileSystem.parseArgs - confirm against that method.
      if (args[i] == null) continue;
      if (args[i].equals("-o")) {
        if (i + 1 >= args.length) fail("Missing value for argument: " + args[i]);
        outdir = args[++i];
      } else if (args[i].equals("-urllist")) {
        if (i + 1 >= args.length) fail("Missing value for argument: " + args[i]);
        list = args[++i];
      } else if (args[i].equals("-segment")) {
        segment = true;
      } else {
        fail("Unknown argument: " + args[i]);
      }
    }
    // Both options are mandatory; without this check a missing one would only
    // surface later as a NullPointerException inside run().
    if (list == null) fail("Missing required argument: -urllist");
    if (outdir == null) fail("Missing required argument: -o");
    FreeFetchlistTool fftl = new FreeFetchlistTool(nfs, list, outdir, segment);
    fftl.run();
  }

  /**
   * Report a command-line error on stderr, print the usage message and
   * terminate the JVM with status -1.
   * 
   * @param message description of the problem
   */
  private static void fail(String message) {
    System.err.println(message);
    usage();
    System.exit(-1);
  }

  /** Print the command-line synopsis to stderr. */
  private static void usage() {
    System.err.println("FreeFetchlistTool -urllist file_name -o dir_name [-segment]");
    System.err.println("\n\t-urllist file_name\tfile with list of URLs, one per line");
    System.err.println("\t-o dir_name\tname of the output directory.");
    System.err.println("\t\t\tA 'fetchlist' subdir will be created there.");
    System.err.println("\t-segment\t\tcreate a subdir first, which follows normal segment naming.");
  }
}

Reply via email to