Juho Mäkinen wrote:
I did some research and I traced the problem to be somewhere inside HttpRequest of protocol-httpclient.
If you enabled the PDF parser, note that the version of PDFBox currently in SVN is known to be broken: for some PDFs, a bug in CMap handling can cause an endless loop. Please download the latest binary from http://www.pdfbox.org/dist , and try again.
I didn't commit the latest PDFBox because it hasn't been released yet. As soon as there is a new release, I'll update the one in our SVN. Until then, you need to follow the above procedure.
I have also attached a simple tool to create fetchlists based on a list of arbitrary URLs. This comes in handy if you want to test various parts of Nutch with arbitrary URLs that do not come from the DB.
-- Best regards, Andrzej Bialecki <>< ___. ___ ___ ___ _ _ __________________________________ [__ || __|__/|__||\/| Information Retrieval, Semantic Web ___|||__|| \| || | Embedded Unix, System Integration http://www.sigram.com Contact: info at sigram dot com
package org.apache.nutch.tools; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.InputStreamReader; import org.apache.nutch.db.Page; import org.apache.nutch.fs.NutchFileSystem; import org.apache.nutch.io.ArrayFile; import org.apache.nutch.pagedb.FetchListEntry; import org.apache.nutch.segment.SegmentWriter; /** * This simple tool creates a fetchlist out of plain-text URL listing. * * @author Andrzej Bialecki <[EMAIL PROTECTED]> */ public class FreeFetchlistTool { private NutchFileSystem nfs = null; private String list = null; private String outdir = null; private boolean inSegment = false; /** * Create the tool instance. * * @param nfs filesystem for the target fetchlist * @param list filename of the URL list, containing one URL per line * @param outdir output directory, where the 'fetchlist' subdir will be created * @param inSegment if true, a subdirectory named like a normal segment name will * be created first inside the outdir, and then a 'fetchlist' subdirectory will be * created there. This creates the same structure as [EMAIL PROTECTED] FetchListTool} does. */ public FreeFetchlistTool(NutchFileSystem nfs, String list, String outdir, boolean inSegment) { this.nfs = nfs; this.list = list; this.outdir = outdir; this.inSegment = inSegment; } /** * Write out the fetchlist. 
* * @throws Exception */ public void run() throws Exception { File out = new File(outdir); nfs.mkdirs(out); if (inSegment) { out = new File(out, SegmentWriter.getNewSegmentName()); nfs.mkdirs(out); } out = new File(out, FetchListEntry.DIR_NAME); ArrayFile.Writer listWriter = new ArrayFile.Writer(nfs, out.toString(), FetchListEntry.class); BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(list), "UTF-8")); String line = null; while ((line = br.readLine()) != null) { FetchListEntry fle = new FetchListEntry(true, new Page(line, 1.0f), new String[0]); listWriter.append(fle); } listWriter.close(); br.close(); } public static void main(String[] args) throws Exception { if (args.length < 4) { usage(); System.exit(-1); } NutchFileSystem nfs = NutchFileSystem.parseArgs(args, 0); String outdir = null; String list = null; boolean segment = false; for (int i = 0; i < args.length; i++) { if (args[i] == null) continue; if (args[i].equals("-o")) { outdir = args[++i]; } else if (args[i].equals("-urllist")) { list = args[++i]; } else if (args[i].equals("-segment")) { segment = true; } else { System.err.println("Unknown argument: " + args[i]); usage(); System.exit(-1); } } FreeFetchlistTool fftl = new FreeFetchlistTool(nfs, list, outdir, segment); fftl.run(); } private static void usage() { System.err.println("FreeFetchlistTool -urllist file_name -o dir_name [-segment]"); System.err.println("\n\t-urllist file_name\tfile with list of URLs, one per line"); System.err.println("\t-o dir_name\tname of the output directory."); System.err.println("\t\t\tA 'fetchlist' subdir will be created there."); System.err.println("\t-segment\t\tcreate a subdir first, which follows normal segment naming."); } }