Juho Mäkinen wrote:
I did some research and I traced the problem to be somewhere inside
HttpRequest of protocol-httpclient.
If you enabled the PDF parser, the version of PDFBox that is currently
in SVN is known to be broken - for some PDFs a bug in CMap handling can
cause an endless loop. Please download the latest binary from
http://www.pdfbox.org/dist , and try again.
I didn't commit the latest PDFBox because it hasn't been released yet. As soon
as there is a new release I'll update the one in our SVN. Until then you
need to follow the above procedure.
I have also attached a simple tool to create fetchlists based on a list of
arbitrary URLs. This comes in handy if you want to test various parts of
Nutch with arbitrary URLs that do not come from the DB.
--
Best regards,
Andrzej Bialecki <><
___. ___ ___ ___ _ _ __________________________________
[__ || __|__/|__||\/| Information Retrieval, Semantic Web
___|||__|| \| || | Embedded Unix, System Integration
http://www.sigram.com Contact: info at sigram dot com
package org.apache.nutch.tools;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import org.apache.nutch.db.Page;
import org.apache.nutch.fs.NutchFileSystem;
import org.apache.nutch.io.ArrayFile;
import org.apache.nutch.pagedb.FetchListEntry;
import org.apache.nutch.segment.SegmentWriter;
/**
 * This simple tool creates a fetchlist out of a plain-text URL listing.
 *
 * <p>The listing is read as UTF-8 text with one URL per line. Each URL is
 * wrapped in a {@link FetchListEntry} and appended to an {@link ArrayFile}
 * written under the output directory.</p>
 *
 * @author Andrzej Bialecki <[EMAIL PROTECTED]>
 */
public class FreeFetchlistTool {
  private NutchFileSystem nfs = null;
  private String list = null;
  private String outdir = null;
  private boolean inSegment = false;

  /**
   * Create the tool instance.
   *
   * @param nfs filesystem for the target fetchlist
   * @param list filename of the URL list, containing one URL per line
   * @param outdir output directory, where the 'fetchlist' subdir will be created
   * @param inSegment if true, a subdirectory named like a normal segment name will
   * be created first inside the outdir, and then a 'fetchlist' subdirectory will be
   * created there. This creates the same structure as {@link FetchListTool} does.
   */
  public FreeFetchlistTool(NutchFileSystem nfs, String list, String outdir, boolean inSegment) {
    this.nfs = nfs;
    this.list = list;
    this.outdir = outdir;
    this.inSegment = inSegment;
  }

  /**
   * Write out the fetchlist.
   *
   * <p>Every non-blank line of the URL list becomes one fetchlist entry with
   * the fetch flag set, a page score of 1.0 and no anchors. Lines are trimmed
   * first; blank lines are skipped because they cannot name a URL.</p>
   *
   * <p>Both the URL list reader and the fetchlist writer are closed even when
   * reading or writing fails part-way through.</p>
   *
   * @throws Exception if the URL list cannot be read or the fetchlist cannot
   * be written
   */
  public void run() throws Exception {
    File out = new File(outdir);
    nfs.mkdirs(out);
    if (inSegment) {
      out = new File(out, SegmentWriter.getNewSegmentName());
      nfs.mkdirs(out);
    }
    out = new File(out, FetchListEntry.DIR_NAME);

    // Open the input before creating the writer, so that a missing or
    // unreadable URL list does not leave an open, abandoned writer behind.
    BufferedReader br = new BufferedReader(
        new InputStreamReader(new FileInputStream(list), "UTF-8"));
    try {
      ArrayFile.Writer listWriter =
          new ArrayFile.Writer(nfs, out.toString(), FetchListEntry.class);
      try {
        String line = null;
        while ((line = br.readLine()) != null) {
          String url = line.trim();
          if (url.length() == 0) {
            continue; // tolerate blank lines, e.g. a trailing newline
          }
          FetchListEntry fle = new FetchListEntry(true,
              new Page(url, 1.0f), new String[0]);
          listWriter.append(fle);
        }
      } finally {
        listWriter.close();
      }
    } finally {
      br.close();
    }
  }

  /**
   * Command-line entry point. Prints the usage text and exits with status -1
   * when the arguments are incomplete or not recognized.
   *
   * @param args command-line arguments, see {@link #usage()}
   * @throws Exception if writing the fetchlist fails
   */
  public static void main(String[] args) throws Exception {
    if (args.length < 4) {
      usage();
      System.exit(-1);
    }
    // NOTE(review): parseArgs presumably nulls out the filesystem-related
    // arguments it consumes, which is why null entries are skipped below.
    NutchFileSystem nfs = NutchFileSystem.parseArgs(args, 0);
    String outdir = null;
    String list = null;
    boolean segment = false;
    for (int i = 0; i < args.length; i++) {
      if (args[i] == null) continue;
      if (args[i].equals("-o") || args[i].equals("-urllist")) {
        String option = args[i];
        // Guard against an option given as the very last argument.
        if (i + 1 >= args.length) {
          System.err.println("Missing value for argument: " + option);
          usage();
          System.exit(-1);
        }
        String value = args[++i];
        if (option.equals("-o")) {
          outdir = value;
        } else {
          list = value;
        }
      } else if (args[i].equals("-segment")) {
        segment = true;
      } else {
        System.err.println("Unknown argument: " + args[i]);
        usage();
        System.exit(-1);
      }
    }
    // Both options are mandatory; fail with the usage text instead of a
    // NullPointerException further down.
    if (list == null || outdir == null) {
      System.err.println("Both -urllist and -o must be specified.");
      usage();
      System.exit(-1);
    }
    FreeFetchlistTool fftl = new FreeFetchlistTool(nfs, list, outdir, segment);
    fftl.run();
  }

  /** Print the command-line synopsis to standard error. */
  private static void usage() {
    System.err.println("FreeFetchlistTool -urllist file_name -o dir_name [-segment]");
    System.err.println("\n\t-urllist file_name\tfile with list of URLs, one per line");
    System.err.println("\t-o dir_name\tname of the output directory.");
    System.err.println("\t\t\tA 'fetchlist' subdir will be created there.");
    System.err.println("\t-segment\t\tcreate a subdir first, which follows normal segment naming.");
  }
}