Hi,
I have experienced problems when saving documents to file. The
domain I was trying to crawl was "http://www.dn.no/finans/". This is
what I ended up with:
///////////////////// START //////////////////////////
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.droids.handle;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import org.apache.droids.api.ContentEntity;
import org.apache.droids.api.Handler;
/**
 * Handler which writes the content stream of a resource to the file system.
 * <p>
 * Before using make sure you have set the export directory
 * {@link #setOutputDir(String)} and whether you want to use the host as
 * prefix {@link #setIncludeHost(boolean)}.
 * <p>
 * The target file name is derived from the URI path. A path that is empty,
 * ends in "/", or ends in a plain word segment (no extension) is stored as
 * {@code <ROOT>.html} inside the directory of that name, so that documents
 * further down the hierarchy can be stored next to it.
 *
 * @version 1.0
 *
 */
public class Save extends WriterHandler implements Handler {

  /**
   * File name used for documents whose URI path denotes a directory.
   */
  // NOTE(review): '<' and '>' are reserved characters in Windows file names;
  // confirm the target platforms before relying on this name.
  private static final String ROOT_DOCUMENT = "<ROOT>.html";

  private String outputDir = null;
  private boolean includeHost = false;
  protected int bufferSize = 8192;

  /**
   * Saves the content of the given entity to a file derived from the URI.
   *
   * @param uri
   *          the URI the content was obtained from; determines the file name
   * @param entity
   *          the entity whose content stream is written to disk
   * @throws IOException
   *           if the target file cannot be determined, created or written
   */
  public void handle(URI uri, ContentEntity entity) throws IOException {
    InputStream instream = entity.obtainContent();
    try {
      writeOutput(uri, instream);
    } finally {
      instream.close();
    }
  }

  /**
   * Copies the stream to the file that corresponds to the given URI.
   *
   * @param uri
   *          the URI used to compute the target file
   * @param stream
   *          the content to save; it is read to its end but not closed here
   * @throws IOException
   *           if the target cannot be resolved, created or written
   */
  private void writeOutput(URI uri, InputStream stream) throws IOException {
    File cache = resolveTarget(uri);
    log.info("Trying to save " + uri + " to " + cache);
    createParentDirectories(cache);
    // FileOutputStream creates the file itself; only the parent directories
    // have to exist beforehand.
    OutputStream output = new BufferedOutputStream(new FileOutputStream(cache));
    try {
      byte[] buffer = new byte[bufferSize];
      int length;
      while ((length = stream.read(buffer)) > -1) {
        output.write(buffer, 0, length);
      }
      output.flush();
    } finally {
      // Close even when reading or writing fails, so the file handle is not leaked.
      output.close();
    }
  }

  /**
   * Maps a URI to the file it should be saved to below the output directory.
   *
   * @param uri
   *          the URI to map
   * @return the canonical target file, always located inside the output
   *         directory
   * @throws IOException
   *           if no output directory is set, the URI lacks a path (or a host
   *           while {@link #isIncludeHost()} is true), or the resulting file
   *           would lie outside the output directory
   */
  private File resolveTarget(URI uri) throws IOException {
    if (outputDir == null) {
      throw new IOException(
          "No output directory set; call setOutputDir(String) before saving " + uri);
    }
    String path = uri.getPath();
    if (path == null) {
      // Opaque URIs (e.g. "mailto:...") have no path component.
      throw new IOException("Cannot derive a file name, URI has no path: " + uri);
    }
    if (path.length() == 0) {
      // "http://host" without a trailing slash denotes the root document.
      path = "/";
    }
    if (path.endsWith("/")) {
      path += ROOT_DOCUMENT;
    } else if (path.matches(".*/\\w+")) {
      // An extension-less last segment is treated as a directory, because
      // child documents may have to be stored below it.
      path += "/" + ROOT_DOCUMENT;
    }

    // Only strip a separator that is really there.
    String relative = path.startsWith("/") ? path.substring(1) : path;
    if (includeHost) {
      String host = uri.getHost();
      if (host == null) {
        throw new IOException("Cannot use the host as prefix, URI has no host: " + uri);
      }
      relative = host + "/" + relative;
    }

    // Resolve through File so a missing trailing separator on outputDir does
    // not glue the directory name and the file name together.
    File root = new File(outputDir).getCanonicalFile();
    File target = new File(root, relative).getCanonicalFile();
    // The path comes from crawled content: refuse ".." escapes.
    if (!isInside(root, target)) {
      throw new IOException("Refusing to save " + uri + " outside of " + root);
    }
    return target;
  }

  /**
   * Tells whether the candidate lies somewhere below the given directory.
   * Both arguments are expected to be canonical files.
   */
  private static boolean isInside(File root, File candidate) {
    for (File dir = candidate.getParentFile(); dir != null; dir = dir.getParentFile()) {
      if (dir.equals(root)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Makes sure the directory that will contain the given file exists.
   *
   * @param cache
   *          the file about to be written
   * @throws IOException
   *           if the parent directory does not exist and cannot be created
   */
  private static void createParentDirectories(File cache) throws IOException {
    File parent = cache.getParentFile();
    // mkdirs() also returns false when the directory already exists (or was
    // just created by another thread), hence the isDirectory() re-check.
    if (parent != null && !parent.mkdirs() && !parent.isDirectory()) {
      throw new IOException("Could not create directory " + parent);
    }
  }

  /**
   * Get the directory where we want to save the stream.
   *
   * @return directory where we want to save the stream.
   */
  public String getOutputDir() {
    return outputDir;
  }

  /**
   * Set the directory where we want to save the stream.
   *
   * @param outputDir
   *          the directory where we want to save the stream.
   */
  public void setOutputDir(String outputDir) {
    this.outputDir = outputDir;
  }

  /**
   * Do we want to prefix the export dir with the host name.
   *
   * @return true if we want to use the prefix; false otherwise
   */
  public boolean isIncludeHost() {
    return includeHost;
  }

  /**
   * Do we want to prefix the export dir with the host name.
   *
   * @param includeHost
   *          true if we want to use the prefix; false otherwise
   */
  public void setIncludeHost(boolean includeHost) {
    this.includeHost = includeHost;
  }
}
/////////////////////// END //////////////////////////
What I did was pass in the URI (which makes the handler thread-safe) and
handle documents that are part of a document hierarchy by appending
<ROOT>.html. I'm not sure it is 100% robust, but it is better.
Keep up the good work!
BR
Stein K