Revision: 20268
http://sourceforge.net/p/gate/code/20268
Author: ian_roberts
Date: 2017-09-30 22:17:03 +0000 (Sat, 30 Sep 2017)
Log Message:
-----------
Fixing some bugs with WARC handling
- it turns out some WARC files (presumably non-Heritrix ones) don't include the
space in the response header mime type, which threw off our enumerator
- the WARC format stores the bytes from the server as-is - it does *not* decode
"Transfer-Encoding: chunked", so we have to do that ourselves
Modified Paths:
--------------
gcp/trunk/src/gate/cloud/io/arc/ArchiveDocumentEnumerator.java
gcp/trunk/src/gate/cloud/io/arc/ArchiveInputHandler.java
gcp/trunk/src/gate/cloud/io/arc/WARCDocumentEnumerator.java
Modified: gcp/trunk/src/gate/cloud/io/arc/ArchiveDocumentEnumerator.java
===================================================================
--- gcp/trunk/src/gate/cloud/io/arc/ArchiveDocumentEnumerator.java 2017-09-30 13:34:59 UTC (rev 20267)
+++ gcp/trunk/src/gate/cloud/io/arc/ArchiveDocumentEnumerator.java 2017-09-30 22:17:03 UTC (rev 20268)
@@ -165,6 +165,7 @@
moveToNext();
}
else {
+ logger.warn("No entries in archive");
reader.close();
}
}
@@ -187,10 +188,12 @@
}
protected void moveToNext() {
+ logger.debug("moveToNext: archiveIterator = " + archiveIterator);
while(archiveIterator != null && archiveIterator.hasNext()) {
try {
ArchiveRecord record = nextRecord(archiveIterator);
if(record == null) {
+ logger.debug("Got a null record from iterator");
// skip this record
continue;
}
@@ -198,6 +201,7 @@
long recordLength = record.getHeader().getLength();
int recordBodyLength = (int)(recordLength - recordContentBegin);
// ignore zero-length records
+ logger.debug("Found archive record total length: " + recordLength + ",
content begin: " + recordContentBegin + ", body length: " + recordBodyLength);
if(recordBodyLength > 0) {
String statusCode = statusCode(record);
if(statusCode == null) {
@@ -222,9 +226,16 @@
attrs.put(ArchiveInputHandler.RECORD_POSITION_ATTR,
Long.toString(inputSequence));
next = new DocumentID(record.getHeader().getUrl(), attrs);
+ logger.debug("Found valid ID " + next);
return;
+ } else {
+ logger.debug("Not an interesting mime type");
}
+ } else {
+ logger.debug("Status code " + statusCode + " matched by
excludes");
}
+ } else {
+ logger.debug("Status code " + statusCode + " not matched by
includes");
}
}
} finally {
Modified: gcp/trunk/src/gate/cloud/io/arc/ArchiveInputHandler.java
===================================================================
--- gcp/trunk/src/gate/cloud/io/arc/ArchiveInputHandler.java 2017-09-30 13:34:59 UTC (rev 20267)
+++ gcp/trunk/src/gate/cloud/io/arc/ArchiveInputHandler.java 2017-09-30 22:17:03 UTC (rev 20268)
@@ -25,6 +25,7 @@
import gate.cloud.util.ByteArrayURLStreamHandler;
import gate.util.GateException;
+import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
@@ -41,6 +42,7 @@
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.util.DateParseException;
import org.apache.commons.httpclient.util.DateUtil;
+import org.apache.commons.httpclient.ChunkedInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.apache.log4j.Logger;
@@ -79,6 +81,8 @@
private static final String HTTP_HEADER_PREFIX = "http_header_";
private static final String HTTP_CONTENT_TYPE_HEADER_NAME = "Content-Type";
+
+ private static final String HTTP_TRANSFER_ENCODING_HEADER_NAME =
"Transfer-Encoding";
/**
* Name for an attribute used when generating {@link DocumentID} values for
@@ -260,6 +264,7 @@
String encoding = null;
Header[] httpHeaders = httpHeaders(record);
+ boolean isChunked = false;
Pattern charsetPattern =
Pattern.compile("charset=(['\"]?)([a-zA-Z0-9_-]+)\\1");
for(Header aHeader : httpHeaders) {
if(aHeader.getName().equalsIgnoreCase(HTTP_CONTENT_TYPE_HEADER_NAME)){
@@ -266,11 +271,23 @@
Matcher m = charsetPattern.matcher(aHeader.getValue());
if(m.find()) {
encoding = m.group(2);
- break;
}
+ } else
if(aHeader.getName().equalsIgnoreCase(HTTP_TRANSFER_ENCODING_HEADER_NAME)) {
+ if("chunked".equalsIgnoreCase(aHeader.getValue())) {
+ isChunked = true;
+ }
}
}
if(encoding == null) encoding = defaultEncoding;
+
+ if(isChunked) {
+ // de-chunk the stream
+ ChunkedInputStream chunkIn = new ChunkedInputStream(new
ByteArrayInputStream(content));
+ baos = new ByteArrayOutputStream();
+ IOUtils.copy(chunkIn, baos);
+ chunkIn.close();
+ content = baos.toByteArray();
+ }
URL docUrl = new URL(null, header.getUrl(), new
ByteArrayURLStreamHandler(content, httpHeaders));
Modified: gcp/trunk/src/gate/cloud/io/arc/WARCDocumentEnumerator.java
===================================================================
--- gcp/trunk/src/gate/cloud/io/arc/WARCDocumentEnumerator.java 2017-09-30 13:34:59 UTC (rev 20267)
+++ gcp/trunk/src/gate/cloud/io/arc/WARCDocumentEnumerator.java 2017-09-30 22:17:03 UTC (rev 20268)
@@ -13,6 +13,7 @@
import java.io.IOException;
import java.util.Iterator;
+import java.util.regex.Pattern;
import org.apache.commons.httpclient.Header;
import org.apache.log4j.Logger;
@@ -29,6 +30,9 @@
private static Logger logger =
Logger.getLogger(WARCDocumentEnumerator.class);
+ // some non-Heritrix-produced WARC files have slightly different spacing in
the mime type
+ protected static final Pattern HTTP_RESPONSE_MIMETYPE_PATTERN =
Pattern.compile("(?i)application/http;\\s*msgtype=response");
+
@Override
protected ArchiveReader createReader() throws IOException {
return WARCReaderFactory.get(srcFile);
@@ -37,7 +41,8 @@
@Override
protected ArchiveRecord nextRecord(Iterator<ArchiveRecord> it) {
WARCRecord record = (WARCRecord)it.next();
-
if(!WARCRecord.HTTP_RESPONSE_MIMETYPE.equals(record.getHeader().getMimetype()))
{
+
if(!HTTP_RESPONSE_MIMETYPE_PATTERN.matcher(record.getHeader().getMimetype()).matches())
{
+ logger.debug("WARC record mimetype was " +
record.getHeader().getMimetype() + ", ignored");
return null;
}
try {
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs