Revision: 20268 http://sourceforge.net/p/gate/code/20268 Author: ian_roberts Date: 2017-09-30 22:17:03 +0000 (Sat, 30 Sep 2017) Log Message: ----------- Fixing some bugs with WARC handling
- it turns out some WARC files (presumably non-Heritrix ones) don't include the space in the response header mime type, which made our enumerator skip those records - the WARC format stores the bytes from the server as-is - it does *not* decode "Transfer-Encoding: chunked", so we have to do that ourselves Modified Paths: -------------- gcp/trunk/src/gate/cloud/io/arc/ArchiveDocumentEnumerator.java gcp/trunk/src/gate/cloud/io/arc/ArchiveInputHandler.java gcp/trunk/src/gate/cloud/io/arc/WARCDocumentEnumerator.java Modified: gcp/trunk/src/gate/cloud/io/arc/ArchiveDocumentEnumerator.java =================================================================== --- gcp/trunk/src/gate/cloud/io/arc/ArchiveDocumentEnumerator.java 2017-09-30 13:34:59 UTC (rev 20267) +++ gcp/trunk/src/gate/cloud/io/arc/ArchiveDocumentEnumerator.java 2017-09-30 22:17:03 UTC (rev 20268) @@ -165,6 +165,7 @@ moveToNext(); } else { + logger.warn("No entries in archive"); reader.close(); } } @@ -187,10 +188,12 @@ } protected void moveToNext() { + logger.debug("moveToNext: archiveIterator = " + archiveIterator); while(archiveIterator != null && archiveIterator.hasNext()) { try { ArchiveRecord record = nextRecord(archiveIterator); if(record == null) { + logger.debug("Got a null record from iterator"); // skip this record continue; } @@ -198,6 +201,7 @@ long recordLength = record.getHeader().getLength(); int recordBodyLength = (int)(recordLength - recordContentBegin); // ignore zero-length records + logger.debug("Found archive record total length: " + recordLength + ", content begin: " + recordContentBegin + ", body length: " + recordBodyLength); if(recordBodyLength > 0) { String statusCode = statusCode(record); if(statusCode == null) { @@ -222,9 +226,16 @@ attrs.put(ArchiveInputHandler.RECORD_POSITION_ATTR, Long.toString(inputSequence)); next = new DocumentID(record.getHeader().getUrl(), attrs); + logger.debug("Found valid ID " + next); return; + } else { + logger.debug("Not an interesting mime type"); } + } else { + 
logger.debug("Status code " + statusCode + " matched by excludes"); } + } else { + logger.debug("Status code " + statusCode + " not matched by includes"); } } } finally { Modified: gcp/trunk/src/gate/cloud/io/arc/ArchiveInputHandler.java =================================================================== --- gcp/trunk/src/gate/cloud/io/arc/ArchiveInputHandler.java 2017-09-30 13:34:59 UTC (rev 20267) +++ gcp/trunk/src/gate/cloud/io/arc/ArchiveInputHandler.java 2017-09-30 22:17:03 UTC (rev 20268) @@ -25,6 +25,7 @@ import gate.cloud.util.ByteArrayURLStreamHandler; import gate.util.GateException; +import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; @@ -41,6 +42,7 @@ import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.util.DateParseException; import org.apache.commons.httpclient.util.DateUtil; +import org.apache.commons.httpclient.ChunkedInputStream; import org.apache.commons.io.IOUtils; import org.apache.commons.io.output.ByteArrayOutputStream; import org.apache.log4j.Logger; @@ -79,6 +81,8 @@ private static final String HTTP_HEADER_PREFIX = "http_header_"; private static final String HTTP_CONTENT_TYPE_HEADER_NAME = "Content-Type"; + + private static final String HTTP_TRANSFER_ENCODING_HEADER_NAME = "Transfer-Encoding"; /** * Name for an attribute used when generating {@link DocumentID} values for @@ -260,6 +264,7 @@ String encoding = null; Header[] httpHeaders = httpHeaders(record); + boolean isChunked = false; Pattern charsetPattern = Pattern.compile("charset=(['\"]?)([a-zA-Z0-9_-]+)\\1"); for(Header aHeader : httpHeaders) { if(aHeader.getName().equalsIgnoreCase(HTTP_CONTENT_TYPE_HEADER_NAME)){ @@ -266,11 +271,23 @@ Matcher m = charsetPattern.matcher(aHeader.getValue()); if(m.find()) { encoding = m.group(2); - break; } + } else if(aHeader.getName().equalsIgnoreCase(HTTP_TRANSFER_ENCODING_HEADER_NAME)) { + if("chunked".equalsIgnoreCase(aHeader.getValue())) { + isChunked = true; 
+ } } } if(encoding == null) encoding = defaultEncoding; + + if(isChunked) { + // de-chunk the stream + ChunkedInputStream chunkIn = new ChunkedInputStream(new ByteArrayInputStream(content)); + baos = new ByteArrayOutputStream(); + IOUtils.copy(chunkIn, baos); + chunkIn.close(); + content = baos.toByteArray(); + } URL docUrl = new URL(null, header.getUrl(), new ByteArrayURLStreamHandler(content, httpHeaders)); Modified: gcp/trunk/src/gate/cloud/io/arc/WARCDocumentEnumerator.java =================================================================== --- gcp/trunk/src/gate/cloud/io/arc/WARCDocumentEnumerator.java 2017-09-30 13:34:59 UTC (rev 20267) +++ gcp/trunk/src/gate/cloud/io/arc/WARCDocumentEnumerator.java 2017-09-30 22:17:03 UTC (rev 20268) @@ -13,6 +13,7 @@ import java.io.IOException; import java.util.Iterator; +import java.util.regex.Pattern; import org.apache.commons.httpclient.Header; import org.apache.log4j.Logger; @@ -29,6 +30,9 @@ private static Logger logger = Logger.getLogger(WARCDocumentEnumerator.class); + // some non-Heritrix-produced WARC files have slightly different spacing in the mime type + protected static final Pattern HTTP_RESPONSE_MIMETYPE_PATTERN = Pattern.compile("(?i)application/http;\\s*msgtype=response"); + @Override protected ArchiveReader createReader() throws IOException { return WARCReaderFactory.get(srcFile); @@ -37,7 +41,8 @@ @Override protected ArchiveRecord nextRecord(Iterator<ArchiveRecord> it) { WARCRecord record = (WARCRecord)it.next(); - if(!WARCRecord.HTTP_RESPONSE_MIMETYPE.equals(record.getHeader().getMimetype())) { + if(!HTTP_RESPONSE_MIMETYPE_PATTERN.matcher(record.getHeader().getMimetype()).matches()) { + logger.debug("WARC record mimetype was " + record.getHeader().getMimetype() + ", ignored"); return null; } try { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. 
------------------------------------------------------------------------------ Check out the vibrant tech community on one of the world's most engaging tech sites, Slashdot.org! http://sdm.link/slashdot _______________________________________________ GATE-cvs mailing list GATE-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/gate-cvs