Revision: 20268
          http://sourceforge.net/p/gate/code/20268
Author:   ian_roberts
Date:     2017-09-30 22:17:03 +0000 (Sat, 30 Sep 2017)
Log Message:
-----------
Fixing some bugs with WARC handling

- it turns out some WARC files (presumably non-Heritrix ones) don't include the 
space in the response header MIME type, which threw off our enumerator
- the WARC format stores the bytes from the server as-is - it does *not* decode 
"Transfer-Encoding: chunked", so we have to do that ourselves

Modified Paths:
--------------
    gcp/trunk/src/gate/cloud/io/arc/ArchiveDocumentEnumerator.java
    gcp/trunk/src/gate/cloud/io/arc/ArchiveInputHandler.java
    gcp/trunk/src/gate/cloud/io/arc/WARCDocumentEnumerator.java

Modified: gcp/trunk/src/gate/cloud/io/arc/ArchiveDocumentEnumerator.java
===================================================================
--- gcp/trunk/src/gate/cloud/io/arc/ArchiveDocumentEnumerator.java      
2017-09-30 13:34:59 UTC (rev 20267)
+++ gcp/trunk/src/gate/cloud/io/arc/ArchiveDocumentEnumerator.java      
2017-09-30 22:17:03 UTC (rev 20268)
@@ -165,6 +165,7 @@
       moveToNext();
     }
     else {
+      logger.warn("No entries in archive");
       reader.close();
     }
   }
@@ -187,10 +188,12 @@
   }
   
   protected void moveToNext() {
+    logger.debug("moveToNext: archiveIterator = " + archiveIterator);
     while(archiveIterator != null && archiveIterator.hasNext()) {
       try {
         ArchiveRecord record = nextRecord(archiveIterator);
         if(record == null) {
+          logger.debug("Got a null record from iterator");
           // skip this record
           continue;
         }
@@ -198,6 +201,7 @@
         long recordLength = record.getHeader().getLength();
         int recordBodyLength = (int)(recordLength - recordContentBegin);
         // ignore zero-length records
+        logger.debug("Found archive record total length: " + recordLength + ", 
content begin: " + recordContentBegin + ", body length: " + recordBodyLength);
         if(recordBodyLength > 0) {
           String statusCode = statusCode(record);
           if(statusCode == null) {
@@ -222,9 +226,16 @@
                 attrs.put(ArchiveInputHandler.RECORD_POSITION_ATTR, 
                   Long.toString(inputSequence));
                 next = new DocumentID(record.getHeader().getUrl(), attrs);
+                logger.debug("Found valid ID " + next);
                 return;
+              } else {
+                logger.debug("Not an interesting mime type");
               }
+            } else {
+              logger.debug("Status code " + statusCode + " matched by 
excludes");
             }
+          } else {
+            logger.debug("Status code " + statusCode + " not matched by 
includes");
           }
         }
       } finally {

Modified: gcp/trunk/src/gate/cloud/io/arc/ArchiveInputHandler.java
===================================================================
--- gcp/trunk/src/gate/cloud/io/arc/ArchiveInputHandler.java    2017-09-30 
13:34:59 UTC (rev 20267)
+++ gcp/trunk/src/gate/cloud/io/arc/ArchiveInputHandler.java    2017-09-30 
22:17:03 UTC (rev 20268)
@@ -25,6 +25,7 @@
 import gate.cloud.util.ByteArrayURLStreamHandler;
 import gate.util.GateException;
 
+import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
@@ -41,6 +42,7 @@
 import org.apache.commons.httpclient.Header;
 import org.apache.commons.httpclient.util.DateParseException;
 import org.apache.commons.httpclient.util.DateUtil;
+import org.apache.commons.httpclient.ChunkedInputStream;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.output.ByteArrayOutputStream;
 import org.apache.log4j.Logger;
@@ -79,6 +81,8 @@
   private static final String HTTP_HEADER_PREFIX = "http_header_";
   
   private static final String HTTP_CONTENT_TYPE_HEADER_NAME = "Content-Type";
+
+  private static final String HTTP_TRANSFER_ENCODING_HEADER_NAME = 
"Transfer-Encoding";
   
   /**
    * Name for an attribute used when generating {@link DocumentID} values for
@@ -260,6 +264,7 @@
       
       String encoding = null;
       Header[] httpHeaders = httpHeaders(record);
+      boolean isChunked = false;
       Pattern charsetPattern = 
Pattern.compile("charset=(['\"]?)([a-zA-Z0-9_-]+)\\1");
       for(Header aHeader : httpHeaders) {
         if(aHeader.getName().equalsIgnoreCase(HTTP_CONTENT_TYPE_HEADER_NAME)){
@@ -266,11 +271,23 @@
           Matcher m = charsetPattern.matcher(aHeader.getValue());
           if(m.find()) {
             encoding = m.group(2);
-            break;
           }
+        } else 
if(aHeader.getName().equalsIgnoreCase(HTTP_TRANSFER_ENCODING_HEADER_NAME)) {
+          if("chunked".equalsIgnoreCase(aHeader.getValue())) {
+            isChunked = true;
+          }
         }
       }
       if(encoding == null) encoding = defaultEncoding;
+
+      if(isChunked) {
+        // de-chunk the stream
+        ChunkedInputStream chunkIn = new ChunkedInputStream(new 
ByteArrayInputStream(content));
+        baos = new ByteArrayOutputStream();
+        IOUtils.copy(chunkIn, baos);
+        chunkIn.close();
+        content = baos.toByteArray();
+      }
       
       URL docUrl = new URL(null, header.getUrl(), new 
ByteArrayURLStreamHandler(content, httpHeaders));
       

Modified: gcp/trunk/src/gate/cloud/io/arc/WARCDocumentEnumerator.java
===================================================================
--- gcp/trunk/src/gate/cloud/io/arc/WARCDocumentEnumerator.java 2017-09-30 
13:34:59 UTC (rev 20267)
+++ gcp/trunk/src/gate/cloud/io/arc/WARCDocumentEnumerator.java 2017-09-30 
22:17:03 UTC (rev 20268)
@@ -13,6 +13,7 @@
 
 import java.io.IOException;
 import java.util.Iterator;
+import java.util.regex.Pattern;
 
 import org.apache.commons.httpclient.Header;
 import org.apache.log4j.Logger;
@@ -29,6 +30,9 @@
   
   private static Logger logger = 
Logger.getLogger(WARCDocumentEnumerator.class);
 
+  // some non-Heritrix-produced WARC files have slightly different spacing in 
the mime type
+  protected static final Pattern HTTP_RESPONSE_MIMETYPE_PATTERN = 
Pattern.compile("(?i)application/http;\\s*msgtype=response");
+
   @Override
   protected ArchiveReader createReader() throws IOException {
     return WARCReaderFactory.get(srcFile);
@@ -37,7 +41,8 @@
   @Override
   protected ArchiveRecord nextRecord(Iterator<ArchiveRecord> it) {
     WARCRecord record = (WARCRecord)it.next();
-    
if(!WARCRecord.HTTP_RESPONSE_MIMETYPE.equals(record.getHeader().getMimetype())) 
{
+    
if(!HTTP_RESPONSE_MIMETYPE_PATTERN.matcher(record.getHeader().getMimetype()).matches())
 {
+      logger.debug("WARC record mimetype was " + 
record.getHeader().getMimetype() + ", ignored");
       return null;
     }
     try {

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
_______________________________________________
GATE-cvs mailing list
GATE-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to