Hi,

There's a somewhat serious issue in the WARC-generating code: on some platforms (presumably the ones where off_t is not a 64-bit type) the Content-Length header at the top of each WARC record contains an incorrect value. On these platforms it is sometimes 0, sometimes 1, but never the correct length. This makes the whole WARC file unreadable.

The code works fine on many platforms, but it is apparently a problem on some PowerPC and ARM systems, and maybe other systems as well.

Existing WARC files with this problem can be repaired by replacing the value of the Content-Length header with the correct value, for each WARC record in the file. The content of the WARC records is there, it's just the Content-Length header that is wrong.

The attached patch fixes the problem in warc.c (with the matching type changes in warc.h and http.c). It replaces off_t with wgint and uses the number_to_static_string function from utils.c to format the length.

Regards,

Gijs
commit 66c0595f5440b36afb7307d4cab3d6430254183b
Author: Gijs van Tulder <gvtul...@gmail.com>
Date:   Mon Nov 12 22:03:30 2012 +0100

    Fix for invalid WARC Content-Length header on some platforms.

diff --git a/src/ChangeLog b/src/ChangeLog
index ec78fe8..3901d94 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,10 @@
+2012-11-12  Gijs van Tulder  <gvtul...@gmail.com>
+
+	* warc.c: Fix for invalid Content-Length WARC header on platforms
+	where off_t is less than 64 bits wide.
+	* warc.h: Likewise: Use wgint instead of off_t.
+	* http.c: Likewise.
+
 2012-08-29  Rohit Mathulla <rohit_mathu...@yahoo.com> (tiny change)
 
 	* html-url.c (get_urls_file): Convert shorthand URLs.
diff --git a/src/http.c b/src/http.c
index 5888474..52cbe87 100644
--- a/src/http.c
+++ b/src/http.c
@@ -1712,7 +1712,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
   char warc_timestamp_str [21];
   char warc_request_uuid [48];
   ip_address *warc_ip = NULL;
-  off_t warc_payload_offset = -1;
+  wgint warc_payload_offset = -1;
 
   /* Whether this connection will be kept alive after the HTTP request
      is done. */
diff --git a/src/warc.c b/src/warc.c
index de99bf7..894b802 100644
--- a/src/warc.c
+++ b/src/warc.c
@@ -78,10 +78,10 @@ static FILE *warc_current_file;
 static gzFile warc_current_gzfile;
 
 /* The offset of the current gzip record in the WARC file. */
-static off_t warc_current_gzfile_offset;
+static wgint warc_current_gzfile_offset;
 
 /* The uncompressed size (so far) of the current record. */
-static off_t warc_current_gzfile_uncompressed_size;
+static wgint warc_current_gzfile_uncompressed_size;
 # endif
 
 /* This is true until a warc_write_* method fails. */
@@ -247,7 +247,9 @@ warc_write_block_from_file (FILE *data_in)
   /* Add the Content-Length header. */
   char *content_length;
   fseeko (data_in, 0L, SEEK_END);
-  if (! asprintf (&content_length, "%ld", ftello (data_in)))
+  wgint bytes = ftello (data_in);
+  int ret = asprintf (&content_length, "%s", number_to_static_string (bytes));
+  if (ret < 0)
     {
       warc_write_ok = false;
       return false;
@@ -313,9 +315,9 @@ warc_write_end_record (void)
       */
 
       /* Calculate the uncompressed and compressed sizes. */
-      off_t current_offset = ftello (warc_current_file);
-      off_t uncompressed_size = current_offset - warc_current_gzfile_offset;
-      off_t compressed_size = warc_current_gzfile_uncompressed_size;
+      wgint current_offset = ftello (warc_current_file);
+      wgint uncompressed_size = current_offset - warc_current_gzfile_offset;
+      wgint compressed_size = warc_current_gzfile_uncompressed_size;
 
       /* Go back to the static GZIP header. */
       fseeko (warc_current_file, warc_current_gzfile_offset
@@ -414,14 +416,14 @@ warc_write_ip_header (ip_address *ip)
    16 bytes beginning ad RES_PAYLOAD.  */
 static int
 warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
-                               off_t payload_offset)
+                               wgint payload_offset)
 {
 #define BLOCKSIZE 32768
 
   struct sha1_ctx ctx_block;
   struct sha1_ctx ctx_payload;
-  off_t pos;
-  off_t sum;
+  wgint pos;
+  wgint sum;
 
   char *buffer = malloc (BLOCKSIZE + 72);
   if (!buffer)
@@ -440,7 +442,7 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
       /* We read the file in blocks of BLOCKSIZE bytes.  One call of the
          computation function processes the whole buffer so that with the
          next round of the loop another block can be read.  */
-      off_t n;
+      wgint n;
       sum = 0;
 
       /* Read block.  Take care for partial reads.  */
@@ -481,7 +483,7 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
       if (payload_offset >= 0 && payload_offset < pos)
         {
           /* At least part of the buffer contains data from payload. */
-          off_t start_of_payload = payload_offset - (pos - BLOCKSIZE);
+          wgint start_of_payload = payload_offset - (pos - BLOCKSIZE);
           if (start_of_payload <= 0)
             /* All bytes in the buffer belong to the payload. */
             start_of_payload = 0;
@@ -506,7 +508,7 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
       if (payload_offset >= 0 && payload_offset < pos)
         {
           /* At least part of the buffer contains data from payload. */
-          off_t start_of_payload = payload_offset - (pos - sum);
+          wgint start_of_payload = payload_offset - (pos - sum);
           if (start_of_payload <= 0)
             /* All bytes in the buffer belong to the payload. */
             start_of_payload = 0;
@@ -1170,7 +1172,7 @@ warc_tempfile (void)
    Returns true on success, false on error. */
 bool
 warc_write_request_record (char *url, char *timestamp_str, char *record_uuid,
-                           ip_address *ip, FILE *body, off_t payload_offset)
+                           ip_address *ip, FILE *body, wgint payload_offset)
 {
   warc_write_start_record ();
   warc_write_header ("WARC-Type", "request");
@@ -1205,7 +1207,7 @@ static bool
 warc_write_cdx_record (const char *url, const char *timestamp_str,
                        const char *mime_type, int response_code,
                        const char *payload_digest, const char *redirect_location,
-                       off_t offset, const char *warc_filename,
+                       wgint offset, const char *warc_filename,
                        const char *response_uuid)
 {
   /* Transform the timestamp. */
@@ -1304,7 +1306,7 @@ warc_write_revisit_record (char *url, char *timestamp_str,
 bool
 warc_write_response_record (char *url, char *timestamp_str,
                             char *concurrent_to_uuid, ip_address *ip,
-                            FILE *body, off_t payload_offset, char *mime_type,
+                            FILE *body, wgint payload_offset, char *mime_type,
                             int response_code, char *redirect_location)
 {
   char *block_digest = NULL;
@@ -1359,7 +1361,7 @@ warc_write_response_record (char *url, char *timestamp_str,
   warc_uuid_str (response_uuid);
 
   fseeko (warc_current_file, 0L, SEEK_END);
-  off_t offset = ftello (warc_current_file);
+  wgint offset = ftello (warc_current_file);
 
   warc_write_start_record ();
   warc_write_header ("WARC-Type", "response");
@@ -1408,7 +1410,7 @@ bool
 warc_write_resource_record (char *resource_uuid, const char *url,
                  const char *timestamp_str, const char *concurrent_to_uuid,
                  ip_address *ip, const char *content_type, FILE *body,
-                 off_t payload_offset)
+                 wgint payload_offset)
 {
   if (resource_uuid == NULL)
     {
diff --git a/src/warc.h b/src/warc.h
index eba640d..dca149d 100644
--- a/src/warc.h
+++ b/src/warc.h
@@ -12,12 +12,12 @@ void warc_uuid_str (char *id_str);
 FILE * warc_tempfile (void);
 
 bool warc_write_request_record (char *url, char *timestamp_str,
-  char *concurrent_to_uuid, ip_address *ip, FILE *body, off_t payload_offset);
+  char *concurrent_to_uuid, ip_address *ip, FILE *body, wgint payload_offset);
 bool warc_write_response_record (char *url, char *timestamp_str,
-  char *concurrent_to_uuid, ip_address *ip, FILE *body, off_t payload_offset,
+  char *concurrent_to_uuid, ip_address *ip, FILE *body, wgint payload_offset,
   char *mime_type, int response_code, char *redirect_location);
 bool warc_write_resource_record (char *resource_uuid, const char *url,
   const char *timestamp_str, const char *concurrent_to_uuid, ip_address *ip,
-  const char *content_type, FILE *body, off_t payload_offset);
+  const char *content_type, FILE *body, wgint payload_offset);
 
 #endif /* WARC_H */

Reply via email to