Hi,
There's a somewhat serious issue in the WARC-generating code: on some
platforms (presumably the ones where off_t is not a 64-bit number) the
Content-Length header at the top of each WARC record has an incorrect
value. On these platforms the value is sometimes 0, sometimes 1, but never
the correct length. This makes the whole WARC file unreadable.
The code works fine on many platforms, but it is apparently a problem on
some PowerPC and ARM systems, and maybe other systems as well.
Existing WARC files with this problem can be repaired by replacing the
value of the Content-Length header with the correct value, for each WARC
record in the file. The content of the WARC records is there; it's just
the Content-Length header that is wrong.
The attached patch fixes the problem in warc.c. It replaces off_t with
wgint (also in warc.h and http.c) and formats the length with the
number_to_static_string function from util.c instead of the "%ld"
printf conversion.
Regards,
Gijs
commit 66c0595f5440b36afb7307d4cab3d6430254183b
Author: Gijs van Tulder <gvtul...@gmail.com>
Date: Mon Nov 12 22:03:30 2012 +0100
Fix for invalid WARC Content-Length header on some platforms.
diff --git a/src/ChangeLog b/src/ChangeLog
index ec78fe8..3901d94 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,10 @@
+2012-11-12 Gijs van Tulder <gvtul...@gmail.com>
+
+ * warc.c: Fix for invalid Content-Length WARC header on platforms
+ where off_t is less than 64 bits wide.
+ * warc.h: Likewise: Use wgint instead of off_t.
+ * http.c: Likewise.
+
2012-08-29 Rohit Mathulla <rohit_mathu...@yahoo.com> (tiny change)
* html-url.c (get_urls_file): Convert shorthand URLs.
diff --git a/src/http.c b/src/http.c
index 5888474..52cbe87 100644
--- a/src/http.c
+++ b/src/http.c
@@ -1712,7 +1712,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
char warc_timestamp_str [21];
char warc_request_uuid [48];
ip_address *warc_ip = NULL;
- off_t warc_payload_offset = -1;
+ wgint warc_payload_offset = -1;
/* Whether this connection will be kept alive after the HTTP request
is done. */
diff --git a/src/warc.c b/src/warc.c
index de99bf7..894b802 100644
--- a/src/warc.c
+++ b/src/warc.c
@@ -78,10 +78,10 @@ static FILE *warc_current_file;
static gzFile warc_current_gzfile;
/* The offset of the current gzip record in the WARC file. */
-static off_t warc_current_gzfile_offset;
+static wgint warc_current_gzfile_offset;
/* The uncompressed size (so far) of the current record. */
-static off_t warc_current_gzfile_uncompressed_size;
+static wgint warc_current_gzfile_uncompressed_size;
# endif
/* This is true until a warc_write_* method fails. */
@@ -247,7 +247,9 @@ warc_write_block_from_file (FILE *data_in)
/* Add the Content-Length header. */
char *content_length;
fseeko (data_in, 0L, SEEK_END);
- if (! asprintf (&content_length, "%ld", ftello (data_in)))
+ wgint bytes = ftello (data_in);
+ int ret = asprintf (&content_length, "%s", number_to_static_string (bytes));
+ if (ret < 0)
{
warc_write_ok = false;
return false;
@@ -313,9 +315,9 @@ warc_write_end_record (void)
*/
/* Calculate the uncompressed and compressed sizes. */
- off_t current_offset = ftello (warc_current_file);
- off_t uncompressed_size = current_offset - warc_current_gzfile_offset;
- off_t compressed_size = warc_current_gzfile_uncompressed_size;
+ wgint current_offset = ftello (warc_current_file);
+ wgint uncompressed_size = current_offset - warc_current_gzfile_offset;
+ wgint compressed_size = warc_current_gzfile_uncompressed_size;
/* Go back to the static GZIP header. */
fseeko (warc_current_file, warc_current_gzfile_offset
@@ -414,14 +416,14 @@ warc_write_ip_header (ip_address *ip)
16 bytes beginning ad RES_PAYLOAD. */
static int
warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
- off_t payload_offset)
+ wgint payload_offset)
{
#define BLOCKSIZE 32768
struct sha1_ctx ctx_block;
struct sha1_ctx ctx_payload;
- off_t pos;
- off_t sum;
+ wgint pos;
+ wgint sum;
char *buffer = malloc (BLOCKSIZE + 72);
if (!buffer)
@@ -440,7 +442,7 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
/* We read the file in blocks of BLOCKSIZE bytes. One call of the
computation function processes the whole buffer so that with the
next round of the loop another block can be read. */
- off_t n;
+ wgint n;
sum = 0;
/* Read block. Take care for partial reads. */
@@ -481,7 +483,7 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
if (payload_offset >= 0 && payload_offset < pos)
{
/* At least part of the buffer contains data from payload. */
- off_t start_of_payload = payload_offset - (pos - BLOCKSIZE);
+ wgint start_of_payload = payload_offset - (pos - BLOCKSIZE);
if (start_of_payload <= 0)
/* All bytes in the buffer belong to the payload. */
start_of_payload = 0;
@@ -506,7 +508,7 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
if (payload_offset >= 0 && payload_offset < pos)
{
/* At least part of the buffer contains data from payload. */
- off_t start_of_payload = payload_offset - (pos - sum);
+ wgint start_of_payload = payload_offset - (pos - sum);
if (start_of_payload <= 0)
/* All bytes in the buffer belong to the payload. */
start_of_payload = 0;
@@ -1170,7 +1172,7 @@ warc_tempfile (void)
Returns true on success, false on error. */
bool
warc_write_request_record (char *url, char *timestamp_str, char *record_uuid,
- ip_address *ip, FILE *body, off_t payload_offset)
+ ip_address *ip, FILE *body, wgint payload_offset)
{
warc_write_start_record ();
warc_write_header ("WARC-Type", "request");
@@ -1205,7 +1207,7 @@ static bool
warc_write_cdx_record (const char *url, const char *timestamp_str,
const char *mime_type, int response_code,
const char *payload_digest, const char *redirect_location,
- off_t offset, const char *warc_filename,
+ wgint offset, const char *warc_filename,
const char *response_uuid)
{
/* Transform the timestamp. */
@@ -1304,7 +1306,7 @@ warc_write_revisit_record (char *url, char *timestamp_str,
bool
warc_write_response_record (char *url, char *timestamp_str,
char *concurrent_to_uuid, ip_address *ip,
- FILE *body, off_t payload_offset, char *mime_type,
+ FILE *body, wgint payload_offset, char *mime_type,
int response_code, char *redirect_location)
{
char *block_digest = NULL;
@@ -1359,7 +1361,7 @@ warc_write_response_record (char *url, char *timestamp_str,
warc_uuid_str (response_uuid);
fseeko (warc_current_file, 0L, SEEK_END);
- off_t offset = ftello (warc_current_file);
+ wgint offset = ftello (warc_current_file);
warc_write_start_record ();
warc_write_header ("WARC-Type", "response");
@@ -1408,7 +1410,7 @@ bool
warc_write_resource_record (char *resource_uuid, const char *url,
const char *timestamp_str, const char *concurrent_to_uuid,
ip_address *ip, const char *content_type, FILE *body,
- off_t payload_offset)
+ wgint payload_offset)
{
if (resource_uuid == NULL)
{
diff --git a/src/warc.h b/src/warc.h
index eba640d..dca149d 100644
--- a/src/warc.h
+++ b/src/warc.h
@@ -12,12 +12,12 @@ void warc_uuid_str (char *id_str);
FILE * warc_tempfile (void);
bool warc_write_request_record (char *url, char *timestamp_str,
- char *concurrent_to_uuid, ip_address *ip, FILE *body, off_t payload_offset);
+ char *concurrent_to_uuid, ip_address *ip, FILE *body, wgint payload_offset);
bool warc_write_response_record (char *url, char *timestamp_str,
- char *concurrent_to_uuid, ip_address *ip, FILE *body, off_t payload_offset,
+ char *concurrent_to_uuid, ip_address *ip, FILE *body, wgint payload_offset,
char *mime_type, int response_code, char *redirect_location);
bool warc_write_resource_record (char *resource_uuid, const char *url,
const char *timestamp_str, const char *concurrent_to_uuid, ip_address *ip,
- const char *content_type, FILE *body, off_t payload_offset);
+ const char *content_type, FILE *body, wgint payload_offset);
#endif /* WARC_H */