Hi, Two patches to WARC code and one for the Perl test suite for reviewing.
BTW, we have no tests for the WARC code, do we ? Any ideas for an implementation ? Regards, Tim
From 54e60b7a195d5567722e3da20e19769720d15462 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim Rühsen?= <[email protected]> Date: Tue, 14 Apr 2015 10:57:11 +0200 Subject: [PATCH 1/3] Add more const usage to function params * warc.c, warc.h: Add const specifier to several function args --- src/warc.c | 43 +++++++++++++++++++++++-------------------- src/warc.h | 16 ++++++++-------- 2 files changed, 31 insertions(+), 28 deletions(-) diff --git a/src/warc.c b/src/warc.c index 67d0ae4..f2a328b 100644 --- a/src/warc.c +++ b/src/warc.c @@ -404,7 +404,7 @@ warc_write_date_header (const char *timestamp) the current WARC record. If IP is NULL, no header will be written. */ static bool -warc_write_ip_header (ip_address *ip) +warc_write_ip_header (const ip_address *ip) { if (ip != NULL) return warc_write_header ("WARC-IP-Address", print_address (ip)); @@ -542,7 +542,7 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, /* Converts the SHA1 digest to a base32-encoded string. "sha1:DIGEST\0" (Allocates a new string for the response.) */ static char * -warc_base32_sha1_digest (char *sha1_digest) +warc_base32_sha1_digest (const char *sha1_digest) { /* length: "sha1:" + digest + "\0" */ char *sha1_base32 = malloc (BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5 ); @@ -734,7 +734,7 @@ warc_uuid_str (char *urn_str) /* Write a warcinfo record to the current file. Updates warc_current_warcinfo_uuid_str. */ static bool -warc_write_warcinfo_record (char *filename) +warc_write_warcinfo_record (const char *filename) { FILE *warc_tmp; char timestamp[22]; @@ -1098,7 +1098,7 @@ _("CDX file does not list record ids. (Missing column 'u'.)\n")); digest. Returns NULL if the url is not found or if the payload digest does not match, or if CDX deduplication is disabled. */ static struct warc_cdx_record * -warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload) +warc_find_duplicate_cdx_record (const char *url, char *sha1_digest_payload) { struct warc_cdx_record *rec_existing; @@ -1293,8 +1293,9 @@ warc_tempfile (void) Calling this function will close body. Returns true on success, false on error. */ bool -warc_write_request_record (char *url, char *timestamp_str, char *record_uuid, - ip_address *ip, FILE *body, off_t payload_offset) +warc_write_request_record (const char *url, const char *timestamp_str, + const char *record_uuid, const ip_address *ip, + FILE *body, off_t payload_offset) { warc_write_start_record (); warc_write_header ("WARC-Type", "request"); @@ -1382,9 +1383,9 @@ warc_write_cdx_record (const char *url, const char *timestamp_str, Calling this function will close body. Returns true on success, false on error. */ static bool -warc_write_revisit_record (char *url, char *timestamp_str, - char *concurrent_to_uuid, char *payload_digest, - char *refers_to, ip_address *ip, FILE *body) +warc_write_revisit_record (const char *url, const char *timestamp_str, + const char *concurrent_to_uuid, const char *payload_digest, + const char *refers_to, const ip_address *ip, FILE *body) { char revisit_uuid [48]; char *block_digest = NULL; @@ -1432,10 +1433,10 @@ warc_write_revisit_record (char *url, char *timestamp_str, Calling this function will close body. Returns true on success, false on error. */ bool -warc_write_response_record (char *url, char *timestamp_str, - char *concurrent_to_uuid, ip_address *ip, - FILE *body, off_t payload_offset, char *mime_type, - int response_code, char *redirect_location) +warc_write_response_record (const char *url, const char *timestamp_str, + const char *concurrent_to_uuid, const ip_address *ip, + FILE *body, off_t payload_offset, const char *mime_type, + int response_code, const char *redirect_location) { char *block_digest = NULL; char *payload_digest = NULL; @@ -1535,16 +1536,18 @@ warc_write_response_record (char *url, char *timestamp_str, Calling this function will close body. Returns true on success, false on error. */ static bool -warc_write_record (const char *record_type, char *resource_uuid, +warc_write_record (const char *record_type, const char *resource_uuid, const char *url, const char *timestamp_str, const char *concurrent_to_uuid, - ip_address *ip, const char *content_type, FILE *body, + const ip_address *ip, const char *content_type, FILE *body, off_t payload_offset) { if (resource_uuid == NULL) { - resource_uuid = alloca (48); - warc_uuid_str (resource_uuid); + /* using uuid_buf allows const for resource_uuid in function declaration */ + char *uuid_buf = alloca (48); + warc_uuid_str (uuid_buf); + resource_uuid = uuid_buf; } if (content_type == NULL) @@ -1580,9 +1583,9 @@ warc_write_record (const char *record_type, char *resource_uuid, Calling this function will close body. Returns true on success, false on error. */ bool -warc_write_resource_record (char *resource_uuid, const char *url, +warc_write_resource_record (const char *resource_uuid, const char *url, const char *timestamp_str, const char *concurrent_to_uuid, - ip_address *ip, const char *content_type, FILE *body, + const ip_address *ip, const char *content_type, FILE *body, off_t payload_offset) { return warc_write_record ("resource", @@ -1602,7 +1605,7 @@ warc_write_resource_record (char *resource_uuid, const char *url, Calling this function will close body. Returns true on success, false on error. */ bool -warc_write_metadata_record (char *record_uuid, const char *url, +warc_write_metadata_record (const char *record_uuid, const char *url, const char *timestamp_str, const char *concurrent_to_uuid, ip_address *ip, const char *content_type, FILE *body, off_t payload_offset) diff --git a/src/warc.h b/src/warc.h index 25e07b9..b34e9e0 100644 --- a/src/warc.h +++ b/src/warc.h @@ -12,15 +12,15 @@ char * warc_timestamp (char *timestamp, size_t timestamp_size); FILE * warc_tempfile (void); -bool warc_write_request_record (char *url, char *timestamp_str, - char *concurrent_to_uuid, ip_address *ip, FILE *body, off_t payload_offset); -bool warc_write_response_record (char *url, char *timestamp_str, - char *concurrent_to_uuid, ip_address *ip, FILE *body, off_t payload_offset, - char *mime_type, int response_code, char *redirect_location); -bool warc_write_resource_record (char *resource_uuid, const char *url, - const char *timestamp_str, const char *concurrent_to_uuid, ip_address *ip, +bool warc_write_request_record (const char *url, const char *timestamp_str, + const char *concurrent_to_uuid, const ip_address *ip, FILE *body, off_t payload_offset); +bool warc_write_response_record (const char *url, const char *timestamp_str, + const char *concurrent_to_uuid, const ip_address *ip, FILE *body, off_t payload_offset, + const char *mime_type, int response_code, const char *redirect_location); +bool warc_write_resource_record (const char *resource_uuid, const char *url, + const char *timestamp_str, const char *concurrent_to_uuid, const ip_address *ip, const char *content_type, FILE *body, off_t payload_offset); -bool warc_write_metadata_record (char *record_uuid, const char *url, +bool warc_write_metadata_record (const char *record_uuid, const char *url, const char *timestamp_str, const char *concurrent_to_uuid, ip_address *ip, const char *content_type, FILE *body, off_t payload_offset); -- 2.1.4
From 2669a1dfb2c409e8ed21b55f5836fdb0e4a7cb50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim Rühsen?= <[email protected]> Date: Tue, 14 Apr 2015 11:54:55 +0200 Subject: [PATCH 2/3] Check memory allocations in WARC code * src/warc.c: Remove some memory allocations, use xmalloc instead of malloc Reported-by: Bill Parker <[email protected]> --- src/warc.c | 66 +++++++++++++++++++++++++++----------------------------------- 1 file changed, 29 insertions(+), 37 deletions(-) diff --git a/src/warc.c b/src/warc.c index f2a328b..2f1873e 100644 --- a/src/warc.c +++ b/src/warc.c @@ -102,7 +102,7 @@ static bool warc_write_ok; static FILE *warc_current_cdx_file; /* The record id of the warcinfo record of the current WARC file. */ -static char *warc_current_warcinfo_uuid_str; +static char warc_current_warcinfo_uuid_str[48]; /* The file name of the current WARC file. */ static char *warc_current_filename; @@ -435,9 +435,7 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, off_t pos; off_t sum; - char *buffer = malloc (BLOCKSIZE + 72); - if (!buffer) - return 1; + char *buffer = xmalloc (BLOCKSIZE + 72); /* Initialize the computation context. */ sha1_init_ctx (&ctx_block); @@ -542,14 +540,17 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, /* Converts the SHA1 digest to a base32-encoded string. "sha1:DIGEST\0" (Allocates a new string for the response.) */ static char * -warc_base32_sha1_digest (const char *sha1_digest) +warc_base32_sha1_digest (const char *sha1_digest, char *sha1_base32, size_t sha1_base32_size) { - /* length: "sha1:" + digest + "\0" */ - char *sha1_base32 = malloc (BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5 ); - base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5, - BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1); - memcpy (sha1_base32, "sha1:", 5); - sha1_base32[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 5] = '\0'; + if (sha1_base32_size >= BASE32_LENGTH(SHA1_DIGEST_SIZE) + 5 + 1) + { + memcpy (sha1_base32, "sha1:", 5); + base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5, + sha1_base32_size - 5); + } + else + *sha1_base32 = 0; + return sha1_base32; } @@ -571,18 +572,14 @@ warc_write_digest_headers (FILE *file, long payload_offset) if (warc_sha1_stream_with_payload (file, sha1_res_block, sha1_res_payload, payload_offset) == 0) { - char *digest; + char digest[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5]; - digest = warc_base32_sha1_digest (sha1_res_block); - warc_write_header ("WARC-Block-Digest", digest); - xfree (digest); + warc_write_header ("WARC-Block-Digest", + warc_base32_sha1_digest (sha1_res_block, digest, sizeof(digest))); if (payload_offset >= 0) - { - digest = warc_base32_sha1_digest (sha1_res_payload); - warc_write_header ("WARC-Payload-Digest", digest); - xfree (digest); - } + warc_write_header ("WARC-Payload-Digest", + warc_base32_sha1_digest (sha1_res_payload, digest, sizeof(digest))); } } } @@ -743,7 +740,6 @@ warc_write_warcinfo_record (const char *filename) /* Write warc-info record as the first record of the file. */ /* We add the record id of this info record to the other records in the file. */ - warc_current_warcinfo_uuid_str = (char *) malloc (48); warc_uuid_str (warc_current_warcinfo_uuid_str); warc_timestamp (timestamp, sizeof(timestamp)); @@ -827,14 +823,15 @@ warc_start_new_file (bool meta) if (warc_current_file != NULL) fclose (warc_current_file); - xfree (warc_current_warcinfo_uuid_str); + *warc_current_warcinfo_uuid_str = 0; xfree (warc_current_filename); warc_current_file_number++; base_filename_length = strlen (opt.warc_filename); /* filename format: base + "-" + 5 digit serial number + ".warc.gz" */ - new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1); + new_filename = xmalloc (base_filename_length + 1 + 5 + 8 + 1); + warc_current_filename = new_filename; /* If max size is enabled, we add a serial number to the file names. */ @@ -995,7 +992,7 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url, { /* This is a valid line with a valid checksum. */ struct warc_cdx_record *rec; - rec = malloc (sizeof (struct warc_cdx_record)); + rec = xmalloc (sizeof (struct warc_cdx_record)); rec->url = original_url; rec->uuid = record_id; memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE); @@ -1228,7 +1225,7 @@ warc_close (void) if (warc_current_file != NULL) { warc_write_metadata (); - xfree (warc_current_warcinfo_uuid_str); + *warc_current_warcinfo_uuid_str = 0; fclose (warc_current_file); } if (warc_current_cdx_file != NULL) @@ -1388,13 +1385,13 @@ warc_write_revisit_record (const char *url, const char *timestamp_str, const char *refers_to, const ip_address *ip, FILE *body) { char revisit_uuid [48]; - char *block_digest = NULL; + char block_digest[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5]; char sha1_res_block[SHA1_DIGEST_SIZE]; warc_uuid_str (revisit_uuid); sha1_stream (body, sha1_res_block); - block_digest = warc_base32_sha1_digest (sha1_res_block); + warc_base32_sha1_digest (sha1_res_block, block_digest, sizeof(block_digest)); warc_write_start_record (); warc_write_header ("WARC-Type", "revisit"); @@ -1414,7 +1411,6 @@ warc_write_revisit_record (const char *url, const char *timestamp_str, warc_write_end_record (); fclose (body); - xfree (block_digest); return warc_write_ok; } @@ -1438,8 +1434,8 @@ warc_write_response_record (const char *url, const char *timestamp_str, FILE *body, off_t payload_offset, const char *mime_type, int response_code, const char *redirect_location) { - char *block_digest = NULL; - char *payload_digest = NULL; + char block_digest[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5]; + char payload_digest[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5]; char sha1_res_block[SHA1_DIGEST_SIZE]; char sha1_res_payload[SHA1_DIGEST_SIZE]; char response_uuid [48]; @@ -1472,17 +1468,16 @@ warc_write_response_record (const char *url, const char *timestamp_str, } /* Send the original payload digest. */ - payload_digest = warc_base32_sha1_digest (sha1_res_payload); + warc_base32_sha1_digest (sha1_res_payload, payload_digest, sizeof(payload_digest)); result = warc_write_revisit_record (url, timestamp_str, concurrent_to_uuid, payload_digest, rec_existing->uuid, ip, body); - xfree (payload_digest); return result; } - block_digest = warc_base32_sha1_digest (sha1_res_block); - payload_digest = warc_base32_sha1_digest (sha1_res_payload); + warc_base32_sha1_digest (sha1_res_block, block_digest, sizeof(block_digest)); + warc_base32_sha1_digest (sha1_res_payload, payload_digest, sizeof(payload_digest)); } } @@ -1517,9 +1512,6 @@ warc_write_response_record (const char *url, const char *timestamp_str, response_uuid); } - xfree (block_digest); - xfree (payload_digest); - return warc_write_ok; } -- 2.1.4
From b005448a9c17b1784d61238163db4aa6ab661fea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim Rühsen?= <[email protected]> Date: Tue, 14 Apr 2015 12:14:21 +0200 Subject: [PATCH 3/3] Silence warning in perl test suite * tests/WgetTests.pm: Use string comparisons for $valgrind variable --- tests/WgetTests.pm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/WgetTests.pm b/tests/WgetTests.pm index 3427a1b..501aeba 100644 --- a/tests/WgetTests.pm +++ b/tests/WgetTests.pm @@ -135,14 +135,14 @@ sub run { $cmdline = 'gdb --args ' . $cmdline; } - elsif ($valgrind == 1) + elsif ($valgrind eq "1") { $cmdline 'valgrind --suppressions=' . $VALGRIND_SUPP_FILE . ' --error-exitcode01 --leak-check=yes --track-origins=yes ' . $cmdline; } - elsif ($valgrind ne q{} && $valgrind != 0) + elsif ($valgrind ne q{} && $valgrind ne "0") { $cmdline = "$valgrind $cmdline"; } -- 2.1.4
signature.asc
Description: This is a digitally signed message part.
