Hi,

Two patches to WARC code and one for the Perl test suite for reviewing.

BTW, we have no tests for the WARC code, do we ?
Any ideas for an implementation ?

Regards, Tim
From 54e60b7a195d5567722e3da20e19769720d15462 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim Rühsen?= <[email protected]>
Date: Tue, 14 Apr 2015 10:57:11 +0200
Subject: [PATCH 1/3] Add more const usage to function params

* warc.c, warc.h: Add const specifier to several function args
---
 src/warc.c | 43 +++++++++++++++++++++++--------------------
 src/warc.h | 16 ++++++++--------
 2 files changed, 31 insertions(+), 28 deletions(-)

diff --git a/src/warc.c b/src/warc.c
index 67d0ae4..f2a328b 100644
--- a/src/warc.c
+++ b/src/warc.c
@@ -404,7 +404,7 @@ warc_write_date_header (const char *timestamp)
    the current WARC record.  If IP is NULL, no header will
    be written.  */
 static bool
-warc_write_ip_header (ip_address *ip)
+warc_write_ip_header (const ip_address *ip)
 {
   if (ip != NULL)
     return warc_write_header ("WARC-IP-Address", print_address (ip));
@@ -542,7 +542,7 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
 /* Converts the SHA1 digest to a base32-encoded string.
    "sha1:DIGEST\0"  (Allocates a new string for the response.)  */
 static char *
-warc_base32_sha1_digest (char *sha1_digest)
+warc_base32_sha1_digest (const char *sha1_digest)
 {
   /* length: "sha1:" + digest + "\0" */
   char *sha1_base32 = malloc (BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5 );
@@ -734,7 +734,7 @@ warc_uuid_str (char *urn_str)
 /* Write a warcinfo record to the current file.
    Updates warc_current_warcinfo_uuid_str. */
 static bool
-warc_write_warcinfo_record (char *filename)
+warc_write_warcinfo_record (const char *filename)
 {
   FILE *warc_tmp;
   char timestamp[22];
@@ -1098,7 +1098,7 @@ _("CDX file does not list record ids. (Missing column 'u'.)\n"));
    digest.  Returns NULL if the url is not found or if the payload digest
    does not match, or if CDX deduplication is disabled. */
 static struct warc_cdx_record *
-warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload)
+warc_find_duplicate_cdx_record (const char *url, char *sha1_digest_payload)
 {
   struct warc_cdx_record *rec_existing;

@@ -1293,8 +1293,9 @@ warc_tempfile (void)
    Calling this function will close body.
    Returns true on success, false on error. */
 bool
-warc_write_request_record (char *url, char *timestamp_str, char *record_uuid,
-                           ip_address *ip, FILE *body, off_t payload_offset)
+warc_write_request_record (const char *url, const char *timestamp_str,
+                           const char *record_uuid, const ip_address *ip,
+                           FILE *body, off_t payload_offset)
 {
   warc_write_start_record ();
   warc_write_header ("WARC-Type", "request");
@@ -1382,9 +1383,9 @@ warc_write_cdx_record (const char *url, const char *timestamp_str,
    Calling this function will close body.
    Returns true on success, false on error. */
 static bool
-warc_write_revisit_record (char *url, char *timestamp_str,
-                           char *concurrent_to_uuid, char *payload_digest,
-                           char *refers_to, ip_address *ip, FILE *body)
+warc_write_revisit_record (const char *url, const char *timestamp_str,
+                           const char *concurrent_to_uuid, const char *payload_digest,
+                           const char *refers_to, const ip_address *ip, FILE *body)
 {
   char revisit_uuid [48];
   char *block_digest = NULL;
@@ -1432,10 +1433,10 @@ warc_write_revisit_record (char *url, char *timestamp_str,
    Calling this function will close body.
    Returns true on success, false on error. */
 bool
-warc_write_response_record (char *url, char *timestamp_str,
-                            char *concurrent_to_uuid, ip_address *ip,
-                            FILE *body, off_t payload_offset, char *mime_type,
-                            int response_code, char *redirect_location)
+warc_write_response_record (const char *url, const char *timestamp_str,
+                            const char *concurrent_to_uuid, const ip_address *ip,
+                            FILE *body, off_t payload_offset, const char *mime_type,
+                            int response_code, const char *redirect_location)
 {
   char *block_digest = NULL;
   char *payload_digest = NULL;
@@ -1535,16 +1536,18 @@ warc_write_response_record (char *url, char *timestamp_str,
    Calling this function will close body.
    Returns true on success, false on error. */
 static bool
-warc_write_record (const char *record_type, char *resource_uuid,
+warc_write_record (const char *record_type, const char *resource_uuid,
                  const char *url, const char *timestamp_str,
                  const char *concurrent_to_uuid,
-                 ip_address *ip, const char *content_type, FILE *body,
+                 const ip_address *ip, const char *content_type, FILE *body,
                  off_t payload_offset)
 {
   if (resource_uuid == NULL)
     {
-      resource_uuid = alloca (48);
-      warc_uuid_str (resource_uuid);
+      /* using uuid_buf allows const for resource_uuid in function declaration */
+      char *uuid_buf = alloca (48);
+      warc_uuid_str (uuid_buf);
+      resource_uuid = uuid_buf;
     }

   if (content_type == NULL)
@@ -1580,9 +1583,9 @@ warc_write_record (const char *record_type, char *resource_uuid,
    Calling this function will close body.
    Returns true on success, false on error. */
 bool
-warc_write_resource_record (char *resource_uuid, const char *url,
+warc_write_resource_record (const char *resource_uuid, const char *url,
                  const char *timestamp_str, const char *concurrent_to_uuid,
-                 ip_address *ip, const char *content_type, FILE *body,
+                 const ip_address *ip, const char *content_type, FILE *body,
                  off_t payload_offset)
 {
   return warc_write_record ("resource",
@@ -1602,7 +1605,7 @@ warc_write_resource_record (char *resource_uuid, const char *url,
    Calling this function will close body.
    Returns true on success, false on error. */
 bool
-warc_write_metadata_record (char *record_uuid, const char *url,
+warc_write_metadata_record (const char *record_uuid, const char *url,
                  const char *timestamp_str, const char *concurrent_to_uuid,
                  ip_address *ip, const char *content_type, FILE *body,
                  off_t payload_offset)
diff --git a/src/warc.h b/src/warc.h
index 25e07b9..b34e9e0 100644
--- a/src/warc.h
+++ b/src/warc.h
@@ -12,15 +12,15 @@ char * warc_timestamp (char *timestamp, size_t timestamp_size);

 FILE * warc_tempfile (void);

-bool warc_write_request_record (char *url, char *timestamp_str,
-  char *concurrent_to_uuid, ip_address *ip, FILE *body, off_t payload_offset);
-bool warc_write_response_record (char *url, char *timestamp_str,
-  char *concurrent_to_uuid, ip_address *ip, FILE *body, off_t payload_offset,
-  char *mime_type, int response_code, char *redirect_location);
-bool warc_write_resource_record (char *resource_uuid, const char *url,
-  const char *timestamp_str, const char *concurrent_to_uuid, ip_address *ip,
+bool warc_write_request_record (const char *url, const char *timestamp_str,
+  const char *concurrent_to_uuid, const ip_address *ip, FILE *body, off_t payload_offset);
+bool warc_write_response_record (const char *url, const char *timestamp_str,
+  const char *concurrent_to_uuid, const ip_address *ip, FILE *body, off_t payload_offset,
+  const char *mime_type, int response_code, const char *redirect_location);
+bool warc_write_resource_record (const char *resource_uuid, const char *url,
+  const char *timestamp_str, const char *concurrent_to_uuid, const ip_address *ip,
   const char *content_type, FILE *body, off_t payload_offset);
-bool warc_write_metadata_record (char *record_uuid, const char *url,
+bool warc_write_metadata_record (const char *record_uuid, const char *url,
   const char *timestamp_str, const char *concurrent_to_uuid, ip_address *ip,
   const char *content_type, FILE *body, off_t payload_offset);

--
2.1.4

From 2669a1dfb2c409e8ed21b55f5836fdb0e4a7cb50 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim Rühsen?= <[email protected]>
Date: Tue, 14 Apr 2015 11:54:55 +0200
Subject: [PATCH 2/3] Check memory allocations in WARC code

* src/warc.c: Remove some memory allocations,
              use xmalloc instead of malloc

Reported-by: Bill Parker <[email protected]>
---
 src/warc.c | 66 +++++++++++++++++++++++++++-----------------------------------
 1 file changed, 29 insertions(+), 37 deletions(-)

diff --git a/src/warc.c b/src/warc.c
index f2a328b..2f1873e 100644
--- a/src/warc.c
+++ b/src/warc.c
@@ -102,7 +102,7 @@ static bool warc_write_ok;
 static FILE *warc_current_cdx_file;

 /* The record id of the warcinfo record of the current WARC file.  */
-static char *warc_current_warcinfo_uuid_str;
+static char warc_current_warcinfo_uuid_str[48];

 /* The file name of the current WARC file. */
 static char *warc_current_filename;
@@ -435,9 +435,7 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
   off_t pos;
   off_t sum;

-  char *buffer = malloc (BLOCKSIZE + 72);
-  if (!buffer)
-    return 1;
+  char *buffer = xmalloc (BLOCKSIZE + 72);

   /* Initialize the computation context.  */
   sha1_init_ctx (&ctx_block);
@@ -542,14 +540,17 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
 /* Converts the SHA1 digest to a base32-encoded string.
    "sha1:DIGEST\0"  (Allocates a new string for the response.)  */
 static char *
-warc_base32_sha1_digest (const char *sha1_digest)
+warc_base32_sha1_digest (const char *sha1_digest, char *sha1_base32, size_t sha1_base32_size)
 {
-  /* length: "sha1:" + digest + "\0" */
-  char *sha1_base32 = malloc (BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5 );
-  base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5,
-                 BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1);
-  memcpy (sha1_base32, "sha1:", 5);
-  sha1_base32[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 5] = '\0';
+  if (sha1_base32_size >= BASE32_LENGTH(SHA1_DIGEST_SIZE) + 5 + 1)
+    {
+      memcpy (sha1_base32, "sha1:", 5);
+      base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5,
+                     sha1_base32_size - 5);
+    }
+  else
+    *sha1_base32 = 0;
+
   return sha1_base32;
 }

@@ -571,18 +572,14 @@ warc_write_digest_headers (FILE *file, long payload_offset)
       if (warc_sha1_stream_with_payload (file, sha1_res_block,
           sha1_res_payload, payload_offset) == 0)
         {
-          char *digest;
+          char digest[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5];

-          digest = warc_base32_sha1_digest (sha1_res_block);
-          warc_write_header ("WARC-Block-Digest", digest);
-          xfree (digest);
+          warc_write_header ("WARC-Block-Digest",
+              warc_base32_sha1_digest (sha1_res_block, digest, sizeof(digest)));

           if (payload_offset >= 0)
-            {
-              digest = warc_base32_sha1_digest (sha1_res_payload);
-              warc_write_header ("WARC-Payload-Digest", digest);
-              xfree (digest);
-            }
+              warc_write_header ("WARC-Payload-Digest",
+                  warc_base32_sha1_digest (sha1_res_payload, digest, sizeof(digest)));
         }
     }
 }
@@ -743,7 +740,6 @@ warc_write_warcinfo_record (const char *filename)
   /* Write warc-info record as the first record of the file. */
   /* We add the record id of this info record to the other records in the
      file. */
-  warc_current_warcinfo_uuid_str = (char *) malloc (48);
   warc_uuid_str (warc_current_warcinfo_uuid_str);

   warc_timestamp (timestamp, sizeof(timestamp));
@@ -827,14 +823,15 @@ warc_start_new_file (bool meta)
   if (warc_current_file != NULL)
     fclose (warc_current_file);

-  xfree (warc_current_warcinfo_uuid_str);
+  *warc_current_warcinfo_uuid_str = 0;
   xfree (warc_current_filename);

   warc_current_file_number++;

   base_filename_length = strlen (opt.warc_filename);
   /* filename format:  base + "-" + 5 digit serial number + ".warc.gz" */
-  new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1);
+  new_filename = xmalloc (base_filename_length + 1 + 5 + 8 + 1);
+
   warc_current_filename = new_filename;

   /* If max size is enabled, we add a serial number to the file names. */
@@ -995,7 +992,7 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url,
         {
           /* This is a valid line with a valid checksum. */
           struct warc_cdx_record *rec;
-          rec = malloc (sizeof (struct warc_cdx_record));
+          rec = xmalloc (sizeof (struct warc_cdx_record));
           rec->url = original_url;
           rec->uuid = record_id;
           memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE);
@@ -1228,7 +1225,7 @@ warc_close (void)
   if (warc_current_file != NULL)
     {
       warc_write_metadata ();
-      xfree (warc_current_warcinfo_uuid_str);
+      *warc_current_warcinfo_uuid_str = 0;
       fclose (warc_current_file);
     }
   if (warc_current_cdx_file != NULL)
@@ -1388,13 +1385,13 @@ warc_write_revisit_record (const char *url, const char *timestamp_str,
                            const char *refers_to, const ip_address *ip, FILE *body)
 {
   char revisit_uuid [48];
-  char *block_digest = NULL;
+  char block_digest[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5];
   char sha1_res_block[SHA1_DIGEST_SIZE];

   warc_uuid_str (revisit_uuid);

   sha1_stream (body, sha1_res_block);
-  block_digest = warc_base32_sha1_digest (sha1_res_block);
+  warc_base32_sha1_digest (sha1_res_block, block_digest, sizeof(block_digest));

   warc_write_start_record ();
   warc_write_header ("WARC-Type", "revisit");
@@ -1414,7 +1411,6 @@ warc_write_revisit_record (const char *url, const char *timestamp_str,
   warc_write_end_record ();

   fclose (body);
-  xfree (block_digest);

   return warc_write_ok;
 }
@@ -1438,8 +1434,8 @@ warc_write_response_record (const char *url, const char *timestamp_str,
                             FILE *body, off_t payload_offset, const char *mime_type,
                             int response_code, const char *redirect_location)
 {
-  char *block_digest = NULL;
-  char *payload_digest = NULL;
+  char block_digest[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5];
+  char payload_digest[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5];
   char sha1_res_block[SHA1_DIGEST_SIZE];
   char sha1_res_payload[SHA1_DIGEST_SIZE];
   char response_uuid [48];
@@ -1472,17 +1468,16 @@ warc_write_response_record (const char *url, const char *timestamp_str,
                 }

               /* Send the original payload digest. */
-              payload_digest = warc_base32_sha1_digest (sha1_res_payload);
+              warc_base32_sha1_digest (sha1_res_payload, payload_digest, sizeof(payload_digest));
               result = warc_write_revisit_record (url, timestamp_str,
                          concurrent_to_uuid, payload_digest, rec_existing->uuid,
                          ip, body);
-              xfree (payload_digest);

               return result;
             }

-          block_digest = warc_base32_sha1_digest (sha1_res_block);
-          payload_digest = warc_base32_sha1_digest (sha1_res_payload);
+          warc_base32_sha1_digest (sha1_res_block, block_digest, sizeof(block_digest));
+          warc_base32_sha1_digest (sha1_res_payload, payload_digest, sizeof(payload_digest));
         }
     }

@@ -1517,9 +1512,6 @@ warc_write_response_record (const char *url, const char *timestamp_str,
       response_uuid);
     }

-  xfree (block_digest);
-  xfree (payload_digest);
-
   return warc_write_ok;
 }

--
2.1.4

From b005448a9c17b1784d61238163db4aa6ab661fea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim Rühsen?= <[email protected]>
Date: Tue, 14 Apr 2015 12:14:21 +0200
Subject: [PATCH 3/3] Silence warning in perl test suite

* tests/WgetTests.pm: Use string comparisons for $valgrind variable
---
 tests/WgetTests.pm | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/WgetTests.pm b/tests/WgetTests.pm
index 3427a1b..501aeba 100644
--- a/tests/WgetTests.pm
+++ b/tests/WgetTests.pm
@@ -135,14 +135,14 @@ sub run
     {
         $cmdline = 'gdb --args ' . $cmdline;
     }
-    elsif ($valgrind == 1)
+    elsif ($valgrind eq "1")
     {
         $cmdline            'valgrind --suppressions=' . $VALGRIND_SUPP_FILE
           . ' --error-exitcode01 --leak-check=yes --track-origins=yes '
           . $cmdline;
     }
-    elsif ($valgrind ne q{} && $valgrind != 0)
+    elsif ($valgrind ne q{} && $valgrind ne "0")
     {
         $cmdline = "$valgrind $cmdline";
     }
--
2.1.4

Attachment: signature.asc
Description: This is a digitally signed message part.

Reply via email to