Here are another couple of patches, this time for warc.c.

>From e3100a62b8a2e31ea3458bc895002e5e537903d6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81ngel=20Gonz=C3=A1lez?= <[email protected]>
Date: Wed, 14 Nov 2012 18:16:50 +0100
Subject: [PATCH 1/2] Removed most mixed declarations and code from warc.c.

---
 src/ChangeLog |  2 +-
 src/warc.c    | 81 ++++++++++++++++++++++++++++++++++++-----------------------
 2 files changed, 50 insertions(+), 33 deletions(-)

diff --git a/src/ChangeLog b/src/ChangeLog
index 5ab9321..713a6ba 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -3,7 +3,7 @@
 	gcc -std=c89
 	* host.c (cache_query): Fix warning about format '%p' expecting
 	an argument of type 'void *', but being given a 'struct address_list *'
-	* gnutls.c, html-url.c, http.c, retr.c, main.c: Removed mixed 
+	* gnutls.c, html-url.c, http.c, retr.c, main.c, warc.c: Removed mixed 
 	declarations and code, making more C89 compatible.
 
 2012-11-13  Giuseppe Scrivano  <[email protected]>
diff --git a/src/warc.c b/src/warc.c
index 437502c..04e11e4 100644
--- a/src/warc.c
+++ b/src/warc.c
@@ -154,10 +154,12 @@ warc_write_buffer (const char *buffer, size_t size)
 static bool
 warc_write_string (const char *str)
 {
+  size_t n;
+
   if (!warc_write_ok)
     return false;
 
-  size_t n = strlen (str);
+  n = strlen (str);
   if (n != warc_write_buffer (str, n))
     warc_write_ok = false;
 
@@ -246,6 +248,9 @@ warc_write_block_from_file (FILE *data_in)
 {
   /* Add the Content-Length header. */
   char *content_length;
+  char buffer[BUFSIZ];
+  size_t s;
+
   fseeko (data_in, 0L, SEEK_END);
   if (! asprintf (&content_length, "%ld", ftello (data_in)))
     {
@@ -262,8 +267,6 @@ warc_write_block_from_file (FILE *data_in)
     warc_write_ok = false;
 
   /* Copy the data in the file to the WARC record. */
-  char buffer[BUFSIZ];
-  size_t s;
   while (warc_write_ok && (s = fread (buffer, 1, BUFSIZ, data_in)) > 0)
     {
       if (warc_write_buffer (buffer, s) < s)
@@ -649,13 +652,14 @@ warc_write_warcinfo_record (char *filename)
   /* Write warc-info record as the first record of the file. */
   /* We add the record id of this info record to the other records in the
      file. */
+  char timestamp[22]; FILE *warc_tmp;
+  char *filename_copy, *filename_basename;
+
   warc_current_warcinfo_uuid_str = (char *) malloc (48);
   warc_uuid_str (warc_current_warcinfo_uuid_str);
 
-  char timestamp[22];
   warc_timestamp (timestamp);
 
-  char *filename_copy, *filename_basename;
   filename_copy = strdup (filename);
   filename_basename = strdup (basename (filename_copy));
 
@@ -667,7 +671,7 @@ warc_write_warcinfo_record (char *filename)
   warc_write_header ("WARC-Filename", filename_basename);
 
   /* Create content.  */
-  FILE *warc_tmp = warc_tempfile ();
+  warc_tmp = warc_tempfile ();
   if (warc_tmp == NULL)
     {
       free (filename_copy);
@@ -717,6 +721,15 @@ warc_write_warcinfo_record (char *filename)
 static bool
 warc_start_new_file (bool meta)
 {
+  int base_filename_length;
+  char *new_filename;
+
+#ifdef HAVE_LIBZ
+  const char *extension = (opt.warc_compression_enabled ? "warc.gz" : "warc");
+#else
+  const char *extension = "warc";
+#endif
+
   if (opt.warc_filename == NULL)
     return false;
 
@@ -729,17 +742,11 @@ warc_start_new_file (bool meta)
 
   warc_current_file_number++;
 
-  int base_filename_length = strlen (opt.warc_filename);
+  base_filename_length = strlen (opt.warc_filename);
   /* filename format:  base + "-" + 5 digit serial number + ".warc.gz" */
-  char *new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1);
+  new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1);
   warc_current_filename = new_filename;
 
-#ifdef HAVE_LIBZ
-  const char *extension = (opt.warc_compression_enabled ? "warc.gz" : "warc");
-#else
-  const char *extension = "warc";
-#endif
-
   /* If max size is enabled, we add a serial number to the file names. */
   if (meta)
     sprintf (new_filename, "%s-meta.%s", opt.warc_filename, extension);
@@ -811,12 +818,13 @@ static bool
 warc_parse_cdx_header (char *lineptr, int *field_num_original_url,
                        int *field_num_checksum, int *field_num_record_id)
 {
+  char *token;
+  char *save_ptr;
+
   *field_num_original_url = -1;
   *field_num_checksum = -1;
   *field_num_record_id = -1;
 
-  char *token;
-  char *save_ptr;
   token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
 
   if (token != NULL && strcmp (token, "CDX") == 0)
@@ -860,10 +868,11 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url,
 
   char *token;
   char *save_ptr;
+  int field_num = 0;
+
   token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
 
   /* Read this line to get the fields we need. */
-  int field_num = 0;
   while (token != NULL)
     {
       char **val;
@@ -926,10 +935,6 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url,
 static bool
 warc_load_cdx_dedup_file (void)
 {
-  FILE *f = fopen (opt.warc_cdx_dedup_filename, "r");
-  if (f == NULL)
-    return false;
-
   int field_num_original_url = -1;
   int field_num_checksum = -1;
   int field_num_record_id = -1;
@@ -938,6 +943,10 @@ warc_load_cdx_dedup_file (void)
   size_t n = 0;
   ssize_t line_length;
 
+  FILE *f = fopen (opt.warc_cdx_dedup_filename, "r");
+  if (f == NULL)
+    return false;
+
   /* The first line should contain the CDX header.
      Format:  " CDX x x x x x"
      where x are field type indicators.  For our purposes, we only
@@ -965,6 +974,7 @@ _("CDX file does not list record ids. (Missing column 'u'.)\n"));
     }
   else
     {
+      int nrecords;
       /* Initialize the table. */
       warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest,
                                              warc_cmp_sha1_digest);
@@ -982,7 +992,7 @@ _("CDX file does not list record ids. (Missing column 'u'.)\n"));
       while (line_length != -1);
 
       /* Print results. */
-      int nrecords = hash_table_count (warc_cdx_dedup_table);
+      nrecords = hash_table_count (warc_cdx_dedup_table);
       logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n",
                                         "Loaded %d records from CDX.\n\n",
                                          nrecords),
@@ -1002,11 +1012,12 @@ _("CDX file does not list record ids. (Missing column 'u'.)\n"));
 static struct warc_cdx_record *
 warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload)
 {
+  struct warc_cdx_record *rec_existing;
+
   if (warc_cdx_dedup_table == NULL)
     return NULL;
 
-  struct warc_cdx_record *rec_existing
-    = hash_table_get (warc_cdx_dedup_table, sha1_digest_payload);
+  rec_existing = hash_table_get (warc_cdx_dedup_table, sha1_digest_payload);
 
   if (rec_existing && strcmp (rec_existing->url, url) == 0)
     return rec_existing;
@@ -1077,11 +1088,13 @@ warc_init (void)
 static void
 warc_write_metadata (void)
 {
+  char manifest_uuid [48];
+  FILE *warc_tmp_fp;
+
   /* If there are multiple WARC files, the metadata should be written to a separate file. */
   if (opt.warc_maxsize > 0)
     warc_start_new_file (true);
 
-  char manifest_uuid [48];
   warc_uuid_str (manifest_uuid);
 
   fflush (warc_manifest_fp);
@@ -1091,7 +1104,7 @@ warc_write_metadata (void)
                               warc_manifest_fp, -1);
   /* warc_write_resource_record has closed warc_manifest_fp. */
 
-  FILE * warc_tmp_fp = warc_tempfile ();
+  warc_tmp_fp = warc_tempfile ();
   if (warc_tmp_fp == NULL)
     {
       logprintf (LOG_NOTQUIET, _("Could not open temporary WARC file.\n"));
@@ -1146,10 +1159,12 @@ FILE *
 warc_tempfile (void)
 {
   char filename[100];
+  int fd;
+
   if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1)
     return NULL;
 
-  int fd = mkstemp (filename);
+  fd = mkstemp (filename);
   if (fd < 0)
     return NULL;
 
@@ -1210,6 +1225,8 @@ warc_write_cdx_record (const char *url, const char *timestamp_str,
 {
   /* Transform the timestamp. */
   char timestamp_str_cdx [15];
+  const char *checksum;
+
   memcpy (timestamp_str_cdx     , timestamp_str     , 4); /* "YYYY" "-" */
   memcpy (timestamp_str_cdx +  4, timestamp_str +  5, 2); /* "mm"   "-" */
   memcpy (timestamp_str_cdx +  6, timestamp_str +  8, 2); /* "dd"   "T" */
@@ -1219,7 +1236,6 @@ warc_write_cdx_record (const char *url, const char *timestamp_str,
   timestamp_str_cdx[14] = '\0';
 
   /* Rewrite the checksum. */
-  const char *checksum;
   if (payload_digest != NULL)
     checksum = payload_digest + 5; /* Skip the "sha1:" */
   else
@@ -1258,10 +1274,10 @@ warc_write_revisit_record (char *url, char *timestamp_str,
                            char *refers_to, ip_address *ip, FILE *body)
 {
   char revisit_uuid [48];
-  warc_uuid_str (revisit_uuid);
-
   char *block_digest = NULL;
   char sha1_res_block[SHA1_DIGEST_SIZE];
+
+  warc_uuid_str (revisit_uuid);
   sha1_stream (body, sha1_res_block);
   block_digest = warc_base32_sha1_digest (sha1_res_block);
 
@@ -1311,6 +1327,8 @@ warc_write_response_record (char *url, char *timestamp_str,
   char *payload_digest = NULL;
   char sha1_res_block[SHA1_DIGEST_SIZE];
   char sha1_res_payload[SHA1_DIGEST_SIZE];
+  char response_uuid [48];
+  off_t offset;
 
   if (opt.warc_digests_enabled)
     {
@@ -1355,11 +1373,10 @@ warc_write_response_record (char *url, char *timestamp_str,
 
   /* Not a revisit, just store the record. */
 
-  char response_uuid [48];
   warc_uuid_str (response_uuid);
 
   fseeko (warc_current_file, 0L, SEEK_END);
-  off_t offset = ftello (warc_current_file);
+  offset = ftello (warc_current_file);
 
   warc_write_start_record ();
   warc_write_header ("WARC-Type", "response");
-- 
1.8.0

>From a618b9fdd57a9f2d49e1be6bb4e8ca9e88dd8e59 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81ngel=20Gonz=C3=A1lez?= <[email protected]>
Date: Wed, 14 Nov 2012 18:56:36 +0100
Subject: [PATCH 2/2] Removed the remaining mixed declarations and code from warc.c

---
 src/warc.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/src/warc.c b/src/warc.c
index 04e11e4..2f4fe04 100644
--- a/src/warc.c
+++ b/src/warc.c
@@ -105,7 +105,7 @@ static int warc_current_file_number;
 struct hash_table * warc_cdx_dedup_table;
 
 static bool warc_start_new_file (bool meta);
-
+static bool warc_write_skip_length_header (void);
 
 struct warc_cdx_record
 {
@@ -300,6 +300,17 @@ warc_write_end_record (void)
       fflush (warc_current_file);
       fseeko (warc_current_file, 0, SEEK_END);
 
+      return warc_write_skip_length_header();
+    }
+#endif /* HAVE_LIBZ */
+
+  return warc_write_ok;
+}
+
+static bool
+warc_write_skip_length_header (void)
+{
+#ifdef HAVE_LIBZ
       /* The WARC standard suggests that we add 'skip length' data in the
          extra header field of the GZIP stream.
 
@@ -319,14 +330,16 @@ warc_write_end_record (void)
       off_t current_offset = ftello (warc_current_file);
       off_t uncompressed_size = current_offset - warc_current_gzfile_offset;
       off_t compressed_size = warc_current_gzfile_uncompressed_size;
+      char static_header[GZIP_STATIC_HEADER_SIZE];
+      char extra_header[EXTRA_GZIP_HEADER_SIZE];
+      size_t result;
 
       /* Go back to the static GZIP header. */
       fseeko (warc_current_file, warc_current_gzfile_offset
               + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
 
       /* Read the header. */
-      char static_header[GZIP_STATIC_HEADER_SIZE];
-      size_t result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE,
+      result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE,
                              warc_current_file);
       if (result != GZIP_STATIC_HEADER_SIZE)
         {
@@ -343,7 +356,7 @@ warc_write_end_record (void)
       fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
 
       /* Prepare the extra GZIP header. */
-      char extra_header[EXTRA_GZIP_HEADER_SIZE];
+
       /* XLEN, the length of the extra header fields.  */
       extra_header[0]  = ((EXTRA_GZIP_HEADER_SIZE - 2) & 255);
       extra_header[1]  = ((EXTRA_GZIP_HEADER_SIZE - 2) >> 8) & 255;
@@ -369,10 +382,8 @@ warc_write_end_record (void)
       /* Done, move back to the end of the file. */
       fflush (warc_current_file);
       fseeko (warc_current_file, 0, SEEK_END);
-    }
 #endif /* HAVE_LIBZ */
-
-  return warc_write_ok;
+  return true;
 }
 
 
-- 
1.8.0

Reply via email to