Hello, I wrote two patches regarding this issue based on your suggestions.
The 1st one is crucial for retrieve_from_file() function: it fixes 2
memory corruption bugs.
The 2nd patch is more of an experiment than a real deal and is
intended to get some feedback. It changes parallel download to one
temporary file which is located in the selected directory.
Before download starts it uses posix_fallocate() to allocate space and
mkstemp() for unique name. After download is completed it uses rename()
to rename temporary file to the final file name.
After posix_fallocate() each thread opens file with fopen("wb"). I
opted for fopen() rather than messing around with file descriptors
because I believe it's more portable. I don't know how Windows
would react to file descriptors and I don't have a proper Windows system
to test it out. Now, fopen("wb") means file, which was fallocate'd, is
truncated to zero but after first request from the thread, which is
reponsible for the last chunk, it would grow back to at least file_size
- chunk_size. I'm also not sure how devastating that is.
I'm attaching a handmade Metalink file which downloads a 50MB file for
testing purposes. Currently all threads connect to the same server and I
understand we don't support such behaviour but I guess 2-3 threads for
testing purpose don't hurt anyone. :)
I'm open for suggestions.
Regards,
Jure Grabnar
>From ed8acdbf66d74284d6688ad0ac69362bfdbc98a9 Mon Sep 17 00:00:00 2001 From: Jure Grabnar <[email protected]> Date: Wed, 7 May 2014 22:38:20 +0200 Subject: [PATCH 1/2] Fix bugs causing memory corruption. --- src/ChangeLog | 6 ++++++ src/multi.c | 2 +- src/retr.c | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/ChangeLog b/src/ChangeLog index 537a707..55a1278 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,9 @@ +2014-05-07 Jure Grabnar <[email protected]> + + * multi.c: Add condition to fix memory corruption and downloading in parallel + in general. + * retr.c: Increase buffer size by 1 for '\0' to avoid memory corruption. + 2014-03-26 Darshit Shah <[email protected]> * ftp.c (getftp): Rearrange parameters to fix compiler warning diff --git a/src/multi.c b/src/multi.c index 4b22b2e..43c2f73 100644 --- a/src/multi.c +++ b/src/multi.c @@ -153,7 +153,7 @@ fill_ranges_data(int num_of_resources, long long int file_size, for (r = 0; r < num_of_resources; ++r) ranges[i].resources[r] = false; ++i; - } while (ranges[i-1].last_byte < (file_size - 1)); + } while (i < opt.jobs && ranges[i-1].last_byte < (file_size - 1)); ranges[i-1].last_byte = file_size -1; return i; diff --git a/src/retr.c b/src/retr.c index 8c361de..2f45fa5 100644 --- a/src/retr.c +++ b/src/retr.c @@ -1250,7 +1250,7 @@ retrieve_from_file (const char *file, bool html, int *count) int res; /* Form the actual file to be downloaded and verify hash. */ file_path = malloc((opt.dir_prefix ? strlen(opt.dir_prefix) : 0) - + strlen(file->name) + (sizeof "/")); + + strlen(file->name) + (sizeof "/") + 1); if(opt.dir_prefix) sprintf(file_path, "%s/%s", opt.dir_prefix, file->name); else -- 1.9.2
>From 7b453b0c1c8355b538d9f7d8d040313a1d345d37 Mon Sep 17 00:00:00 2001 From: Jure Grabnar <[email protected]> Date: Wed, 7 May 2014 22:49:51 +0200 Subject: [PATCH 2/2] Change parallel download to one temporary file instead of multiple. --- src/ChangeLog | 16 +++++++++++ src/http.c | 12 ++++++--- src/multi.c | 87 ++++++++++++++++++++++++++--------------------------------- src/multi.h | 13 +++++---- src/retr.c | 17 ++++++------ 5 files changed, 79 insertions(+), 66 deletions(-) diff --git a/src/ChangeLog b/src/ChangeLog index 55a1278..2a310e9 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,5 +1,21 @@ 2014-05-07 Jure Grabnar <[email protected]> + * multi.c: Parallel download is now stored in one temporary file rather than + multiple files. + (SUFFIX_TEMP): Define. + (name_temp_files): Use function mkstemp() instead of tmpnam() which is safer + and allows for customized path. + (init_temp_files, name_temp_files, delete_temp_files, clean_temp_files): + Rewritten to work with one file. + (merge_temp_files): Remove. + (rename_temp_file): Add. + * multi.h: Add global variable (barrier) 'file_rdy_bar'. + * retr.c (retrieve_from_file): Change code to work with one temporary file. + * http.c (gethttp): Likewise. Use external barrier to sync threads after + fopen(). + +2014-05-07 Jure Grabnar <[email protected]> + * multi.c: Add condition to fix memory corruption and downloading in parallel in general. * retr.c: Increase buffer size by 1 for '\0' to avoid memory corruption. diff --git a/src/http.c b/src/http.c index 388530c..b46fc66 100644 --- a/src/http.c +++ b/src/http.c @@ -150,7 +150,7 @@ struct request { }; extern int numurls; - +extern pthread_barrier_t file_rdy_bar; /* Create a new, empty request. Set the request's method and its arguments. METHOD should be a literal string (or it should outlive the request) because it will not be freed. ARG will be freed by @@ -2784,7 +2784,7 @@ read_header: REGISTER_PERSISTENT_CONNECTION (4); return RETRUNNEEDED; } - else if (!ALLOW_CLOBBER) + else if (!ALLOW_CLOBBER && !opt.metalink_file) { char *unique = unique_name (hs->local_file, true); if (unique != hs->local_file) @@ -3167,7 +3167,13 @@ read_header: #endif /* def __VMS [else] */ /* Open the local file. */ - if (!output_stream) + if (opt.jobs > 1) + { + fp = fopen (hs->local_file, "wb"); + fseek (fp, hs->restval, SEEK_SET); + pthread_barrier_wait (&file_rdy_bar); + } + else if (!output_stream) { mkalldirs (hs->local_file); if (opt.backups) diff --git a/src/multi.c b/src/multi.c index 43c2f73..665b50e 100644 --- a/src/multi.c +++ b/src/multi.c @@ -36,90 +36,79 @@ as that of the covered work. */ #include <pthread.h> #include <semaphore.h> #include <unistd.h> +#include <fcntl.h> #include "multi.h" #include "url.h" + +/* Suffix for temporary files. Last 6 chars must be 'X' because of mkstemp(). */ +#define SUFFIX_TEMP ".tmp.XXXXXX" + static struct range *ranges; -char **files; +char *main_file; /* Allocate space for temporary file names. */ void -init_temp_files() +init_temp_files(char *file_name) { - int i; + int alloc_size = (opt.dir_prefix ? strlen (opt.dir_prefix) + (sizeof "/") : 0) + + strlen (file_name) + (sizeof SUFFIX_TEMP) + 1; - if(!(files = malloc (opt.jobs * (sizeof *files)))) + if(!(main_file = malloc (alloc_size))) { - logprintf (LOG_VERBOSE, "Space for temporary file data could not be allocated.\n"); + logprintf (LOG_VERBOSE, "Space for temporary file names could not be allocated.\n"); exit(1); } - for (i = 0; i < opt.jobs; ++i) - if(!(files[i] = malloc (L_tmpnam * sizeof(char)))) - { - logprintf (LOG_VERBOSE, "Space for temporary file names could not be allocated.\n"); - exit(1); - } } /* Assign names to temporary files to be used. */ void -name_temp_files() +name_temp_files(char *file_name, long long int file_size) { - int i; + int fd; - for (i = 0; i < opt.jobs; ++i) - if(!tmpnam(files[i])) - { - logprintf (LOG_VERBOSE, "Temporary file name could not be assigned.\n"); - exit(1); - } + if(opt.dir_prefix) + sprintf(main_file, "%s/%s%s", opt.dir_prefix, file_name, SUFFIX_TEMP); + else + sprintf(main_file, "%s%s", file_name, SUFFIX_TEMP); + + if(!(fd = mkstemp (main_file))) + { + logprintf (LOG_VERBOSE, "Temporary file name could not be assigned.\n"); + exit(1); + } + + if (posix_fallocate(fd, 0, file_size)) + { + logprintf (LOG_VERBOSE, "File could not be allocated.\n"); + exit(1); + } + close (fd); } -/* Merge the temporary files in which the chunks are stored to form the - resulting file(output). */ +/* Rename the temporary file used to the final file name. */ void -merge_temp_files(char *output) +rename_temp_file (char *new_file_name) { - FILE *out, *in; - int j, ret; - void *buf = malloc (MIN_CHUNK_SIZE); + rename (main_file, new_file_name); - out = fopen (output, "wb"); - for(j = 0; j < opt.jobs; ++j) - { - in = fopen(files[j],"rb"); - ret = MIN_CHUNK_SIZE; - while(ret == MIN_CHUNK_SIZE) - { - ret = fread(buf, 1, MIN_CHUNK_SIZE, in); - fwrite(buf, 1, ret, out); - } - fclose(in); - } - fclose(out); - free(buf); + free (main_file); + main_file = xstrdup (new_file_name); } /* Delete the temporary files used. */ void delete_temp_files() { - int j = 0; - - while(j < opt.jobs) - unlink(files[j++]); + unlink (main_file); } /* Clean the space allocated for temporary files data. */ void clean_temp_files() { - int i; - - for (i = 0; i < opt.jobs; ++i) - free (files[i]); - free(files); + free (main_file); } /* Allocate ranges array to store the ranges data. */ @@ -188,7 +177,7 @@ spawn_thread (struct s_thread_ctx *thread_ctx, int index, int resource) if(!thread_ctx[index].url_parsed) return 1; - thread_ctx[index].file = files[index]; + thread_ctx[index].file = main_file; thread_ctx[index].range = ranges + index; (thread_ctx[index].range)->is_assigned = 1; (thread_ctx[index].range)->resources[resource] = true; diff --git a/src/multi.h b/src/multi.h index ad9bd21..212f28f 100644 --- a/src/multi.h +++ b/src/multi.h @@ -32,6 +32,7 @@ as that of the covered work. */ #define MULTI_H #include <semaphore.h> +#include <pthread.h> #include "wget.h" @@ -62,15 +63,17 @@ struct s_thread_ctx uerr_t status; }; -void init_temp_files(); +pthread_barrier_t file_rdy_bar; -void name_temp_files(); +void init_temp_files(char *); -void merge_temp_files(char *); +void name_temp_files(char *, long long int); -void delete_temp_files(); +void rename_temp_file (char *); -void clean_temp_files(); +void delete_temp_files (); + +void clean_temp_files (); void init_ranges(); diff --git a/src/retr.c b/src/retr.c index 2f45fa5..20c0cc7 100644 --- a/src/retr.c +++ b/src/retr.c @@ -1087,7 +1087,7 @@ retrieve_from_file (const char *file, bool html, int *count) elect_resources (mlink); elect_checksums (mlink); - init_temp_files(); + init_temp_files(mlink->files->name); init_ranges (); thread_ctx = malloc (opt.jobs * (sizeof *thread_ctx)); @@ -1108,9 +1108,11 @@ retrieve_from_file (const char *file, bool html, int *count) if (j < opt.jobs) opt.jobs = j; - name_temp_files (); + name_temp_files (file->name, file->size); sem_init (&retr_sem, 0, 0); + pthread_barrier_init (&file_rdy_bar, NULL, opt.jobs); + j = ranges_covered = 0; resource = file->resources; @@ -1255,10 +1257,10 @@ retrieve_from_file (const char *file, bool html, int *count) sprintf(file_path, "%s/%s", opt.dir_prefix, file->name); else sprintf(file_path, "%s", file->name); - mkalldirs(file_path); - merge_temp_files(file_path); + + rename_temp_file (file_path); res = verify_file_hash(file_path, file->checksums); - free(file_path); + free (file_path); if(!res) { ++*count; @@ -1274,13 +1276,12 @@ retrieve_from_file (const char *file, bool html, int *count) logprintf (LOG_VERBOSE, _("Retrying to download(%s). (TRY #%d)\n"), file->name, ++retries + 1); + delete_temp_files(); continue; } } } - delete_temp_files(); - clean_range_res_data(); if (opt.quota && total_downloaded_bytes > opt.quota) { @@ -1292,7 +1293,6 @@ retrieve_from_file (const char *file, bool html, int *count) free(thread_ctx); clean_ranges (); - clean_temp_files (); delete_mlink(mlink); } else @@ -1481,7 +1481,6 @@ rotate_backups(const char *fname) sprintf (from, "%s%s%d", fname, SEP, i - 1); rename (from, to); } - sprintf (to, "%s%s%d", fname, SEP, 1); rename(fname, to); } -- 1.9.2
50mb.meta3
Description: Binary data
