Hi Amos, Samuel, The Hurd Team,

this is a follow-up to my previous email; it attempts to handle HTTP redirects.

There are two files: the first simply adds a User-Agent header, because some
websites do not like HTTP/1.0 requests without one;
the second implements a simple redirect mechanism for the case where the first
HEAD request returns a response with a Location header.

Tried with:

settrans -facg /tmp/site ./httpfs -D -L 1 gnu.org/
In the HTML parser for parsing tmp
Connecting to gnu.org via gnu.org:80
HTTP Protocol Verified. Status: 301
Connecting to www.gnu.org via www.gnu.org:80
HTTP Protocol Verified. Status: 200
entering the main loop

ls -1 /tmp/site/
filling out dir tmp
index.html

cat /tmp/site/index.html
Connecting to www.gnu.org via www.gnu.org:80
HTTP Protocol Verified. Status: 200

I have a question: in the next patch, shall I focus on removing the
HEAD request and using only a GET? I ask because this patch does not handle the
case where the GET request itself replies with a Location header.

PS: the output of the ls command is much longer, but I cut it off for brevity.

Sincerely,

Gianluca
Index: httpfs/http.c
===================================================================
--- httpfs.orig/http.c
+++ httpfs/http.c
@@ -225,7 +225,7 @@ error_t open_connection(struct netnode *
        }
 
        /* Send a HEAD request find header length */
-       sprintf(buffer,"HEAD %s HTTP/1.0\r\nHost: %s\r\n\r\n",node->conn_req,node->url);
+       sprintf(buffer,"HEAD %s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s/%s\r\n\r\n",node->conn_req,node->url, HTTPFS_SERVER_NAME, HTTPFS_SERVER_VERSION);
        towrite = strlen (buffer);
        written = TEMP_FAILURE_RETRY (write (*fd, buffer, towrite));
        if ( written == -1 || written < towrite )
@@ -389,7 +389,7 @@ error_t fill_dirnode (struct netnode *di
                                        strcat(conn_req,"/");
                        }
                        comm_buf=(char *)malloc((strlen(conn_req)+20)*sizeof(char));
-                       sprintf(comm_buf,"GET %s HTTP/1.0\r\nHost: %s\r\n\r\n",conn_req,url);
+                       sprintf(comm_buf,"GET %s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s/%s\r\n\r\n",conn_req,url, HTTPFS_SERVER_NAME, HTTPFS_SERVER_VERSION);
 
                        nd = httpfs_make_node (go->f_type,url,conn_req,comm_buf,f_name);
                        if (!nd)
Index: httpfs/args.c
===================================================================
--- httpfs.orig/args.c
+++ httpfs/args.c
@@ -40,6 +40,7 @@ static const struct argp_option options[
   {"proxy",'X',"STRING",0,"Specify IP address of proxy server"},
   {"port",'P',"NUMBER",0,"Specify a non-standard port"},
   {"mode",'M',"STRING",0,"Set to directory or file (--mode=dir or --mode=file)"},
+  {"location",'L',"MAX",0,"The maximum number of redirects to follow"},
   {0}
 };
 
@@ -78,6 +79,11 @@ static error_t parse_opt (int opt, char
        else
                return ARGP_ERR_UNKNOWN;
        break;
+   case 'L':
+       max_redirects = atoi (arg);
+       if (max_redirects < 0)
+           argp_error (state, "Invalid redirect limit: %s", arg);
+       break;
    case ARGP_KEY_ARG:
       url = arg;
       break;
Index: httpfs/httpfs.c
===================================================================
--- httpfs.orig/httpfs.c
+++ httpfs/httpfs.c
@@ -38,6 +38,7 @@ char *url, *conn_req;
 char *ip_addr;
 char *dir_tok[25];
 struct files *list_of_entries = NULL, *this_entry;
+int max_redirects = 0;
 
 struct httpfs *httpfs;         /* filesystem global pointer */
 volatile struct mapped_time_value *httpfs_maptime;
Index: httpfs/httpfs.h
===================================================================
--- httpfs.orig/httpfs.h
+++ httpfs/httpfs.h
@@ -52,6 +52,9 @@ extern int no_of_slashes;
  * like www.gnu.org/gpl.html and www.gnu.org/  no file given so index.html */
 extern char *dir_tok[25];
 
+/* The maximum number of redirects to follow */
+extern int max_redirects;
+
 
 /* handle all initial parameter parsing */
 error_t httpfs_parse_args (int argc, char **argv);
Index: httpfs/http.c
===================================================================
--- httpfs.orig/http.c
+++ httpfs/http.c
@@ -187,72 +187,142 @@ error_t open_connection(struct netnode *
        size_t towrite;
        char buffer[4096];
        ssize_t bytes_read;
+       int redirects_followed = 0;
 
-       /* 1. Target selection.
-        * If ip_addr (proxy global variable) is set, we use it. 
-        * Otherwise we use the node URL.
-        */
-       const char *target_host = (strcmp (ip_addr, "0.0.0.0") != 0) ? ip_addr : node->url;
-
-       /* 2. Agnostic resolution (IPv4/IPv6) */
-       if ((err = lookup_host (target_host, &server_addr, &addr_len, &sock_type, &protocol)) != 0) {
-               fprintf (stderr, "Cannot resolve host: %s\n", target_host);
-               return err;
-       }
+       while (1) {
+               if (redirects_followed > max_redirects)
+                       return ELOOP;
+
+               /* 1. Target selection.
+                * If ip_addr (proxy global variable) is set, we use it. 
+                * Otherwise we use the node URL.
+                */
+               const char *target_host = (strcmp (ip_addr, "0.0.0.0") != 0) ? ip_addr : node->url;
+
+               /* 2. Agnostic resolution (IPv4/IPv6) */
+               if ((err = lookup_host (target_host, &server_addr, &addr_len, &sock_type, &protocol)) != 0) {
+                       fprintf (stderr, "Cannot resolve host: %s\n", target_host);
+                       return err;
+               }
 
-       /* 3. Set of the port. */
-       if (server_addr.ss_family == AF_INET) {
-               ((struct sockaddr_in *)&server_addr)->sin_port = htons (port);
-       } else if (server_addr.ss_family == AF_INET6) {
-               ((struct sockaddr_in6 *)&server_addr)->sin6_port = htons (port);
-       }
+               /* 3. Set of the port. */
+               if (server_addr.ss_family == AF_INET) {
+                       ((struct sockaddr_in *)&server_addr)->sin_port = htons (port);
+               } else if (server_addr.ss_family == AF_INET6) {
+                       ((struct sockaddr_in6 *)&server_addr)->sin6_port = htons (port);
+               }
 
-       if (debug_flag)
-               fprintf (stderr, "Connecting to %s via %s:%d\n", node->url, target_host, port);
+               if (debug_flag)
+                       fprintf (stderr, "Connecting to %s via %s:%d\n", node->url, target_host, port);
 
-       /* 4. First connection: HEAD request */
-       *fd = socket (server_addr.ss_family, sock_type, protocol);
-       if (*fd == -1)
-       {
-               perror ("Socket creation error for HEAD request");
-               return errno;
-       }
+               /* 4. First connection: HEAD request */
+               *fd = socket (server_addr.ss_family, sock_type, protocol);
+               if (*fd == -1)
+               {
+                       perror ("Socket creation error for HEAD request");
+                       return errno;
+               }
 
-       if (connect (*fd, (struct sockaddr *)&server_addr, addr_len) == -1) {
-               perror ("Connection to remote host failed");
-               close (*fd);
-               return errno;
-       }
+               if (connect (*fd, (struct sockaddr *)&server_addr, addr_len) == -1) {
+                       perror ("Connection to remote host failed");
+                       close (*fd);
+                       return errno;
+               }
 
-       /* Send a HEAD request find header length */
-       sprintf(buffer,"HEAD %s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s/%s\r\n\r\n",node->conn_req,node->url, HTTPFS_SERVER_NAME, HTTPFS_SERVER_VERSION);
-       towrite = strlen (buffer);
-       written = TEMP_FAILURE_RETRY (write (*fd, buffer, towrite));
-       if ( written == -1 || written < towrite )
-       {
-               fprintf(stderr,"Could not send an HTTP request to host\n");
-               return errno;
-       }
+               /* Send a HEAD request find header length */
+               sprintf(buffer,"HEAD %s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s/%s\r\n\r\n",node->conn_req,node->url, HTTPFS_SERVER_NAME, HTTPFS_SERVER_VERSION);
+               towrite = strlen (buffer);
+               written = TEMP_FAILURE_RETRY (write (*fd, buffer, towrite));
+               if ( written == -1 || written < towrite )
+               {
+                       fprintf(stderr,"Could not send an HTTP request to host\n");
+                       return errno;
+               }
+
+               /* Check HTTP status code and handle other than 200 OK only */
+               err = translate_http_status (*fd, &bytes_read);
+
+               /* Follow a redirect up to max_redirects */
+               if (err == EAGAIN) {
+                       /* Read the HEAD response headers line by line and find Location: string */
+                       char line[1024];
+                       char *new_url = NULL;
+                       ssize_t nheader;
+
+                       while (1) {
+                               size_t i = 0;
+                               char c;
+                               while (i < sizeof (line) - 1) {
+                                       if (read (*fd, &c, 1) <= 0) break;
+                                       line[i++] = c;
+                                       if (c == '\n') break;
+                               }
+
+                               line[i] = '\0';
+
+                               if (line[0] == '\r' || line[0] == '\n' || i == 0) break;
+
+                               if (strncasecmp (line, "Location:", 9) == 0) {
+                                       char *url_start = line + 9;
+                                       while (*url_start == ' ' || *url_start == '\t') url_start++;
+
+                                       char *url_end = strpbrk (url_start, "\r\n");
+                                       if (url_end) *url_end = '\0';
+
+                                       new_url = strdup (url_start);
+                               }
+                       }
+
+                       close (*fd);
+
+                       if (new_url) {
+                               if (strncasecmp (new_url, "https://", 8) == 0) {
+                                       free (new_url);
+                                       return EPROTO;
+                               }
+
+                               char *host = new_url;
+                               if (strncasecmp (new_url, "http://", 7) == 0) host = host + 7;
+
+                               char *slash = strchr (host, '/');
+
+                               if (node->url) free (node->url);
+                               if (node->conn_req) free (node->conn_req);
+
+                               if (slash) {
+                                       node->url = strndup (host, slash - host);
+                                       node->conn_req = strdup (slash);
+                               } else {
+                                       node->url = strdup (host);
+                                       node->conn_req = strdup ("/");
+                               }
+
+                               if (node->comm_buf) free (node->comm_buf);
+                               asprintf (&node->comm_buf, "GET %s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s/%s\r\n\r\n",
+                                               node->conn_req, node->url, HTTPFS_SERVER_NAME, HTTPFS_SERVER_VERSION);
+
+                               free (new_url);
+                               redirects_followed++;
+                               continue;
+                       }
+
+                       return EPROTO;
+               }
+
+               if (err != 0) {
+                       close (*fd);
+                       return err;
+               }
+
+               int n = read (*fd, buffer, sizeof (buffer));
+               if (n >= 0) {
+                       buffer[n] = '\0';
+                       *head_len = bytes_read + n;
+               }
 
-       /* Check HTTP status code and handle other than 200 OK only */
-       if ((err = translate_http_status (*fd, &bytes_read)) != 0)
-       {
-               close (*fd);
-               return err;
-       }
-       
-       int n = read(*fd,buffer,sizeof(buffer));
-       if ( n < 0 ) 
-       {
-               perror ("Failed to read HEAD response");
                close (*fd);
-               return errno;
+               break;
        }
-       buffer[n] = '\0';
-
-       *head_len = bytes_read + n;
-       
-       close(*fd);
        
        /* 5. Second connection: GET request */
        /* Send the GET request for the url */
@@ -288,126 +358,76 @@ error_t fill_dirnode (struct netnode *di
        error_t err = 0;
        struct node *nd, **prevp;
        struct files *go;
-       char *comm_buf,*url,*conn_req,*f_name,*temp,*temp1;
+       char *comm_buf = NULL, *url = NULL, *conn_req = NULL, *f_name = NULL;
 
        if (debug_flag)
                fprintf (stderr, "filling out dir %s\n", dir->file_name);
-       
-       if ( dir->type == HTTP_DIR_NOT_FILLED ) {
-               /* it is an unfilled directory so send a GET request for that
-                * directory and parse the incoming HTML stream to get the file 
-                * and directories within that
-                * and Fill the intermediate data-structure *file */
-               err = parse(dir);
-               if ( err )
-                       return err;
+
+       if (dir->type == HTTP_DIR_NOT_FILLED) {
+               err = parse (dir);
+               if (err) return err;
                dir->type = HTTP_DIR;
        }
 
-       
        dir->noents = TRUE;
        dir->num_ents = 0;
        prevp = &dir->ents;
-       
-       for(go=list_of_entries;go!=NULL;go=go->next)
-       {
-               /* *file linked list contains all the file info obtained from
-                * parsing the <a href="..">
-                * select the ones belonging to this particular directory
-                * and fill its node */
-               
-               if(strcmp(dir->file_name,go->parent)==0)
-               {
-                       /* got a file in this directory 
-                        * directory under consideration is dir->file_name
-                        * so have to fetch all files whose parent is
-                        * dir->file_name, i.e. dir->file_name==go->parent */
-                       
-                       if ( go->f_type == HTTP_URL ) 
-                       {
-                               /* its an url 
-                                * url is shown as regular file 
-                                * its name is altered by changing / to .
-                                * www.gnu.org/gpl.html will be changed to
-                                * www.gnu.org.gpl.html */
-                               char *slash;
-                               conn_req=(char *)malloc((strlen(go->f_name)+8)*sizeof(char));
-                               slash = strchr(go->f_name, '/');
-                               if (slash)
-                                       url = strndup(go->f_name, slash - go->f_name);
-                               else
-                                       url = strdup(go->f_name);
-                               f_name = strdup(go->f_name);
-                               int i;
-                               for (i = 0; f_name[i] != '\0'; i++)
-                                       if (f_name[i] == '/')
-                                               f_name[i] = '.';
-                               
-                               sprintf(conn_req,"%s%s","http://",go->f_name);
-                       }
-                       else 
-                       {       
-                               /* its not an url */
-                               f_name = strdup(go->f_name);
-                               url=strdup(dir->url);
-                               if ( go != list_of_entries )
-                               {
-                                       size_t conn_req_size = strlen(dir->conn_req) + strlen(go->f_name) + 1;
-                                       if( go->f_type==HTTP_DIR || go->f_type==HTTP_DIR_NOT_FILLED )
-                                               conn_req_size++; /* We'll need to add a trailing slash later. */
-                                       conn_req=(char *)malloc(conn_req_size*sizeof(char));
-                                       sprintf(conn_req,"%s%s",dir->conn_req,go->f_name);
-                               }
-                               else
-                               {
-                                       if ( dir_tok[no_of_slashes] == NULL ) 
-                                       {
-                                               /* the file corresponding to base url
-                                                * user has given a file explicitly in
-                                                * the url */
-                                               size_t conn_req_size = strlen(dir->conn_req) + strlen(go->f_name) + 1;
-                                               if( go->f_type==HTTP_DIR || go->f_type==HTTP_DIR_NOT_FILLED )
-                                                       conn_req_size++; /* We'll need to add a trailing slash later. */
-                                               conn_req=(char *)malloc(conn_req_size*sizeof(char));
-                                               sprintf(conn_req,"%s%s",dir->conn_req,go->f_name);
-                                       }
-                                       else 
-                                       {
-                                               /* the file corresponding to base url
-                                                * user has not given a file explicitly 
-                                                * the url so its the index.html */
-                                               size_t conn_req_size = strlen(dir->conn_req) + 1;
-                                               if( go->f_type==HTTP_DIR || go->f_type==HTTP_DIR_NOT_FILLED )
-                                                       conn_req_size++; /* We'll need to add a trailing slash later. */
-                                               conn_req=(char *)malloc(conn_req_size*sizeof(char));
-                                               sprintf(conn_req,"%s",dir->conn_req);
-                                       }
-                               }
-                               if( go->f_type==HTTP_DIR || go->f_type==HTTP_DIR_NOT_FILLED ) 
-                                       /* the filled file is directory so it has to end
-                                        * with a / */
-                                       strcat(conn_req,"/");
-                       }
-                       comm_buf=(char *)malloc((strlen(conn_req)+20)*sizeof(char));
-                       sprintf(comm_buf,"GET %s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s/%s\r\n\r\n",conn_req,url, HTTPFS_SERVER_NAME, HTTPFS_SERVER_VERSION);
 
-                       nd = httpfs_make_node (go->f_type,url,conn_req,comm_buf,f_name);
-                       if (!nd)
-                       {
-                               err = ENOMEM;
-                               return err;
-                       }
-                       free(comm_buf);
-                       free(conn_req);
-                       free(f_name);
-                       *prevp = nd;
-                       nd->prevp = prevp;
-                       prevp = &nd->next;
-                       dir->num_ents++;
-                       if (dir->noents)
-                               dir->noents = FALSE;
-               }
-       }
-       return err;
+       for (go = list_of_entries; go != NULL; go = go->next) {
+        if (strcmp(dir->file_name, go->parent) == 0) {
+            
+           /* Handle URL */
+            if (go->f_type == HTTP_URL) {
+                char *slash = strchr(go->f_name, '/');
+                url = slash ? strndup(go->f_name, slash - go->f_name) : strdup(go->f_name);
+                f_name = strdup(go->f_name);
+                
+                for (int i = 0; f_name[i] != '\0'; i++)
+                    if (f_name[i] == '/') f_name[i] = '.';
+                
+                if (asprintf(&conn_req, "http://%s", go->f_name) < 0) return ENOMEM;
+            } 
+            else {
+               /* Handle Local File/Directories */
+                f_name = strdup(go->f_name);
+                url = strdup(dir->url);
+                
+               /* Build conn_req if it is a root element or a sub-element */
+                if (go != list_of_entries) {
+                    if (asprintf(&conn_req, "%s%s%s", dir->conn_req, go->f_name, 
+                        (go->f_type == HTTP_DIR || go->f_type == HTTP_DIR_NOT_FILLED) ? "/" : "") < 0) 
+                        return ENOMEM;
+                } else {
+                   /* Base URL */
+                    if (asprintf(&conn_req, "%s%s", dir->conn_req, 
+                        (go->f_type == HTTP_DIR || go->f_type == HTTP_DIR_NOT_FILLED) ? "/" : "") < 0) 
+                        return ENOMEM;
+                }
+            }
+
+           /* Build comm_buf safely */
+            if (asprintf(&comm_buf, "GET %s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s/%s\r\n\r\n",
+                         conn_req, url, HTTPFS_SERVER_NAME, HTTPFS_SERVER_VERSION) < 0) {
+                return ENOMEM;
+            }
+
+            nd = httpfs_make_node (go->f_type, url, conn_req, comm_buf, f_name);
+            
+           /* Final cleanup */
+            free(comm_buf); comm_buf = NULL;
+            free(conn_req); conn_req = NULL;
+            free(url);      url = NULL;
+            free(f_name);   f_name = NULL;
+
+            if (!nd) return ENOMEM;
+
+            *prevp = nd;
+            nd->prevp = prevp;
+            prevp = &nd->next;
+            dir->num_ents++;
+            dir->noents = FALSE;
+        }
+    }
+    return 0;
 }

Reply via email to