Hello,
I don't see the point of going into argz-based URL, what benefit will we
get? It makes operations more expensive everywhere.
Apart from that argz-rewrite, I see a lot of other, unrelated rewrites,
without an explanation. I cannot review that. We really need separate
commits, one per step that compiles & works with its own rationale.
Of course, preparatory patches can make sense through the patches that
come after, but here I'm really clueless as to what the benefit will be.
Gianluca Cannata, le mar. 13 janv. 2026 15:46:55 +0100, a écrit:
> @@ -117,7 +118,7 @@ void extract(char *_string,char *parent)
> strcpy(temp,strchr(temp,'/'));
> temp++;
> }
> - if ( no_of_slashes_here < no_of_slashes )
> + if ( no_of_slashes_here < argz_count (dir_tok, dir_tok_len) )
This would make us re-parse dir_tok every time. Better continue to keep
counting no_of_slashes in main, and use it.
> {
> /* not going to support top level directory
> * from a given directory
> @@ -137,7 +138,12 @@ void extract(char *_string,char *parent)
> }
>
> token = strdupa(string);
> - for ( i=0 ; i<no_of_slashes-1 ; i++ )
> +
> + int base_depth = argz_count (dir_tok, dir_tok_len);
> + char *current_dir_tok_part = dir_tok;
> +
> +
> + for ( i=0 ; i<(base_depth-1) ; i++ )
> {
> /* extract file and directory from the string
> * like /file/temp/a.html
> @@ -145,11 +151,19 @@ void extract(char *_string,char *parent)
> * extract temp fill it as a directory under file/
> * extract a.html fill it as a file under temp/ */
>
> - temp = strdupa(token);
> - if ( strcmp(dir_tok[i],strtok(temp,"/")) )
> - return;
> - strcpy(token,strchr(token,'/'));
> - token++;
> + temp = strdupa(token);
> + char *current_token = strtok (temp, "/");
> +
> + if (current_dir_tok_part && current_token)
> + if (strcmp (current_dir_tok_part,
> current_token) != 0)
> + return;
> +
> + char *next_slash = strchr (token, '/');
> + if (next_slash)
> + token = next_slash + 1;
> +
> + current_dir_tok_part = argz_next (dir_tok, dir_tok_len,
> current_dir_tok_part);
Again, that would make us re-parse dir_tok each time.
> +
> }
> parent = strdupa("tmp");
> string = strdupa(token);
> Index: httpfs/http.c
> ===================================================================
> --- httpfs.orig/http.c
> +++ httpfs/http.c
> @@ -28,12 +28,24 @@
> #include <string.h>
> #include <malloc.h>
> #include <stddef.h>
> +#include <argz.h>
>
> #include "httpfs.h"
>
> #include <hurd/hurd_types.h>
> #include <hurd/netfs.h>
>
> +/* Extracts the relative path. If it receives "/" returns "/" */
> +static const char *get_safe_path(const char *req) {
> + if (!req || strlen(req) == 0) return "/";
> + const char *p = strstr(req, "://");
> + if (p) {
> + p = strchr(p + 3, '/');
> + return p ? p : "/";
> + }
> + return (req[0] == '/') ? req : "/";
> +}
> +
> /* do a DNS lookup for NAME and store result in *ENT */
> error_t lookup_host (char *url, struct hostent **ent)
> {
> @@ -63,6 +75,8 @@ error_t open_connection(struct netnode *
> char delimiters0[] = " ";
> char delimiters1[] = "\n";
>
> + const char *relative_path = get_safe_path (node->conn_req);
> +
> bzero(&dest,sizeof(dest));
> dest.sin_family = AF_INET;
> dest.sin_port = htons (port);
> @@ -71,7 +85,7 @@ error_t open_connection(struct netnode *
> {
> /* connection is not through a proxy server
> * find IP addr. of remote server */
> - err = lookup_host (node->url, &hptr);
> + err = lookup_host (dir_tok, &hptr);
> if (err)
> {
> fprintf(stderr,"Could not find IP addr %s\n",node->url);
> @@ -92,8 +106,8 @@ error_t open_connection(struct netnode *
> }
>
> if (debug_flag)
> - fprintf (stderr, "trying to open %s:%d/%s\n", node->url,
> - port, node->conn_req);
> + fprintf (stderr, "trying to open %s:%d%s\n", dir_tok,
> + port, relative_path);
>
> *fd = socket (AF_INET, SOCK_STREAM, 0);
> if (*fd == -1)
> @@ -110,7 +124,7 @@ error_t open_connection(struct netnode *
> }
>
> /* Send a HEAD request find header length */
> - sprintf(buffer,"HEAD %s HTTP/1.0\n\n",node->conn_req);
> + sprintf(buffer,"HEAD %s HTTP/1.0\n\n", relative_path);
> towrite = strlen (buffer);
> written = TEMP_FAILURE_RETRY (write (*fd, buffer, towrite));
> if ( written == -1 || written < towrite )
> @@ -177,19 +191,17 @@ error_t fill_dirnode (struct netnode *di
> error_t err = 0;
> struct node *nd, **prevp;
> struct files *go;
> - char *comm_buf,*url,*conn_req,*f_name,*temp,*temp1;
>
> if (debug_flag)
> fprintf (stderr, "filling out dir %s\n", dir->file_name);
>
> if ( dir->type == HTTP_DIR_NOT_FILLED ) {
> - /* it is an unfilled directory so send a GET request for that
> - * directory and parse the incoming HTML stream to get the file
> - * and directories within that
> - * and Fill the intermediate data-structure *file */
> err = parse(dir);
> - if ( err )
> - return err;
> + if ( err ) {
> + fprintf (stderr, "httpfs: Parse error for %s\n",
> dir->file_name);
> + dir->type = HTTP_DIR;
> + return 0;
> + }
> dir->type = HTTP_DIR;
> }
>
> @@ -200,103 +212,51 @@ error_t fill_dirnode (struct netnode *di
>
> for(go=list_of_entries;go!=NULL;go=go->next)
> {
> - /* *file linked list contains all the file info obtained from
> - * parsing the <a href="..">
> - * select the ones belonging to this particular directory
> - * and fill its node */
> -
> - if(strcmp(dir->file_name,go->parent)==0)
> - {
> - /* got a file in this directory
> - * directory under consideration is dir->file_name
> - * so have to fetch all files whose parent is
> - * dir->file_name, i.e. dir->file_name==go->parent */
> -
> - if ( go->f_type == HTTP_URL )
> - {
> - /* its an url
> - * url is shown as regular file
> - * its name is altered by changing / to .
> - * www.gnu.org/gpl.html will be changed to
> - * www.gnu.org.gpl.html */
> - char *slash;
> - conn_req=(char
> *)malloc((strlen(go->f_name)+8)*sizeof(char));
> - slash = strchr(go->f_name, '/');
> - if (slash)
> - url = strndup(go->f_name, slash -
> go->f_name);
> + /* We filter the host itself to avoid infinite recursions */
> + if (strcmp (go->f_name, dir_tok) == 0) continue;
> +
> + if (strcmp (dir->file_name, go->parent) == 0) {
> + char *f_name = strdup (go->f_name);
> + char *url = strdup (dir_tok);
> + char *conn_req;
> +
> + /* Making of conn_req (the complete path for the server
> ) */
> + if (go->f_type == HTTP_URL) {
> + asprintf (&conn_req, "http://%s", go->f_name);
> + } else {
> + /* Concat the path of the father with the name
> of the file */
> + const char *p_path = dir->conn_req;
> + int p_len = strlen (p_path);
> +
> + if (p_path[p_len - 1] == '/')
> + asprintf (&conn_req, "%s%s", p_path,
> go->f_name);
> else
> - url = strdup(go->f_name);
> - f_name = strdup(go->f_name);
> - int i;
> - for (i = 0; f_name[i] != '\0'; i++)
> - if (f_name[i] == '/')
> - f_name[i] = '.';
> -
> - sprintf(conn_req,"%s%s","http://",go->f_name);
> + asprintf (&conn_req, "%s/%s", p_path,
> go->f_name);
> }
> - else
> - {
> - /* its not an url */
> - f_name = strdup(go->f_name);
> - url=strdup(dir->url);
> - if ( go != list_of_entries )
> - {
> - size_t conn_req_size =
> strlen(dir->conn_req) + strlen(go->f_name) + 1;
> - if( go->f_type==HTTP_DIR ||
> go->f_type==HTTP_DIR_NOT_FILLED )
> - conn_req_size++; /* We'll need
> to add a trailing slash later. */
> - conn_req=(char
> *)malloc(conn_req_size*sizeof(char));
> -
> sprintf(conn_req,"%s%s",dir->conn_req,go->f_name);
> - }
> - else
> - {
> - if ( dir_tok[no_of_slashes] == NULL )
> - {
> - /* the file corresponding to
> base url
> - * user has given a file
> explicitly in
> - * the url */
> - size_t conn_req_size =
> strlen(dir->conn_req) + strlen(go->f_name) + 1;
> - if( go->f_type==HTTP_DIR ||
> go->f_type==HTTP_DIR_NOT_FILLED )
> - conn_req_size++; /*
> We'll need to add a trailing slash later. */
> - conn_req=(char
> *)malloc(conn_req_size*sizeof(char));
> -
> sprintf(conn_req,"%s%s",dir->conn_req,go->f_name);
> - }
> - else
> - {
> - /* the file corresponding to
> base url
> - * user has not given a file
> explicitly
> - * the url so its the
> index.html */
> - size_t conn_req_size =
> strlen(dir->conn_req) + 1;
> - if( go->f_type==HTTP_DIR ||
> go->f_type==HTTP_DIR_NOT_FILLED )
> - conn_req_size++; /*
> We'll need to add a trailing slash later. */
> - conn_req=(char
> *)malloc(conn_req_size*sizeof(char));
> -
> sprintf(conn_req,"%s",dir->conn_req);
> - }
> +
> + /* If it is a directory, assure it ends up with a slash
> */
> + if (go->f_type == HTTP_DIR || go->f_type ==
> HTTP_DIR_NOT_FILLED) {
> + if (conn_req[strlen (conn_req) - 1] != '/') {
> + char *tmp;
> + asprintf (&tmp, "%s/", conn_req);
> + free (conn_req);
> + conn_req = tmp;
> }
> - if( go->f_type==HTTP_DIR ||
> go->f_type==HTTP_DIR_NOT_FILLED )
> - /* the filled file is directory so it
> has to end
> - * with a / */
> - strcat(conn_req,"/");
> }
> - comm_buf=(char
> *)malloc((strlen(conn_req)+20)*sizeof(char));
> - sprintf(comm_buf,"GET %s HTTP/1.0",conn_req);
>
> - nd = httpfs_make_node
> (go->f_type,url,conn_req,comm_buf,f_name);
> - if (!nd)
> - {
> - err = ENOMEM;
> - return err;
> - }
> - free(comm_buf);
> - free(conn_req);
> - free(f_name);
> - *prevp = nd;
> - nd->prevp = prevp;
> - prevp = &nd->next;
> - dir->num_ents++;
> - if (dir->noents)
> + nd = httpfs_make_node (go->f_type, url, conn_req, "",
> f_name);
> + if (nd) {
> + *prevp = nd;
> + nd->prevp = prevp;
> + prevp = &nd->next;
> + dir->num_ents++;
> dir->noents = FALSE;
> + }
> +
> + free (conn_req); free (url); free (f_name);
> }
> }
> - return err;
> +
> + return 0;
> }
>
> Index: httpfs/httpfs.c
> ===================================================================
> --- httpfs.orig/httpfs.c
> +++ httpfs/httpfs.c
> @@ -24,6 +24,7 @@
> #include <errno.h>
> #include <error.h>
> #include <argp.h>
> +#include <argz.h>
>
> #include <hurd/netfs.h>
>
> @@ -36,7 +37,8 @@ int mode;
> int no_of_slashes = 0;
> char *url, *conn_req;
> char *ip_addr;
> -char *dir_tok[25];
> +char *dir_tok = NULL;
The [25] was bad indeed, but you can make it
char **dir_tok, and in main, make a first pass to count how many items
we want, then allocate, then parse again to fill dir_tok. Otherwise
it'll be really more expensive all the time to go through it.
> +size_t dir_tok_len = 0;
> struct files *list_of_entries = NULL, *this_entry;
>
> struct httpfs *httpfs; /* filesystem global pointer */
> @@ -50,9 +52,12 @@ main (int argc, char **argv)
> {
> error_t err;
> mach_port_t bootstrap;
> - char *temp_url, *temp, *run;
> + char *temp_url, *clean_url;
> + char *host_name = NULL;
> char type;
> char *comm_buf; /* XXX: Is an http request limited to 200 bytes? */
> +
> + /* Defaults */
> port = 80;
> debug_flag = 0;
> mode = 1; /* means directory */
> @@ -70,77 +75,60 @@ main (int argc, char **argv)
> if (err)
> error (1, 0, "Map time error.");
>
> - if (strchr (url, '/') == NULL)
> - error (1, 0, "Url must have a /, e.g., www.gnu.org/");
> + extern char *url;
? There is no point in declaring it again, it's already defined above.
>
> - conn_req = (char *) malloc ((strlen (url) + 7) * sizeof (char));
> - if (! conn_req)
> - error (1, errno, "Cannot malloc conn_req.");
> -
> - temp_url = strdup (url);
> - if (! temp_url)
> - error (1, errno, "Cannot duplicate url.");
> -
> - if (!strncmp (temp_url, "http://", 7))
> - /* go ahead of http:// if given in url */
> - temp_url = temp_url + 7;
> -
> - if (strchr (temp_url, '/') == NULL)
> - error (1, 0, "Url must have a /, e.g., www.gnu.org/");
> -
> - /* XXX: strtok is not reentrant. This will have to be fixed */
> - temp = strdup (temp_url);
> - url = strtok (temp, "/");
> -
> - /* Find the directories given in URL */
> - temp = strdup (temp_url);
> - no_of_slashes++;
> - strcpy (temp, strchr (temp, '/'));
> - temp++;
> - while (strchr (temp, '/') != NULL)
> - {
> - /* go to the end of url */
> - run = strdup (temp);
> - dir_tok[no_of_slashes - 1] = strtok (run, "/");
> - strcpy (temp, strchr (temp, '/'));
> - temp++;
> - no_of_slashes++;
> - }
> - if (strlen (temp))
> - {
> - /* user has input a specific html file in the url */
> - dir_tok[no_of_slashes - 1] = strdup (temp);
> - dir_tok[no_of_slashes] = NULL;
> - }
> - else
> - {
> - /* user has input just an url no file names specifed
> - * assume the base url request is to index.html */
> - dir_tok[no_of_slashes - 1] = strdup ("index.html");
> - dir_tok[no_of_slashes] = strdup ("index.html");
> - }
> + if (url == NULL || strlen (url) == 0)
> + error (1, 0, "URL must not be empty.");
> +
> + /* Remove http:// if present */
> + clean_url = url;
> + if (strncasecmp (clean_url, "http://", 7) == 0)
> + clean_url = clean_url + 7;
> +
> + /* Directory hierarchy creation with argz. This substitute strtok. */
> + /* It breaks the string 'clean_url' every time it finds a '/' character. */
> + if (argz_create_sep (clean_url, '/', &dir_tok, &dir_tok_len) != 0)
> + error (1, errno, "Cannot create directory hierarchy from parsing the
> URL.");
But then it makes it expensive to go through dir_tok, we don't want
that.
> + /* Extracts hostname */
> + host_name = dir_tok;
> +
> + if (!host_name)
> + error (1, 0, "Invalid URL: No hostname found.");
> +
> + /* Build the complete URL for the GET request by iterating the argz vector
> */
> + /* e.g. conn_req: "http://host/path/file" */
> + size_t total_len = 7 + 1; /* "http://" + null terminator */
> + char *entry = NULL;
>
> + /* Calculate the require total length first */
> + while ((entry = argz_next (dir_tok, dir_tok_len, entry)))
> + total_len = total_len + strlen (entry) + 1; /* +1 is for the slash */
> +
> + conn_req = (char *) malloc (total_len);
> + if (!conn_req)
> + error (1, errno, "Cannot allocate connection request string.");
> +
> + /* Build our string */
> strcpy (conn_req, "http://");
> - if (temp_url[strlen (temp_url) - 1] == '/')
> - {
> - strcat (conn_req, temp_url);
> - err = asprintf (&comm_buf, "GET %s HTTP/1.0", conn_req);
> - }
> - else
> - {
> - while (strchr (temp_url, '/') != NULL)
> - {
> - temp = strdup (temp_url);
> - strcat (conn_req, strtok (temp, "/"));
> - strcat (conn_req, "/");
> - strcpy (temp_url, strchr (temp_url, '/'));
> - temp_url++;
> - }
> - err = asprintf (&comm_buf, "GET %s%s HTTP/1.0", conn_req, temp_url);
> - }
> - if (err < 0) /* check the return value of asprintf */
> - error (1, errno, "Cannot allocate comm_buf.");
>
> + entry = NULL;
> + int first = 1;
> + while ((entry = argz_next (dir_tok, dir_tok_len, entry))) {
> + strcat (conn_req, entry);
> + /* Add the slash if not the last element, or if it is the host */
> + if (entry < (dir_tok + dir_tok_len - strlen (entry) - 1) || first)
> + strcat (conn_req, "/");
> +
> + first = 0;
> + }
> +
> + /* Creation of GET request buffer */
> + /* TODO: For monder HTTP, we should add here "Host: %s\r\n" */
> + if (asprintf (&comm_buf, "GET %s HTTP/1.0\r\n\r\n", conn_req) < 0)
> + error (1, errno, "Cannot allocate command request string.");
> +
> + /* Initialize the filesystem */
> httpfs = (struct httpfs *) malloc (sizeof (struct httpfs));
> if (! httpfs)
> error (1, errno, "Cannot allocate httpfs.");
> @@ -149,26 +137,30 @@ main (int argc, char **argv)
> httpfs->uid = getuid ();
> httpfs->gid = getgid ();
> httpfs->next_inode = 0;
> +
> if (mode)
> type = HTTP_DIR;
> else
> type = HTTP_FILE;
>
> + /* Create root node */
> /* XXX: why is tmp hardcoded? */
> httpfs->root = httpfs_make_node (type, url, conn_req, comm_buf, "tmp");
> +
> netfs_init ();
> - /* translator set to a directory */
> +
> + /* If a directory, populates the contents. */
> if (mode)
> {
> - /* fill the directory node with files
> - * call parser for that
> + /* fill the directory node with files
> + * call parser for that
> * only the current directory is filled
> * subdirectories within them are indicated by type
> * HTTP_DIR_UNFILLED, and are filled as on demand when an
> * ls request comes for them */
> err = parse (httpfs->root->nn);
> if (err)
> - error (1, err, "Error in Parsing.");
> + error (1, err, "Error in Parsing.");
> }
>
> if (debug_flag)
> @@ -176,10 +168,15 @@ main (int argc, char **argv)
>
> netfs_root_node = httpfs->root;
> netfs_startup (bootstrap, 0);
> +
> for (;;)
> netfs_server_loop ();
>
> /* NOT REACHED */
> + free (conn_req);
> + free (dir_tok);
> + free (comm_buf);
> free (httpfs);
> +
> return 0;
> }
> Index: httpfs/parsehtml.c
> ===================================================================
> --- httpfs.orig/parsehtml.c
> +++ httpfs/parsehtml.c
> @@ -27,6 +27,7 @@
> #include <libxml/HTMLparser.h>
> #include <libxml/HTMLtree.h>
> #include <libxml/SAX.h>
> +#include <argz.h>
>
> #include "httpfs.h"
>
> @@ -110,12 +111,18 @@ error_t parse(struct netnode *node)
> if ( debug_flag )
> fprintf(stderr,"In the HTML parser for parsing %s\n",parent);
>
> + char *last_component = dir_tok;
> + char *entry_ptr = NULL;
> +
> + while ((entry_ptr = argz_next (dir_tok, dir_tok_len, entry_ptr)))
> + last_component = entry_ptr;
> +
> /* Create a file for base url */
> if ( list_of_entries == NULL )
> {
> /* The node of the url entered */
> list_of_entries = (struct files *)malloc(sizeof(struct files));
> - list_of_entries->f_name = strdup(dir_tok[no_of_slashes-1]);
> + list_of_entries->f_name = strdup(last_component);
> list_of_entries->parent = strdup("tmp");
> list_of_entries->f_size = 0;//content_len;
> list_of_entries->f_type = HTTP_FILE;
> Index: httpfs/httpfs.h
> ===================================================================
> --- httpfs.orig/httpfs.h
> +++ httpfs/httpfs.h
> @@ -47,10 +47,8 @@ extern int mode;
> * only contents of temp/ and its subdirectories can be supported */
> extern int no_of_slashes;
>
> -/* if the url points to particular file explicitly given store here
> - * else assume it to be index.html
> - * like www.gnu.org/gpl.html and www.gnu.org/ no file given so index.html */
> -extern char *dir_tok[25];
> +extern char *dir_tok;
> +extern size_t dir_tok_len;
>
>
> /* handle all initial parameter parsing */
Samuel