Hello Samuel and the bug-hurd team,

As requested, I am submitting my changes in separate, logical patches.

This is the first patch of the series. It refactors the initial URL
parsing logic to use the argz library.

Instead of a fixed-size array of strtok()'d components, the base URL
is now stored as a single argz vector.

This change provides a more idiomatic Hurd implementation and creates
the necessary infrastructure for the upcoming patches, which will
improve directory traversal and path normalization by iterating over
these argz components.
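
For reviewers less familiar with argz, here is a minimal standalone
sketch (not part of the patch) of the pattern the translator now
relies on; the URL string is just illustrative:

#include <argz.h>
#include <stdio.h>
#include <stdlib.h>

int
main (void)
{
  const char *clean_url = "www.gnu.org/software/hurd"; /* illustrative only */
  char *vec = NULL;
  size_t vec_len = 0;

  /* Split on '/' into a NUL-separated vector: "www.gnu.org\0software\0hurd\0" */
  if (argz_create_sep (clean_url, '/', &vec, &vec_len) != 0)
    {
      perror ("argz_create_sep");
      return 1;
    }

  printf ("%zu components:\n", argz_count (vec, vec_len));

  /* argz_next walks the components in order and returns NULL after the last */
  for (char *entry = argz_next (vec, vec_len, NULL);
       entry != NULL;
       entry = argz_next (vec, vec_len, entry))
    printf ("  %s\n", entry);

  free (vec);
  return 0;
}

The patch below applies this same pattern when splitting the URL and
when rebuilding the string used for the GET request.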

The code compiles cleanly and prepares the translator for the
subsequent path-handling fixes.

Best regards,

Gianluca
Index: httpfs.c
===================================================================
RCS file: /sources/hurdextras/httpfs/httpfs.c,v
retrieving revision 1.3
diff -u -r1.3 httpfs.c
--- httpfs.c    14 Jan 2013 10:47:05 -0000      1.3
+++ httpfs.c    13 Jan 2026 13:45:07 -0000
@@ -24,6 +24,7 @@
 #include <errno.h>
 #include <error.h>
 #include <argp.h>
+#include <argz.h>
 
 #include <hurd/netfs.h>
 
@@ -36,7 +37,8 @@
 int no_of_slashes = 0;
 char *url, *conn_req;
 char *ip_addr;
-char *dir_tok[25];
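+/* Components of the base URL, stored as an argz vector (see httpfs.h) */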
+char *dir_tok = NULL;
+size_t dir_tok_len = 0;
 struct files *list_of_entries = NULL, *this_entry;
 
 struct httpfs *httpfs;         /* filesystem global pointer */
@@ -50,9 +52,12 @@
 {
   error_t err;
   mach_port_t bootstrap;
-  char *temp_url, *temp, *run;
+  char *temp_url, *clean_url;
+  char *host_name = NULL;
   char type;
   char *comm_buf; /* XXX: Is an http request limited to 200 bytes? */
+
+  /* Defaults */
   port = 80;
   debug_flag = 0;
   mode = 1;                    /* means directory */
@@ -70,77 +75,60 @@
   if (err)
     error (1, 0, "Map time error.");
 
-  if (strchr (url, '/') == NULL)
-    error (1, 0, "Url must have a /, e.g., www.gnu.org/");
+  extern char *url;
 
-  conn_req = (char *) malloc ((strlen (url) + 7) * sizeof (char));
-  if (! conn_req)
-    error (1, errno, "Cannot malloc conn_req.");
-
-  temp_url = strdup (url);
-  if (! temp_url)
-    error (1, errno, "Cannot duplicate url.");
-
-  if (!strncmp (temp_url, "http://", 7))
-    /* go ahead of http:// if given in url */
-    temp_url = temp_url + 7;
-
-  if (strchr (temp_url, '/') == NULL)
-    error (1, 0, "Url must have a /, e.g., www.gnu.org/");
-
-  /* XXX: strtok is not reentrant.  This will have to be fixed */
-  temp = strdup (temp_url);
-  url = strtok (temp, "/");
-
-  /* Find the directories given in URL */
-  temp = strdup (temp_url);
-  no_of_slashes++;
-  strcpy (temp, strchr (temp, '/'));
-  temp++;
-  while (strchr (temp, '/') != NULL)
-    {
-      /* go to the end of url */
-      run = strdup (temp);
-      dir_tok[no_of_slashes - 1] = strtok (run, "/");
-      strcpy (temp, strchr (temp, '/'));
-      temp++;
-      no_of_slashes++;
-    }
-  if (strlen (temp))
-    {
-      /* user has input a specific html file in the url */
-      dir_tok[no_of_slashes - 1] = strdup (temp);
-      dir_tok[no_of_slashes] = NULL;
-    }
-  else
-    {
-      /* user has input just an url no file names specifed 
-       * assume the base url request is to index.html */
-      dir_tok[no_of_slashes - 1] = strdup ("index.html");
-      dir_tok[no_of_slashes] = strdup ("index.html");
-    }
+  if (url == NULL || strlen (url) == 0)
+    error (1, 0, "URL must not be empty.");
+
+  /* Remove the http:// prefix if present */
+  clean_url = url;
+  if (strncasecmp (clean_url, "http://", 7) == 0)
+    clean_url = clean_url + 7;
+
+  /* Build the directory hierarchy with argz; this replaces strtok.
+   * The string 'clean_url' is split at every '/' character. */
+  if (argz_create_sep (clean_url, '/', &dir_tok, &dir_tok_len) != 0)
+    error (1, errno, "Cannot create directory hierarchy from parsing the URL.");
+
+  /* The first argz component is the hostname */
+  host_name = dir_tok;
+
+  if (!host_name)
+    error (1, 0, "Invalid URL: No hostname found.");
+
+  /* Build the complete URL for the GET request by iterating the argz vector */
+  /* e.g. conn_req: "http://host/path/file" */
+  size_t total_len = 7 + 1; /* "http://" + null terminator */
+  char *entry = NULL;
 
+  /* Calculate the required total length first */
+  while ((entry = argz_next (dir_tok, dir_tok_len, entry)))
+    total_len = total_len + strlen (entry) + 1; /* +1 is for the slash */
+
+  conn_req = (char *) malloc (total_len);
+  if (!conn_req)
+    error (1, errno, "Cannot allocate connection request string.");
+
+  /* Build our string */
   strcpy (conn_req, "http://");
-  if (temp_url[strlen (temp_url) - 1] == '/')
-    {
-      strcat (conn_req, temp_url);
-      err = asprintf (&comm_buf, "GET %s HTTP/1.0", conn_req);
-    }
-  else
-    {
-      while (strchr (temp_url, '/') != NULL)
-       {
-         temp = strdup (temp_url);
-         strcat (conn_req, strtok (temp, "/"));
-         strcat (conn_req, "/");
-         strcpy (temp_url, strchr (temp_url, '/'));
-         temp_url++;
-       }
-      err = asprintf (&comm_buf, "GET %s%s HTTP/1.0", conn_req, temp_url);
-    }
-  if (err < 0)  /* check the return value of asprintf */
-    error (1, errno, "Cannot allocate comm_buf.");
 
+  entry = NULL;
+  int first = 1;
+  while ((entry = argz_next (dir_tok, dir_tok_len, entry)))
+    {
+      strcat (conn_req, entry);
+      /* Append '/' after the hostname and after every non-final component */
+      if (entry < (dir_tok + dir_tok_len - strlen (entry) - 1) || first)
+        strcat (conn_req, "/");
+      first = 0;
+    }
+
+  /* Build the GET request buffer */
+  /* TODO: for modern HTTP, a "Host: %s\r\n" header should be added here */
+  if (asprintf (&comm_buf, "GET %s HTTP/1.0\r\n\r\n", conn_req) < 0)
+    error (1, errno, "Cannot allocate command request string.");
+
+  /* Initialize the filesystem */
   httpfs = (struct httpfs *) malloc (sizeof (struct httpfs));
   if (! httpfs)
     error (1, errno, "Cannot allocate httpfs.");
@@ -149,26 +137,30 @@
   httpfs->uid = getuid ();
   httpfs->gid = getgid ();
   httpfs->next_inode = 0;
+
   if (mode)
     type = HTTP_DIR;
   else
     type = HTTP_FILE;
 
+  /* Create root node */
   /* XXX: why is tmp hardcoded? */
   httpfs->root = httpfs_make_node (type, url, conn_req, comm_buf, "tmp");
+
   netfs_init ();
-  /* translator set to a directory */
+
+  /* If a directory, populate its contents. */
   if (mode)
     {
-      /* fill the directory node with files 
-       * call parser for that 
+      /* fill the directory node with files
+       * call parser for that
        * only the current directory is filled
        * subdirectories within them are indicated by type
        * HTTP_DIR_UNFILLED, and are filled as on demand when an
        * ls request comes for them */
       err = parse (httpfs->root->nn);
       if (err)
-       error (1, err, "Error in Parsing.");
+        error (1, err, "Error in Parsing.");
     }
 
   if (debug_flag)
@@ -176,10 +168,15 @@
 
   netfs_root_node = httpfs->root;
   netfs_startup (bootstrap, 0);
+
   for (;;)
     netfs_server_loop ();
 
   /* NOT REACHED */
+  free (conn_req);
+  free (dir_tok);
+  free (comm_buf);
   free (httpfs);
+
   return 0;
 }
Index: httpfs.h
===================================================================
RCS file: /sources/hurdextras/httpfs/httpfs.h,v
retrieving revision 1.2
diff -u -r1.2 httpfs.h
--- httpfs.h    14 Jan 2013 10:47:05 -0000      1.2
+++ httpfs.h    13 Jan 2026 13:45:07 -0000
@@ -47,11 +47,17 @@
  * only contents of temp/ and its subdirectories can be supported */
 extern int no_of_slashes;
 
-/* if the url points to particular file explicitly given store here 
- * else assume it to be index.html 
- * like www.gnu.org/gpl.html and www.gnu.org/  no file given so index.html */
-extern char *dir_tok[25];
-
+/* dir_tok points to a vector of strings in the GNU argz format.
+ * It contains the components of the original URL, separated by null bytes ('\0').
+ * E.g. if the URL is http://gnu.org/software/hurd, the vector will contain:
+ * ["gnu.org", "software", "hurd"]
+ * It is used to navigate the directory structure of the remote web server. */
+extern char *dir_tok;
+
+/* dir_tok_len stores the total length in bytes of the vector dir_tok.
+ * It is necessary for functions like argz_next() and argz_count()
+ * to determine the boundaries of the allocated memory. */
+extern size_t dir_tok_len;
 
 /* handle all initial parameter parsing */
 error_t httpfs_parse_args (int argc, char **argv);
Index: parsehtml.c
===================================================================
RCS file: /sources/hurdextras/httpfs/parsehtml.c,v
retrieving revision 1.2
diff -u -r1.2 parsehtml.c
--- parsehtml.c 27 Jan 2013 23:14:23 -0000      1.2
+++ parsehtml.c 13 Jan 2026 13:45:07 -0000
@@ -27,6 +27,7 @@
 #include <libxml/HTMLparser.h>
 #include <libxml/HTMLtree.h>
 #include <libxml/SAX.h>
+#include <argz.h>
 
 #include "httpfs.h"
 
@@ -110,12 +111,18 @@
        if ( debug_flag )
                fprintf(stderr,"In the HTML parser for parsing %s\n",parent);
        
+       /* Find the last URL component (the file name of the base entry) */
+       char *last_component = dir_tok;
+       char *entry_ptr = NULL;
+
+       while ((entry_ptr = argz_next (dir_tok, dir_tok_len, entry_ptr)))
+               last_component = entry_ptr;
+
        /* Create a file for base url */
        if ( list_of_entries == NULL )
        {
                /* The node of the url entered */
                list_of_entries = (struct files *)malloc(sizeof(struct files));
-               list_of_entries->f_name = strdup(dir_tok[no_of_slashes-1]);
+               list_of_entries->f_name = strdup(last_component);
                list_of_entries->parent = strdup("tmp");
                list_of_entries->f_size = 0;//content_len;
                list_of_entries->f_type = HTTP_FILE;
Index: extract.c
===================================================================
RCS file: /sources/hurdextras/httpfs/extract.c,v
retrieving revision 1.2
diff -u -r1.2 extract.c
--- extract.c   27 Jan 2013 23:16:14 -0000      1.2
+++ extract.c   13 Jan 2026 13:45:07 -0000
@@ -24,6 +24,7 @@
 #include<netdb.h>
 #include<stdlib.h>
 #include<unistd.h>
+#include<argz.h>
 
 #include"httpfs.h"
 
@@ -117,7 +118,7 @@
                        strcpy(temp,strchr(temp,'/'));
                        temp++;
                }
-               if ( no_of_slashes_here < no_of_slashes ) 
+               if ( no_of_slashes_here < argz_count (dir_tok, dir_tok_len) ) 
                {
                        /* not going to support top level directory
                         * from a given directory
@@ -137,7 +138,12 @@
                }
 
                token = strdupa(string);
-               for ( i=0 ; i<no_of_slashes-1 ; i++ ) 
+
+               /* Walk the base URL argz components in step with the path tokens */
+               int base_depth = argz_count (dir_tok, dir_tok_len);
+               char *current_dir_tok_part = dir_tok;
+
+               for ( i=0 ; i<(base_depth-1) ; i++ ) 
                {
                        /* extract file and directory from the string
                         * like /file/temp/a.html
@@ -145,11 +151,19 @@
                         * extract temp fill it as a directory under file/
                         * extract a.html fill it as a file under temp/ */
                        
-                       temp = strdupa(token); 
-                       if ( strcmp(dir_tok[i],strtok(temp,"/")) )
-                               return;
-                       strcpy(token,strchr(token,'/'));
-                       token++;
+                       temp = strdupa(token);
+                       char *current_token = strtok (temp, "/");
+
+                       if (current_dir_tok_part && current_token)
+                               if (strcmp (current_dir_tok_part, current_token) != 0)
+                                       return;
+
+                       char *next_slash = strchr (token, '/');
+                       if (next_slash)
+                               token = next_slash + 1;
+
+                       current_dir_tok_part = argz_next (dir_tok, dir_tok_len, current_dir_tok_part);
+
                }
                parent = strdupa("tmp");
                string = strdupa(token);
