@Padraig:
I'll be away from the keyboard for the next 3 weeks, so please continue
with this one and finally push it.


On 07/25/2016 01:21 PM, Philipp Thomas wrote:
> * Pádraig Brady ([email protected]) [20160725 01:20]:
> 
>>> otherwise it looks good to me.
>>
>> Excellent. +1
> 
> I'll pass on your kind words to my colleague.
> 
>> Worth a mention in improvements in NEWS too.
> 
> Like so?
> 
> +  Df now uses a hash table to filter the mount list. This sped up
> +  processing of a 22000 lines mount list by nearly 60%.
> 
> If yes, and if Berny explains to me why we'd need an additional hash_free
> call, I'll post the changed patch.

Thanks.  I wrapped this into a patch with a nice commit message.
I also squashed in the renaming of the 'devlist' variable, as so
many "devlists" were too confusing.

I see this hashing as a starting point, as other functions also need
a better lookup ... and to avoid superfluous stat/statfs calls (see
e.g. "strace -e stat,statfs df -a").

BTW: surprisingly, one can get a HUGE /proc/self/mountinfo very easily:

  # !!! DO NOT DO THIS (with "seq 25") !!!
  mkdir /root/tmp &&
  for f in $(seq 25); do \
    mkdir /root/tmp/$f \
      && mount --bind / /root/tmp/$f
  done

To get 20,000-30,000 entries, you only need ~"seq 16".

Have a nice day,
Berny
From 93ad559870eec2d827c8f1d34b53c4f2a512b245 Mon Sep 17 00:00:00 2001
From: Philipp Thomas <[email protected]>
Date: Sun, 31 Jul 2016 21:24:18 +0200
Subject: [PATCH] df: improve performance with many mount points

Use a hash table for searching in filter_mount_list()
and get_dev().

* src/df.c (devlist_table): Define hash table.
(devlist_hash): Add hash function.
(devlist_compare): Add hash comparison function.
(devlist_for_dev): Add lookup function.
(devlist_free): Add cleanup function.
(filter_mount_list): Use the above hash table.
While at it, rename the variable 'devlist' to 'seen_dev' for
better readability.
(me_for_dev): Use the above lookup function.

Original version of the patch by Josef Cejka <[email protected]>.
---
 NEWS     |   3 ++
 src/df.c | 109 ++++++++++++++++++++++++++++++++++++++++++++-------------------
 2 files changed, 79 insertions(+), 33 deletions(-)

diff --git a/NEWS b/NEWS
index 736b95e..4c28fc1 100644
--- a/NEWS
+++ b/NEWS
@@ -50,6 +50,9 @@ GNU coreutils NEWS                                    -*- outline -*-
 
 ** Improvements
 
+  df now uses a hash table to filter the mount list.  This speeds up
+  processing of a 22000-line mount list by nearly 60%.
+
   install -Z now also sets the default SELinux context for created directories.
 
   stat and tail now know about "prl_fs" (a parallels file system),
diff --git a/src/df.c b/src/df.c
index cbd8ef5..3813346 100644
--- a/src/df.c
+++ b/src/df.c
@@ -34,6 +34,7 @@
 #include "mountlist.h"
 #include "quote.h"
 #include "find-mount-point.h"
+#include "hash.h"
 
 /* The official name of this program (e.g., no 'g' prefix).  */
 #define PROGRAM_NAME "df"
@@ -43,14 +44,16 @@
   proper_name ("David MacKenzie"), \
   proper_name ("Paul Eggert")
 
-/* Filled with device numbers of examined file systems to avoid
-   duplicates in output.  */
-static struct devlist
+struct devlist
 {
   dev_t dev_num;
   struct mount_entry *me;
   struct devlist *next;
-} *device_list;
+};
+
+/* Filled with device numbers of examined file systems to avoid
+   duplicates in output.  */
+static Hash_table *devlist_table;
 
 /* If true, show even file systems with zero size or
    uninteresting types.  */
@@ -603,23 +606,67 @@ excluded_fstype (const char *fstype)
   return false;
 }
 
+static size_t
+devlist_hash (void const *x, size_t table_size)
+{
+  struct devlist const *p = x;
+  return (uintmax_t) p->dev_num % table_size;
+}
+
+static bool
+devlist_compare (void const *x, void const *y)
+{
+  struct devlist const *a = x;
+  struct devlist const *b = y;
+  return a->dev_num == b->dev_num;
+}
+
+static struct devlist *
+devlist_for_dev (dev_t dev)
+{
+  if (devlist_table == NULL)
+    return NULL;
+  struct devlist dev_entry;
+  dev_entry.dev_num = dev;
+  return hash_lookup(devlist_table, &dev_entry);
+}
+
+static void
+devlist_free(void *p)
+{
+  free(p);
+}
+
 /* Filter mount list by skipping duplicate entries.
    In the case of duplicates - based on the device number - the mount entry
    with a '/' in its me_devname (i.e., not pseudo name like tmpfs) wins.
    If both have a real devname (e.g. bind mounts), then that with the shorter
    me_mountdir wins.  With DEVICES_ONLY == true (set with df -a), only update
-   the global device_list, rather than filtering the global mount_list.  */
+   the global devlist_table, rather than filtering the global mount_list.  */
 
 static void
 filter_mount_list (bool devices_only)
 {
   struct mount_entry *me;
 
+  /* Temporary list to keep entries ordered.  */
+  struct devlist *device_list = NULL;
+  int mount_list_size = 0;
+
+  for (me = mount_list; me; me = me->me_next)
+    mount_list_size++;
+
+  devlist_table = hash_initialize (mount_list_size, NULL,
+                                 devlist_hash,
+                                 devlist_compare,
+                                 devlist_free);
+  if (devlist_table == NULL)
+    xalloc_die ();
+
   /* Sort all 'wanted' entries into the list device_list.  */
   for (me = mount_list; me;)
     {
       struct stat buf;
-      struct devlist *devlist;
       struct mount_entry *discard_me = NULL;
 
       /* Avoid stating remote file systems as that may hang.
@@ -635,21 +682,19 @@ filter_mount_list (bool devices_only)
       else
         {
           /* If we've already seen this device...  */
-          for (devlist = device_list; devlist; devlist = devlist->next)
-            if (devlist->dev_num == buf.st_dev)
-              break;
+          struct devlist *seen_dev = devlist_for_dev(buf.st_dev);
 
-          if (devlist)
+          if (seen_dev)
             {
-              bool target_nearer_root = strlen (devlist->me->me_mountdir)
+              bool target_nearer_root = strlen (seen_dev->me->me_mountdir)
                                         > strlen (me->me_mountdir);
               /* With bind mounts, prefer items nearer the root of the source */
-              bool source_below_root = devlist->me->me_mntroot != NULL
+              bool source_below_root = seen_dev->me->me_mntroot != NULL
                                        && me->me_mntroot != NULL
-                                       && (strlen (devlist->me->me_mntroot)
+                                       && (strlen (seen_dev->me->me_mntroot)
                                            < strlen (me->me_mntroot));
-              if (! print_grand_total && me->me_remote && devlist->me->me_remote
-                  && ! STREQ (devlist->me->me_devname, me->me_devname))
+              if (! print_grand_total && me->me_remote && seen_dev->me->me_remote
+                  && ! STREQ (seen_dev->me->me_devname, me->me_devname))
                 {
                   /* Don't discard remote entries with different locations,
                      as these are more likely to be explicitly mounted.
@@ -658,21 +703,21 @@ filter_mount_list (bool devices_only)
                 }
               else if ((strchr (me->me_devname, '/')
                        /* let "real" devices with '/' in the name win.  */
-                        && ! strchr (devlist->me->me_devname, '/'))
+                        && ! strchr (seen_dev->me->me_devname, '/'))
                        /* let points towards the root of the device win.  */
                        || (target_nearer_root && ! source_below_root)
                        /* let an entry overmounted on a new device win...  */
-                       || (! STREQ (devlist->me->me_devname, me->me_devname)
+                       || (! STREQ (seen_dev->me->me_devname, me->me_devname)
                            /* ... but only when matching an existing mnt point,
                               to avoid problematic replacement when given
                               inaccurate mount lists, seen with some chroot
                               environments for example.  */
                            && STREQ (me->me_mountdir,
-                                     devlist->me->me_mountdir)))
+                                     seen_dev->me->me_mountdir)))
                 {
                   /* Discard mount entry for existing device.  */
-                  discard_me = devlist->me;
-                  devlist->me = me;
+                  discard_me = seen_dev->me;
+                  seen_dev->me = me;
                 }
               else
                 {
@@ -691,12 +736,14 @@ filter_mount_list (bool devices_only)
         }
       else
         {
-          /* Add the device number to the global list devlist.  */
-          devlist = xmalloc (sizeof *devlist);
+          /* Add the device number to the devlist_table.  */
+          struct devlist *devlist = xmalloc (sizeof *devlist);
           devlist->me = me;
           devlist->dev_num = buf.st_dev;
           devlist->next = device_list;
           device_list = devlist;
+          if (hash_insert (devlist_table, devlist) == NULL)
+            xalloc_die ();
 
           me = me->me_next;
         }
@@ -711,28 +758,24 @@ filter_mount_list (bool devices_only)
         me = device_list->me;
         me->me_next = mount_list;
         mount_list = me;
-        /* Free devlist entry and advance.  */
-        struct devlist *devlist = device_list->next;
-        free (device_list);
-        device_list = devlist;
+        device_list = device_list->next;
       }
+
+      hash_free(devlist_table);
+      devlist_table = NULL;
   }
 }
 
+
 /* Search a mount entry list for device id DEV.
    Return the corresponding mount entry if found or NULL if not.  */
 
 static struct mount_entry const * _GL_ATTRIBUTE_PURE
 me_for_dev (dev_t dev)
 {
-  struct devlist *dl = device_list;
-
-  while (dl)
-    {
-      if (dl->dev_num == dev)
+  struct devlist *dl = devlist_for_dev(dev);
+  if (dl)
         return dl->me;
-      dl = dl->next;
-    }
 
   return NULL;
 }
-- 
2.1.4

Reply via email to