On Tue, Apr 19, 2016 at 2:15 PM, Ron Yorston <[email protected]> wrote:
> The string of applet names seems ripe for compression:  it has a limited
> range of characters and many common substrings.  If the compression is
> too aggressive, though, the code required to handle it causes bloat.
>
> Recursively replace common pairs of characters with a character having
> its top bit set; the lower 7 bits being an index into an array of
> character pairs.  This is only done if it results in a net saving,
> otherwise the current code is used.
>
> Provide an implementation of find_applet_by_name() which uses binary
> search:  this reduces the number of string comparisons from an average
> of 27.9 per lookup to 7.7.  This is important because of the added cost
> of expanding strings before comparison.  (Compressing the input string
> is uncompetitive.)
>
> The new version of find_applet_by_name() is about 8% faster than the
> old one when strings aren't abbreviated.  It isn't used in that case,
> though, as it's larger than the current implementation.
>
> When strings are abbreviated the new find_applet_by_name() is about
> 34% slower than currently.  (All comparisons are for the default
> configuration.)

Crazy :D


> function                                             old     new   delta
> applet_common                                          -     256    +256
> find_applet_by_name                                  112     175     +63
> expand_char                                            -      40     +40
> expand_name                                            -      31     +31
> run_applet_and_exit                                  675     697     +22
> applet_nameofs                                        14      30     +16
> applet_names                                        2536    1630    -906
> ------------------------------------------------------------------------------
> (add/remove: 3/0 grow/shrink: 3/1 up/down: 428/-906)         Total: -478 bytes
>    text    data     bss     dec     hex filename
>  764477    2059    8808  775344   bd4b0 busybox_old
>  763999    2059    8832  774890   bd2ea busybox_unstripped


How about this version?

function                                             old     new   delta
applet_common                                          -     256    +256
find_applet_by_name                                  128     200     +72
expand_char                                            -      48     +48
expand_name                                            -      35     +35
run_applet_and_exit                                  688     717     +29
applet_names                                        2540    1592    -948
------------------------------------------------------------------------------
(add/remove: 3/0 grow/shrink: 2/1 up/down: 440/-948)         Total: -508 bytes
   text       data        bss        dec        hex    filename
 923812        906      17160     941878      e5f36    busybox_old
 923304        906      17176     941386      e5d4a    busybox_unstripped
diff --git a/applets/applet_tables.c b/applets/applet_tables.c
index 843f2ec..69d451a 100644
--- a/applets/applet_tables.c
+++ b/applets/applet_tables.c
@@ -8,12 +8,13 @@
  * Licensed under GPLv2, see file LICENSE in this source tree.
  */
 #include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
+#include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
-#include <stdio.h>
 #include <unistd.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdio.h>
 #include <ctype.h>
 
 #undef ARRAY_SIZE
@@ -22,6 +23,32 @@
 #include "../include/autoconf.h"
 #include "../include/applet_metadata.h"
 
+static int verbose = 0;
+
+static void overlapping_strcpy(char *dst, const char *src)
+{
+       /* Cheap optimization for dst == src case -
+        * better to have it here than in many callers.
+        */
+       if (dst != src) {
+               while ((*dst = *src) != '\0') {
+                       dst++;
+                       src++;
+               }
+       }
+}
+
+static int str_isalnum_(const char *s)
+{
+       while (*s) {
+               if (!isalnum(*s) && *s != '_')
+                       return 0;
+               s++;
+       }
+       return 1;
+}
+
+
 struct bb_applet {
        const char *name;
        const char *main;
@@ -34,6 +61,7 @@ struct bb_applet {
        /* true if instead of fork(); exec("applet"); waitpid();
         * one can simply call applet_main(argc,argv); */
        unsigned char nofork;
+       char *abbrev;
 };
 
 /* Define struct bb_applet applets[] */
@@ -48,35 +76,137 @@ static int cmp_name(const void *a, const void *b)
        return strcmp(aa->name, bb->name);
 }
 
-static int str_isalnum_(const char *s)
+#define SUBLEN 2
+
+struct bb_substr {
+       char str[SUBLEN+1];
+       unsigned count;
+};
+
+static int numsub = 0;
+static struct bb_substr substr[128];
+
+static char *hex_char(unsigned char c)
 {
-       while (*s) {
-               if (!isalnum(*s) && *s != '_')
-                       return 0;
-               s++;
+       static char expand[8][8];
+       static unsigned index = 0;
+       char *t;
+
+       t = expand[index];
+        index = (index + 1) % 8;
+
+       t[0] = c;
+       t[1] = '\0';
+       if (c & 0x80) {
+               sprintf(t, "\\%03o", c);
        }
-       return 1;
+       return t;
+}
+
+static char *expand_char(char *t, unsigned char c)
+{
+       if (c & 0x80) {
+               /* substitute common substring for char with top bit set */
+               const char *u = substr[c & 0x7f].str;
+               if (c == (unsigned char)u[0]) {
+                       fprintf(stderr, "u0:%02x!!!\n", c);
+                       abort();
+               }
+               if (c == (unsigned char)u[1]) {
+                       fprintf(stderr, "u1:%02x!!!\n", c);
+                       abort();
+               }
+               t = expand_char(t, u[0]);
+               t = expand_char(t, u[1]);
+               return t;
+       }
+       *t++ = c;
+       return t;
+}
+static const char *expand_name(const char *name)
+{
+       static char expand[8][64];
+       static unsigned index = 0;
+       char *tt, *t;
+
+       t = tt = expand[index];
+       index = (index + 1) % 8;
+
+       while (*name)
+               t = expand_char(t, *name++);
+       *t = '\0';
+
+       return tt;
+}
+
+static char *find_most_frequent_substring(void)
+{
+       int best_count;
+       char best_sub[SUBLEN+1];
+       char sub[SUBLEN+1];
+       int i, j, k;
+
+       best_count = 0;
+       for (i = 0; i < NUM_APPLETS; i++) {
+               int len;
+
+               len = strlen(applets[i].abbrev);
+               if (len < SUBLEN)
+                       continue;
+
+               for (j = 0; j < len-SUBLEN+1; j++) {
+                       int count;
+
+                       strncpy(sub, applets[i].abbrev+j, SUBLEN);
+                       sub[SUBLEN] = '\0';
+
+                       count = 0;
+                       for (k = i; k < NUM_APPLETS; k++) {
+                               char *p = applets[k].abbrev;
+                               for (;;) {
+                                       p = strstr(p, sub);
+                                       if (!p)
+                                               break;
+                                       count++;
+                                       p += SUBLEN;
+                               }
+                       }
+
+                       if (best_count < count) {
+                               best_count = count;
+                               strcpy(best_sub, sub);
+                       }
+               }
+       }
+       if (best_count < 3)
+               return NULL;
+
+       if (verbose) {
+               fprintf(stderr, "best count:%d %02x %02x '%s'\n",
+                       best_count,
+                       (uint8_t)best_sub[0],
+                       (uint8_t)best_sub[1],
+                       expand_name(best_sub)
+               );
+       }
+       substr[numsub].count = best_count;
+       return strcpy(substr[numsub].str, best_sub);
 }
 
 int main(int argc, char **argv)
 {
-       int i, j;
-
-       // In find_applet_by_name(), before linear search, narrow it down
-       // by looking at N "equidistant" names. With ~350 applets:
-       // KNOWN_APPNAME_OFFSETS  cycles
-       //                     0    9057
-       //                     2    4604 + ~100 bytes of code
-       //                     4    2407 + 4 bytes
-       //                     8    1342 + 8 bytes
-       //                    16     908 + 16 bytes
-       //                    32     884 + 32 bytes
-       // With 8, int16_t applet_nameofs[] table has 7 elements.
+       int i, j, n;
+       int MAX_APPLET_NAME_LEN = 0;
        int KNOWN_APPNAME_OFFSETS = 8;
-       // With 128 applets we do two linear searches, with 1..7 strcmp's in 
the first one
-       // and 1..16 strcmp's in the second. With 256 apps, second search does 
1..32 strcmp's.
-       if (NUM_APPLETS < 128)
+       int ABBREV_APPLET_NAMES = 1;
+
+       if (getenv("GENVERBOSE"))
+               verbose = atoi(getenv("GENVERBOSE"));
+
+       if (NUM_APPLETS < 128) {
                KNOWN_APPNAME_OFFSETS = 4;
+               ABBREV_APPLET_NAMES = 0;
+       }
        if (NUM_APPLETS < 32)
                KNOWN_APPNAME_OFFSETS = 0;
 
@@ -99,6 +229,105 @@ int main(int argc, char **argv)
                printf("#define SINGLE_APPLET_MAIN %s_main\n", applets[0].main);
        }
 
+       for (i = 0; i < NUM_APPLETS; i++) {
+               const char *t;
+
+               applets[i].abbrev = strdup(applets[i].name);
+               for (t = applets[i].name; ABBREV_APPLET_NAMES && *t; t++) {
+                       /* if any char has its top bit set we can't use 
substrings */
+                       if (*t & 0x80)
+                               ABBREV_APPLET_NAMES = 0;
+               }
+               if (MAX_APPLET_NAME_LEN < strlen(applets[i].name))
+                       MAX_APPLET_NAME_LEN = strlen(applets[i].name);
+       }
+
+       /* determine if applet names can be compressed */
+       numsub = 0;
+       if (ABBREV_APPLET_NAMES) {
+               int oldlen, newlen, newtot;
+
+               while (numsub < 128) {
+                       char *sub;
+
+                       sub = find_most_frequent_substring();
+                       if (!sub)
+                               break;
+///fprintf(stderr, "%02x: %02x%02x\n", numsub+0x80, (uint8_t)sub[0], 
(uint8_t)sub[1]);
+                       /* replace each instance of substring with a character 
with */
+                       /* its top bit set, the remaining 7 bits are its index 
*/
+                       for (i = 0; i < NUM_APPLETS; i++) {
+                               for (;;) {
+                                       char *s = strstr(applets[i].abbrev, 
sub);
+                                       if (!s)
+                                               break;
+                                       *s = 0x80 + numsub;
+                                       overlapping_strcpy(s + 1, s + SUBLEN);
+                               }
+///fprintf(stderr, "'%s' -> '%s'\n", applets[i].main, applets[i].abbrev);
+                       }
+                       numsub++;
+               }
+
+               /* calculate lengths of new and old strings */
+               oldlen = newlen = 1;
+               for (i = 0; i < NUM_APPLETS; i++) {
+                       oldlen += strlen(applets[i].name) + 1;
+                       newlen += strlen(applets[i].abbrev) + 1;
+               }
+
+               /* only use substrings if the saving outweighs the larger code 
*/
+               newtot = newlen + (numsub*SUBLEN) + 160 + 
2*KNOWN_APPNAME_OFFSETS;
+               ABBREV_APPLET_NAMES = (newtot < oldlen);
+
+               if (ABBREV_APPLET_NAMES) {
+                       printf("/* strings reduced from %d to %d (%d+%d) bytes 
*/\n",
+                               oldlen, newlen + (numsub * SUBLEN),
+                               newlen, numsub * SUBLEN
+                       );
+               }
+       }
+
+       printf("#define ABBREV_APPLET_NAMES %d\n\n", ABBREV_APPLET_NAMES);
+
+       //printf("#ifndef SKIP_definitions\n");
+       if (ABBREV_APPLET_NAMES) {
+               printf("static const char applet_common[][2] ALIGN1 = {\n");
+               for (n = 0; n < numsub; n++) {
+                       printf("/* %s %2u %-8s */",
+                               hex_char(n + 0x80),
+                               substr[n].count,
+                               expand_name(substr[n].str)
+                       );
+                       printf(" {'%s','%s'},\n",
+                               hex_char(substr[n].str[0]),
+                               hex_char(substr[n].str[1])
+                       );
+               }
+               printf("};\n\n");
+
+               printf("const char applet_names[] ALIGN1 = \"\"\n");
+               for (i = 0; i < NUM_APPLETS; i++) {
+                       char *s;
+
+                       printf("/* %-*s */ \"", MAX_APPLET_NAME_LEN,
+                               expand_name(applets[i].abbrev)
+                       );
+                       for (s = applets[i].abbrev; *s; s++) {
+                               printf("%s", hex_char(*s));
+                       }
+                       printf("\" \"\\0\"\n");
+               }
+               printf(";\n\n");
+       }
+       else {
+               printf("const char applet_names[] ALIGN1 = \"\"\n");
+               for (i = 0; i < NUM_APPLETS; i++) {
+                       printf("\"%s\" \"\\0\"\n", applets[i].name);
+               }
+               printf(";\n\n");
+       }
+
        printf("#define KNOWN_APPNAME_OFFSETS %u\n\n", KNOWN_APPNAME_OFFSETS);
        if (KNOWN_APPNAME_OFFSETS > 0) {
                int ofs, offset[KNOWN_APPNAME_OFFSETS], 
index[KNOWN_APPNAME_OFFSETS];
@@ -109,26 +338,20 @@ int main(int argc, char **argv)
                        for (j = 0; j < KNOWN_APPNAME_OFFSETS; j++)
                                if (i == index[j])
                                        offset[j] = ofs;
-                       ofs += strlen(applets[i].name) + 1;
+                       if (i == NUM_APPLETS-1)
+                               printf("#define LAST_APPLET_NAME_OFFSET 
%d\n\n", ofs);
+                       ofs += strlen(ABBREV_APPLET_NAMES ?
+                                       applets[i].abbrev : applets[i].name) + 
1;
                }
                /* If the list of names is too long refuse to proceed */
                if (ofs > 0xffff)
                        return 1;
-               printf("const uint16_t applet_nameofs[] ALIGN2 = {\n");
+               printf("static const uint16_t applet_nameofs[] ALIGN2 = {\n");
                for (i = 1; i < KNOWN_APPNAME_OFFSETS; i++)
                        printf("%d,\n", offset[i]);
                printf("};\n\n");
        }
 
-       //printf("#ifndef SKIP_definitions\n");
-       printf("const char applet_names[] ALIGN1 = \"\"\n");
-       for (i = 0; i < NUM_APPLETS; i++) {
-               printf("\"%s\" \"\\0\"\n", applets[i].name);
-//             if (MAX_APPLET_NAME_LEN < strlen(applets[i].name))
-//                     MAX_APPLET_NAME_LEN = strlen(applets[i].name);
-       }
-       printf(";\n\n");
-
        for (i = 0; i < NUM_APPLETS; i++) {
                if (str_isalnum_(applets[i].name))
                        printf("#define APPLET_NO_%s %d\n", applets[i].name, i);
@@ -190,26 +413,32 @@ int main(int argc, char **argv)
        printf("};\n");
 #endif
        //printf("#endif /* SKIP_definitions */\n");
-//     printf("\n");
-//     printf("#define MAX_APPLET_NAME_LEN %u\n", MAX_APPLET_NAME_LEN);
+       printf("\n");
+       printf("#define MAX_APPLET_NAME_LEN %u\n", MAX_APPLET_NAME_LEN);
 
        if (argv[2]) {
-               char line_old[80];
-               char line_new[80];
+               char line_old[2][80];
+               char line_new[2][80];
                FILE *fp;
 
-               line_old[0] = 0;
+               line_old[0][0] = '\0';
+               line_old[1][0] = '\0';
                fp = fopen(argv[2], "r");
                if (fp) {
-                       fgets(line_old, sizeof(line_old), fp);
+                       fgets(line_old[0], sizeof(line_old[0]), fp);
+                       fgets(line_old[1], sizeof(line_old[1]), fp);
                        fclose(fp);
                }
-               sprintf(line_new, "#define NUM_APPLETS %u\n", NUM_APPLETS);
-               if (strcmp(line_old, line_new) != 0) {
+               sprintf(line_new[0], "#define NUM_APPLETS %u\n", NUM_APPLETS);
+               sprintf(line_new[1], ABBREV_APPLET_NAMES ?
+                               "\n" : "#define expand_name(n) (n)\n");
+               if (strcmp(line_old[0], line_new[0]) != 0 ||
+                               strcmp(line_old[1], line_new[1]) != 0) {
                        fp = fopen(argv[2], "w");
                        if (!fp)
                                return 1;
-                       fputs(line_new, fp);
+                       fputs(line_new[0], fp);
+                       fputs(line_new[1], fp);
                }
        }
 
diff --git a/applets/usage_pod.c b/applets/usage_pod.c
index 0b1c4aa..c73a2c0 100644
--- a/applets/usage_pod.c
+++ b/applets/usage_pod.c
@@ -15,6 +15,7 @@
 #define SKIP_applet_main
 #define ALIGN1 /* nothing, just to placate applet_tables.h */
 #define ALIGN2 /* nothing, just to placate applet_tables.h */
+#define ALIGN4 /* nothing, just to placate applet_tables.h */
 #include "applet_tables.h"
 
 /* Since we can't use platform.h, have to do this again by hand: */
diff --git a/include/libbb.h b/include/libbb.h
index 111dd66..714283b 100644
--- a/include/libbb.h
+++ b/include/libbb.h
@@ -1229,6 +1229,7 @@ const struct hwtype *get_hwntype(int type) FAST_FUNC;
 
 #ifndef BUILD_INDIVIDUAL
 extern int find_applet_by_name(const char *name) FAST_FUNC;
+extern const char *expand_name(const char *name) FAST_FUNC;
 /* Returns only if applet is not found. */
 extern void run_applet_and_exit(const char *name, char **argv) FAST_FUNC;
 extern void run_applet_no_and_exit(int a, char **argv) NORETURN FAST_FUNC;
diff --git a/libbb/appletlib.c b/libbb/appletlib.c
index b682e6b..fc4e171 100644
--- a/libbb/appletlib.c
+++ b/libbb/appletlib.c
@@ -139,6 +139,7 @@ void FAST_FUNC bb_show_usage(void)
        xfunc_die();
 }
 
+#if !ABBREV_APPLET_NAMES
 int FAST_FUNC find_applet_by_name(const char *name)
 {
        unsigned i, max;
@@ -265,6 +266,102 @@ int FAST_FUNC find_applet_by_name(const char *name)
        return -1;
 #endif
 }
+#define expand_name(n) (n)
+#else /* ABBREV_APPLET_NAMES */
+static char *expand_char(char *t, int c)
+{
+       if (c & 0x80) {
+               /* substitute common substring for char with top bit set */
+               const char *u = applet_common[c & 0x7f];
+               t = expand_char(t, u[0]);
+               t = expand_char(t, u[1]);
+               return t;
+       }
+       *t++ = c;
+       return t;
+}
+
+const char * FAST_FUNC expand_name(const char *name)
+{
+       static char expand[MAX_APPLET_NAME_LEN+1];
+       char *t = expand;
+
+       while (*name)
+               t = expand_char(t, *name++);
+       *t = '\0';
+
+       return expand;
+}
+
+int FAST_FUNC find_applet_by_name(const char *name)
+{
+       int i, ret, lower, upper, middle;
+
+       /* Do a binary search to find the applet entry given the name. */
+
+       lower = 0;
+       upper = LAST_APPLET_NAME_OFFSET;
+       do {
+               middle = (upper + lower) >> 1;
+               /* the new index is probably not at the start of a name */
+               /* first move forward to the start of the next name... */
+               while (applet_names[middle++])
+                       continue;
+               if (middle == upper) {
+                       /* ... if that didn't work go back to start of previous 
name, */
+                       /* taking care not to fall off the start of the string 
*/
+                       do {
+                               middle--;
+                       } while (middle && applet_names[middle-1]);
+                       if (middle == lower) {
+                               /*
+                                * End of binary search.  The bounds are either 
values
+                                * that were previously 'middle' and have been 
checked
+                                * already or are 0 or LAST_APPLET_NAME_OFFSET, 
which
+                                * still need to be checked.
+                                */
+                               if (lower == 0)
+                                       /* middle = lower */;
+                               else if (upper == LAST_APPLET_NAME_OFFSET)
+                                       middle = upper;
+                               else
+                                       break;
+                       }
+               }
+               ret = strcmp(name, expand_name(applet_names+middle));
+               if (ret < 0)
+                       upper = middle;
+               else if (ret > 0)
+                       lower = middle;
+               else
+                       goto found;
+       } while (lower != upper);
+
+       return -1;
+
+ found:
+       /*
+        * Find index of name by linear search, using the known offsets to
+        * limit the range we need to search.
+        */
+       lower = 0;
+       ret = NUM_APPLETS * (KNOWN_APPNAME_OFFSETS-1);
+       for (i = KNOWN_APPNAME_OFFSETS-2; i >= 0; i--) {
+               if (middle >= applet_nameofs[i]) {
+                       lower = applet_nameofs[i];
+                       ret /= KNOWN_APPNAME_OFFSETS;
+                       break;
+               }
+               ret -= NUM_APPLETS;
+       }
+       while (lower < middle) {
+               lower += strlen(applet_names+lower) + 1;
+               ++ret;
+       }
+
+       return ret;
+}
+#endif
 
 
 void lbb_prepare(const char *applet
@@ -678,7 +775,7 @@ static void install_links(const char *busybox, int 
use_symbolic_links,
         * busybox.h::bb_install_loc_t, or else... */
        int (*lf)(const char *, const char *);
        char *fpc;
-        const char *appname = applet_names;
+       const char *appname = applet_names;
        unsigned i;
        int rc;
 
@@ -689,7 +786,7 @@ static void install_links(const char *busybox, int 
use_symbolic_links,
        for (i = 0; i < ARRAY_SIZE(applet_main); i++) {
                fpc = concat_path_file(
                                custom_install_dir ? custom_install_dir : 
install_dir[APPLET_INSTALL_LOC(i)],
-                               appname);
+                               expand_name(appname));
                // debug: bb_error_msg("%slinking %s to busybox",
                //              use_symbolic_links ? "sym" : "", fpc);
                rc = lf(busybox, fpc);
@@ -760,7 +857,8 @@ static int busybox_main(char **argv)
                /* prevent last comma to be in the very last pos */
                output_width--;
                while (*a) {
-                       int len2 = strlen(a) + 2;
+                       const char *name = expand_name(a);
+                       int len2 = strlen(name) + 2;
                        if (col >= (int)output_width - len2) {
                                full_write2_str(",\n");
                                col = 0;
@@ -771,9 +869,10 @@ static int busybox_main(char **argv)
                        } else {
                                full_write2_str(", ");
                        }
-                       full_write2_str(a);
+                       full_write2_str(name);
                        col += len2;
-                       a += len2 - 1;
+                       while (*a++ != '\0')
+                               continue;
                }
                full_write2_str("\n\n");
                return 0;
@@ -788,7 +887,7 @@ static int busybox_main(char **argv)
                        if (argv[1][6]) /* --list-full? */
                                
full_write2_str(install_dir[APPLET_INSTALL_LOC(i)] + 1);
 # endif
-                       full_write2_str(a);
+                       full_write2_str(expand_name(a));
                        full_write2_str("\n");
                        i++;
                        while (*a++ != '\0')
diff --git a/libbb/lineedit.c b/libbb/lineedit.c
index 3e62f46..83c13ae 100644
--- a/libbb/lineedit.c
+++ b/libbb/lineedit.c
@@ -780,8 +780,12 @@ static NOINLINE unsigned complete_cmd_dir_file(const char 
*command, int type)
                const char *p = applet_names;
 
                while (*p) {
-                       if (strncmp(pfind, p, pf_len) == 0)
-                               add_match(xstrdup(p));
+                       const char *name = expand_name(p);
+                       int ret = strncmp(pfind, name, pf_len);
+                       if (ret < 0)
+                               break;
+                       else if (ret == 0)
+                               add_match(xstrdup(name));
                        while (*p++ != '\0')
                                continue;
                }
diff --git a/shell/ash.c b/shell/ash.c
index da9c950..7800f32 100644
--- a/shell/ash.c
+++ b/shell/ash.c
@@ -12592,7 +12592,7 @@ helpcmd(int argc UNUSED_PARAM, char **argv UNUSED_PARAM)
        {
                const char *a = applet_names;
                while (*a) {
-                       col += out1fmt("%c%s", ((col == 0) ? '\t' : ' '), a);
+                       col += out1fmt("%c%s", ((col == 0) ? '\t' : ' '), 
expand_name(a));
                        if (col > 60) {
                                out1fmt("\n");
                                col = 0;
_______________________________________________
busybox mailing list
[email protected]
http://lists.busybox.net/mailman/listinfo/busybox

Reply via email to