Hi Chet,

The rev utility is often used in conjunction with the cut utility which is a
loadable builtin, so here is a loadable builtin rev.

This rev is re-engineered to avoid wide-character functions. Instead, it
processes UTF-8 multibyte characters at the byte level, so that each character
is preserved intact in the reversed line.

Array handling is modelled on the cut builtin.

This patch is against devel HEAD.

Performance:

Tests were done with four 2GiB files: two were 100% single-byte characters and
two were 100% multibyte (except newlines); within each pair, one file had 1024B
lines and the other had 64B lines. The benchmark was rev from
util-linux-2.41.3.

The stand-alone prototype was faster in every case. Part of converting to
loadable was to replace read(2) calls with zgetline(bash). This degraded
performance somewhat. Here are the numbers ('x' means "times as fast"):
 64B multi: 0.4x; 1024B multi: 0.6x; 64B single: 1.1x; 1024B single: 1.6x

One would expect the builtin to always outperform the external utility on
short files.

Cheers ... Duncan.

--

diff --git a/examples/loadables/Makefile.in b/examples/loadables/Makefile.in
index 6e8b635b..6265800b 100644
--- a/examples/loadables/Makefile.in
+++ b/examples/loadables/Makefile.in
@@ -103,7 +103,7 @@ INC = -I. -I.. -I$(topdir) -I$(topdir)/lib -I$(topdir)/builtins -I${srcdir} \
 ALLPROG = print truefalse sleep finfo logname basename dirname fdflags \
          tty pathchk tee head mkdir rmdir mkfifo mktemp printenv id whoami \
          uname sync push ln unlink realpath strftime mypid setpgid seq rm \
-         accept csv dsv cut stat getconf kv strptime chmod fltexpr jobid
+         accept csv dsv cut stat getconf kv strptime chmod fltexpr jobid rev
 OTHERPROG = necho hello cat pushd asort
 
 SUBDIRS = perl
@@ -259,6 +259,9 @@ fltexpr:    fltexpr.o
 jobid: jobid.o
        $(SHOBJ_LD) $(SHOBJ_LDFLAGS) $(SHOBJ_XLDFLAGS) -o $@ jobid.o $(SHOBJ_LIBS)
 
+rev:   rev.o
+       $(SHOBJ_LD) $(SHOBJ_LDFLAGS) $(SHOBJ_XLDFLAGS) -o $@ rev.o $(SHOBJ_LIBS)
+
 
 # pushd is a special case.  We use the same source that the builtin version
 # uses, with special compilation options.
@@ -325,7 +328,7 @@ OBJS = print.o truefalse.o accept.o sleep.o finfo.o getconf.o logname.o \
        basename.o dirname.o tty.o pathchk.o tee.o head.o rmdir.o necho.o \
        hello.o cat.o csv.o dsv.o kv.o cut.o printenv.o id.o whoami.o uname.o \
        sync.o push.o mkdir.o mktemp.o realpath.o strftime.o setpgid.o stat.o \
-       fdflags.o seq.o asort.o strptime.o chmod.o fltexpr.o jobid.o
+       fdflags.o seq.o asort.o strptime.o chmod.o fltexpr.o jobid.o rev.o
 
 ${OBJS}:       ${BUILD_DIR}/config.h
 
@@ -369,3 +372,4 @@ asort.o: asort.c
 strptime.o: strptime.c
 fltexpr.o: fltexpr.c
 jobid.o: jobid.c
+rev.o: rev.c
diff --git a/examples/loadables/rev.c b/examples/loadables/rev.c
new file mode 100644
index 00000000..e71ff7c0
--- /dev/null
+++ b/examples/loadables/rev.c
@@ -0,0 +1,259 @@
+/* Headers */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <getopt.h>
+#include <setjmp.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "shmbutil.h"
+#include "loadables.h"
+#include <array.h>                 /* Has to go after stdint & loadables (!) */
+
+/* Macros */
+
+/* Store the result of call y in x, retrying while it fails with EINTR. */
+#define SYSCALL(x, y) do { (x) = (y); } while ((x) == -1 && errno == EINTR)
+
+/*
+ * Append byte x to the output: through the cursor `buf' when the array
+ * variable `v' is set, otherwise to stdout. Both names are taken from the
+ * expansion site. Wrapped in do/while so that it is a single statement.
+ */
+#define PUTC(x) \
+  do { if (v) *buf++ = (x); else fputc((x), stdout); } while (0)
+
+/* ********************************* getlen ********************************* */
+
+/*
+ * Return the length in bytes of the UTF-8 sequence that ends at
+ * last_trlg_byte, or 1 if that byte does not end a well-formed sequence.
+ *
+ * last_trlg_byte:  candidate final (trailing) byte of a multibyte sequence.
+ * num_bytes_left:  bytes from the start of the line up to and including
+ *                  last_trlg_byte, so num_bytes_left - 1 bytes precede it.
+ *
+ * A trailing byte has the form '10xxxxxx'b and a sequence has at most 3 of
+ * them. The header byte starts with as many 1 bits as the sequence has bytes,
+ * followed by a 0 bit, e.g. '11110xxx'b for a 4-byte sequence.
+ *
+ * The backward scan never reads before the start of the line: it examines at
+ * most num_bytes_left - 1 preceding bytes (and no more than 3).
+ */
+static int
+getlen(char *last_trlg_byte, int num_bytes_left)
+{
+  /* mask[k] has the top k+1 bits set; mask[k-1] tags a k-byte header */
+  static const unsigned char mask[5] = { 0200, 0300, 0340, 0360, 0370 };
+  const unsigned char *p = (const unsigned char *)last_trlg_byte;
+  int avail;                       /* Bytes that may be read before p */
+  int n;                           /* Bytes in header + trailer(s) */
+
+  if ((*p & mask[1]) != mask[0])
+    return 1;                      /* Not a trailing byte */
+
+  avail = num_bytes_left - 1;
+  if (avail > 3)
+    avail = 3;                     /* Header is at most 3 bytes back */
+
+  for (n = 2; n - 1 <= avail; n++)
+  {
+    unsigned char c = p[-(n - 1)]; /* Candidate header of an n-byte sequence */
+
+    if ((c & mask[1]) == mask[0])
+      continue;                    /* Another trailing byte; keep scanning */
+    return ((c & mask[n]) == mask[n - 1]) ? n : 1;
+  }
+  return 1;                        /* Ran out of bytes without a header */
+}                                  /* getlen() */
+
+/* ****************************** reverse_line ****************************** */
+
+/*
+ * Emit the LEN bytes of LINE in reverse order, followed by SEP when
+ * OUTPUTSEP is non-zero.
+ *
+ * If V is non-null the result is stored in element (*IND)++ of the array
+ * variable V; otherwise it is written to stdout. When locale_utf8locale is
+ * set, the bytes of each UTF-8 sequence recognised by getlen() keep their
+ * original order so that the character survives the reversal.
+ */
+static void
+reverse_line(SHELL_VAR *v, arrayind_t *ind, char *line, size_t len,
+  int outputsep, char sep)
+{
+  char *buf = NULL;                /* Write cursor; only used when v is set */
+  char *p, *q;
+  int i, j;
+
+#if defined (ARRAY_VARS)
+  if (v)
+  {
+    /*
+     * Bypass extra copies and malloc / free calls by binding the element
+     * with a NULL value and then installing our own buffer in it.
+     * The buffer holds the reversed bytes, the separator (if it is to be
+     * output) and the terminating NUL.
+     * NOTE(review): the separator is stored in the element because the
+     * original code wrote it there; confirm that this is intended.
+     */
+    bind_array_element (v, (*ind)++, (char *)NULL, 0);
+    buf = xmalloc(len + (outputsep ? 2 : 1));
+    (((ARRAY *)v->value)->lastref)->value = buf;
+  }                                /* if (v) */
+#endif
+
+  if (locale_utf8locale)
+  {
+    for (i = len, p = line + len - 1; i > 0; i--, p--)
+    {
+      if (*p & 0200)
+      {
+        j = getlen(p, i);
+        p = q = p - (j - 1);       /* p-> 1st byte of seq */
+        i -= (j - 1);              /* Reduce num left by num trlg bytes */
+        for (; j > 0; j--)
+        {
+          PUTC(*q++);
+        }
+      }                            /* if (*p & 0200) */
+      else
+      {
+        PUTC(*p);
+      }
+    }                              /* for (i = len, ...) */
+  }                                /* if (locale_utf8locale) */
+  else
+  {
+    for (i = len, p = line + len - 1; i > 0; i--)
+    {
+      PUTC(*p--);
+    }
+  }                                /* if (locale_utf8locale) else */
+
+  if (outputsep)
+  {
+    PUTC(sep);
+  }
+
+  /* Terminate only after everything, separator included, has been stored */
+  if (buf)
+    *buf = '\0';
+}                                  /* reverse_line() */
+
+/* ****************************** rev_internal ****************************** */
+
+/*
+ * Parse the options in LIST, then reverse every line of each file operand
+ * (or of standard input when there are no operands).
+ *
+ * Options:
+ *   -0        use NUL instead of newline as the line separator
+ *   -a ARRAY  store the reversed lines in indexed array ARRAY, not stdout
+ *
+ * Returns EXECUTION_SUCCESS; EXECUTION_FAILURE if ARRAY is not usable or a
+ * file cannot be opened (the remaining files are still processed); or
+ * EX_USAGE on an invalid option.
+ */
+static int
+rev_internal(WORD_LIST *list)
+{
+  int unbuffered_read;
+  char *array_name;
+  arrayind_t ind;
+  int outputsep;
+  WORD_LIST *l;
+  SHELL_VAR *v;
+  size_t llen;
+  char *line;
+  ssize_t n;                       /* zgetline result; -1 ends the input */
+  int rval;
+  char sep;
+  int opt;
+  int fd;
+
+  v = 0;
+  rval = EXECUTION_SUCCESS;
+
+  array_name = 0;
+  sep = '\n';
+  ind = 0;
+
+  /* -0 is a plain flag; only -a takes an argument */
+  reset_internal_getopt();
+  while ((opt = internal_getopt(list, "0a:h")) != -1)
+    switch (opt)
+    {
+      case '0':
+        sep = '\0';
+        break;
+      case 'a':
+#if defined (ARRAY_VARS)
+        array_name = list_optarg;
+        break;
+#else
+        builtin_error("arrays not available");
+        return (EX_USAGE);
+#endif
+        CASE_HELPOPT;
+      default:
+        builtin_usage();
+        return (EX_USAGE);
+    }
+
+  if (array_name && (valid_identifier(array_name) == 0))
+  {
+    sh_invalidid(array_name);
+    return (EXECUTION_FAILURE);
+  }
+
+#if defined (ARRAY_VARS)
+  if (array_name)
+  {
+    v = builtin_find_indexed_array(array_name, 1);
+    if (v == 0)
+      return (EXECUTION_FAILURE);
+  }
+#endif
+
+  l = loptend;
+  line = 0;
+  llen = 0;
+
+  do
+  {
+    /* For each file operand, or standard input when there are none */
+    if (l == 0)
+      fd = 0;
+    else
+      SYSCALL(fd, open(l->word->word, O_RDONLY));
+    if (fd == -1)
+    {
+      file_error(l->word->word);
+      rval = EXECUTION_FAILURE;
+      goto next_file;
+    }
+
+    /* Read unbuffered when fd is not seekable (lseek fails with ESPIPE) */
+#ifndef __CYGWIN__
+    unbuffered_read = (lseek(fd, 0L, SEEK_CUR) < 0) && (errno == ESPIPE);
+#else
+    unbuffered_read = 1;
+#endif
+
+    /* Read from input */
+    while ((n = zgetline(fd, &line, &llen, sep, unbuffered_read)) != -1)
+    {
+      QUIT;
+      if (line[n] == sep)
+        outputsep = 1;
+      else
+      {
+        outputsep = 0;
+        n++;           /* Work around zgetline behaviour on unterminated line */
+      }
+      reverse_line(v, &ind, line, (size_t)n, outputsep, sep);
+    }                              /* while ((n = zgetline(...) != -1) */
+    if (fd != 0)
+      close(fd);
+
+  next_file:
+    QUIT;
+    if (l)
+      l = l->next;
+  }                                /* do */
+  while (l);
+
+  free(line);
+  return rval;
+}                                  /* rev_internal() */
+
+/* ******************************* rev_builtin ****************************** */
+
+/* Builtin entry point: hand the argument list straight to rev_internal(). */
+int
+rev_builtin(WORD_LIST *list)
+{
+  int status = rev_internal(list);
+
+  return status;
+}                                  /* rev_builtin() */
+
+/* Long documentation strings for rev, terminated by a NULL entry. */
+char *rev_doc[] = {
+  "Reverse lines characterwise.",
+  "",
+  "Copy the lines of the specified files to standard output,",
+  "or assign them to the indexed array ARRAY starting at index 0,",
+  "reversing the order of characters in every line.",
+  "If no files are specified, standard input is read.",
+  "",
+  "When -0 is specified, use the byte '\\0' as line separator.",
+  "",
+  "When -a is specified, assign each reversed line",
+  "to successive elements of ARRAY,",
+  "beginning at 0.",
+  "The lines rev assigns to ARRAY are identical to the lines it would",
+  "write to the standard output if -a were not supplied.",
+  "",
+  "This utility processes UTF-8 without using a wide-character buffer.",
+  (char *)NULL
+};
+
+/* Descriptor tying the name "rev" to rev_builtin, its help text and synopsis. */
+struct builtin rev_struct = {
+  "rev",                           /* builtin name */
+  rev_builtin,                     /* function implementing the builtin */
+  BUILTIN_ENABLED,                 /* initial flags for builtin */
+  rev_doc,                         /* array of long documentation strings */
+  "rev [-0] [-a ARRAY] [file ...]", /* usage synopsis; becomes short_doc */
+  0                                /* reserved for internal use */
+};

Reply via email to