https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88793

--- Comment #2 from Florian Weimer <fw at gcc dot gnu.org> ---
(In reply to Alexander Monakov from comment #1)
> (In reply to Florian Weimer from comment #0)
> > However, optimizing for size is a very big hammer and causes substantial
> > performance issues on i386 and x86-64 due to string function inlining.  As a
> > result, the cold attribute is only suitable for code that is basically never
> > executed.  For other cases, like repeated execution which only happens in an
> > unlikely configuration, it is inappropriate.
> 
> Can you please clarify exactly what issues? If you mean inline 'rep
> stosb/movsb', their overhead is on the order of 30 cycles, and I don't see
> what is inappropriate about it.

The startup overhead isn't the problem.  The asymptotic performance is really
bad, too.  (I hope I didn't botch my test, though.  It's vaguely based on
what's attached to the downstream bug.)

For len == 5000, I get a factor of 60 difference in favor of glibc 2.28's
strlen.  For len == 30, it's still a factor of 11 in favor of strlen.  This is
on a machine with an i7-8650U, so a fairly recent CPU with erms.

#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/* Number of 'a' characters main stores in BUFFER before the terminator.  */
static size_t len = 30;
/* Heap-allocated test string; allocated and filled in main, passed to the
   measured function by bench.  */
static char *buffer;
/* Number of calls made per bench invocation; also the divisor used for
   the reported per-call time.  */
static int count = 10000;

/* Baseline wrapper: return the length of the string S as computed by
   strlen.  Defined weak; presumably so the compiler treats it as
   replaceable and does not inline it into the caller -- confirm.  */
__attribute__ ((weak))
size_t
call_strlen (const char *s)
{
  size_t result = strlen (s);
  return result;
}

/* Same computation as a plain strlen call on S, but the function is
   marked cold (and weak), so the strlen call here is compiled under the
   compiler's cold-function settings.  */
__attribute__ ((weak, cold))
size_t
slow_strlen (const char *s)
{
  size_t result = strlen (s);
  return result;
}

/* Non-cold, weak wrapper that forwards S to slow_strlen and returns its
   result unchanged.  */
__attribute__ ((weak))
size_t
call_slow_strlen (const char *s)
{
  size_t result = slow_strlen (s);
  return result;
}

/* Call FPTR on the file-scope BUFFER COUNT times, bracketed by two
   CLOCK_MONOTONIC readings, and print the elapsed time divided by COUNT
   as "WHAT: <value> ns/call".  If either clock reading fails, exit
   through err with status 1.  */
static void
bench (const char *what, size_t (*fptr) (const char *))
{
  struct timespec t0;
  struct timespec t1;

  if (clock_gettime (CLOCK_MONOTONIC, &t0) != 0)
    err (1, "clock_gettime");

  /* The value returned by FPTR is discarded; only the calls are timed.  */
  for (int left = count; left > 0; --left)
    fptr (buffer);

  if (clock_gettime (CLOCK_MONOTONIC, &t1) != 0)
    err (1, "clock_gettime");

  /* Whole seconds scaled to nanoseconds first, then the (possibly
     negative) nanosecond remainder is added.  */
  double elapsed_ns = (t1.tv_sec - t0.tv_sec) * 1e9;
  elapsed_ns += t1.tv_nsec - t0.tv_nsec;
  printf ("%s: %f ns/call\n", what, elapsed_ns / count);
}

/* Build a NUL-terminated string of LEN 'a' characters in BUFFER, then
   run ten rounds, each timing call_strlen followed by call_slow_strlen.
   Exits through err with status 1 if the allocation fails; otherwise
   frees the buffer and returns 0.  */
int
main (void)
{
  buffer = malloc (len + 1);
  if (!buffer)
    err (1, "malloc");
  memset (buffer, 'a', len);
  buffer[len] = '\0';

  int round = 0;
  while (round < 10)
    {
      bench ("strlen", call_strlen);
      bench ("strlen (cold)", call_slow_strlen);
      ++round;
    }

  free (buffer);
  return 0;
}

Reply via email to