Hi,

I'm attaching my benchmark, which includes pvanhoof's, luis's, hpj's,
and my own implementations. To choose which implementation is
compiled in, build with -DWHICH=behdad (or another implementation's name).

I've posted my analysis here:

  http://mces.blogspot.com/2005/11/false-alarm-on-gutf8offsettopointer.html


I suggest we close this discussion, leave the current
implementation as it is, and go optimize its users instead.

Cheers,

--behdad
http://behdad.org/

"Commandment Three says Do Not Kill, Amendment Two says Blood Will Spill"
        -- Dan Bern, "New American Language"
/*
 * Copyright (C) 2005 Federico Mena-Quintero [EMAIL PROTECTED]
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/times.h>
#include <glib.h>

/* Language name that the --lang help text offers as the way to ask for every language. */
#define ALL_LANGUAGES "ALL"
/* Initial value of option_data_dir: where the .dat files are looked up by default. */
#define DEFAULT_DATA_DIR "po-data"
/* Initial value of option_name: written to the <name> element of the output. */
#define DEFAULT_BENCHMARK_NAME "glib benchmark"
/* Initial value of option_num_iterations: passes made over each string set. */
#define DEFAULT_NUM_ITERATIONS 1000

/*
 * Writes a printf-style formatted message to stderr, appends a newline,
 * and terminates the process with exit status 1.  Does not return.
 *
 * msg: printf-style format string; the remaining arguments are its values.
 */
static void
error_and_exit (const char *msg, ...)
{
        va_list ap;

        va_start (ap, msg);
        vfprintf (stderr, msg, ap);
        va_end (ap);

        fputc ('\n', stderr);
        exit (1);
}

/* Stopwatch over the process's user CPU time, as reported by times(). */
typedef struct {
        clock_t start_utime;    /* tms_utime sampled when the timer was created */
} UserTimer;

/*
 * Allocates a UserTimer and stamps it with the process's current user CPU
 * time (tms_utime from times()).
 *
 * Returns: a newly allocated timer; release it with user_timer_destroy().
 */
UserTimer *
user_timer_new (void)
{
        struct tms now;
        UserTimer *timer = g_new0 (UserTimer, 1);

        times (&now);
        timer->start_utime = now.tms_utime;

        return timer;
}

/*
 * Returns the user CPU time consumed since the timer was created, computed
 * as the tms_utime difference divided by sysconf(_SC_CLK_TCK).
 *
 * The clock-tick rate is looked up on the first call and cached in a
 * function-local static.
 */
double
user_timer_elapsed (UserTimer *utimer)
{
        static long ticks_per_second = 0;
        struct tms now;
        clock_t used;

        if (ticks_per_second == 0)
                ticks_per_second = sysconf (_SC_CLK_TCK);

        times (&now);
        used = now.tms_utime - utimer->start_utime;

        return (double) used / ticks_per_second;
}

/* Releases a timer previously obtained from user_timer_new(). */
void
user_timer_destroy (UserTimer *utimer)
{
        g_free (utimer);
}

/* One NUL-terminated string from a data file, with pre-computed facts about it. */
typedef struct {
        char *str;              /* start of the string inside the set's raw buffer */
        int num_chars;          /* length in characters, from g_utf8_strlen() */
        int num_bytes;          /* length in bytes, from strlen() */
        gboolean valid;         /* TRUE if it is valid UTF-8 and lacks "POT-Creation" */
} String;

/* All strings loaded from one data file. */
typedef struct {
        gsize num_strings;      /* number of entries in strings[] */
        char *strings_raw;      /* file contents; every String.str points into this */
        String *strings;        /* array of num_strings descriptors */
} StringSet;

/* Totals produced by one measure_strings() run. */
typedef struct {
        double elapsed;         /* value of user_timer_elapsed() for the whole run */
        long total_strings;     /* valid strings processed, summed over all iterations */
        long total_chars;       /* sum of num_chars over those strings */
} LanguageResults;

static StringSet *
string_set_read (const char *filename)
{
        GError *error;
        gsize length;
        char *end;
        char *p, *string_start;
        gsize max_strings;
        char *strings_raw;
        gsize num_strings;
        String *strings;
        StringSet *set;

        error = NULL;
        if (!g_file_get_contents (filename, &strings_raw, &length, &error))
                error_and_exit ("Could not read the strings file %s: %s", 
filename, error->message);

        max_strings = 1024;
        num_strings = 0;
        strings = g_new (String, max_strings);

        string_start = strings_raw;
        end = strings_raw + length;

        for (p = strings_raw; p < end; p++)
                if (*p == 0) {
                        if (num_strings == max_strings) {
                                max_strings = max_strings * 2;
                                strings = g_renew (String, strings, 
max_strings);
                        }

                        strings[num_strings].str = string_start;
                        strings[num_strings].num_chars = g_utf8_strlen 
(string_start, -1);
                        strings[num_strings].num_bytes = strlen (string_start);
                        strings[num_strings].valid = (strstr (string_start, 
"POT-Creation") == 0
                                                      && g_utf8_validate 
(string_start, -1, NULL));
                        string_start = p + 1;
                        num_strings++;
                }

        set = g_new (StringSet, 1);
        set->num_strings = num_strings;
        set->strings_raw = strings_raw;
        set->strings = strings;

        return set;
}

/*
 * Releases a set returned by string_set_read(): the raw file buffer, the
 * descriptor array, and the set itself.
 */
static void
string_set_free (StringSet *set)
{
        g_free (set->strings_raw);
        g_free (set->strings);
        g_free (set);
}


/* PUT IMPLEMENTATIONS HERE */

/*
 * Byte length of a UTF-8 sequence, indexed by the value of its first byte:
 * 0x00-0xBF -> 1, 0xC0-0xDF -> 2, 0xE0-0xEF -> 3, 0xF0-0xF7 -> 4,
 * 0xF8-0xFB -> 5, 0xFC-0xFD -> 6, 0xFE-0xFF -> 1.
 */
static const gchar utf8_skip_data[256] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};
/* Pointer through which the implementations below index the skip table. */
/* NOTE(review): <glib.h> presumably also declares g_utf8_skip and defines
 * g_utf8_next_char; these local copies look intended to replace them for
 * the benchmark -- confirm they do not clash with the installed headers. */
const gchar * const g_utf8_skip = utf8_skip_data;
/* Yields a char * just past the UTF-8 sequence whose first byte is at p. */
#define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
/*
 * Baseline implementation: steps over `offset` characters one at a time
 * using the skip table.
 *
 * str:    start of a UTF-8 string.
 * offset: number of characters to advance.  The loop only stops when the
 *         count reaches zero by decrementing, so negative values are not
 *         handled.
 *
 * Returns: pointer to the character `offset` characters after str.
 */
gchar *
glib_utf8_offset_to_pointer  (const gchar *str,
                           glong        offset)
{
  const gchar *cursor;

  for (cursor = str; offset != 0; offset--)
    cursor = g_utf8_next_char (cursor);

  return (gchar *) cursor;
}

/*
 * Skip-table implementation with the main loop unrolled by hand: four
 * characters are stepped over per iteration while at least four remain,
 * then the remainder is handled one character at a time.
 *
 * str:    start of a UTF-8 string.
 * offset: number of characters to advance; negative values are not handled.
 *
 * Returns: pointer to the character `offset` characters after str.
 */
gchar *
glib_unwrapped_utf8_offset_to_pointer  (const gchar *str,
                           glong        offset)
{
  const gchar *cursor = str;

  for (; offset >= 4; offset -= 4)
    {
      cursor = g_utf8_next_char (cursor);
      cursor = g_utf8_next_char (cursor);
      cursor = g_utf8_next_char (cursor);
      cursor = g_utf8_next_char (cursor);
    }

  for (; offset != 0; offset--)
    cursor = g_utf8_next_char (cursor);

  return (gchar *) cursor;
}

/*
 * Variant that avoids the skip-table lookup for bytes below 192 (0xC0):
 * such a byte is stepped over with a plain increment, and only bytes of
 * 192 and above go through the table.
 *
 * str:    start of a UTF-8 string.
 * offset: number of characters to advance; negative values are not handled.
 *
 * Returns: pointer to the character `offset` characters after str.
 */
gchar *
pvanhoof_utf8_offset_to_pointer  (const gchar *str,
                           glong        offset)
{
  const gchar *cursor = str;

  for (; offset != 0; offset--)
    {
      if (*(guchar *) cursor < 192)
        cursor++;
      else
        cursor = g_utf8_next_char (cursor);
    }

  return (gchar *) cursor;
}

/*
 * Hand-unrolled form of the "increment when the byte is below 192" variant:
 * four characters per iteration while at least four remain, then one at a
 * time.  Bytes of 192 and above advance by their skip-table entry.
 *
 * str:    start of a UTF-8 string.
 * offset: number of characters to advance; negative values are not handled.
 *
 * Returns: pointer to the character `offset` characters after str.
 */
gchar *
pvanhoof_unwrapped_utf8_offset_to_pointer  (const gchar *str,
                           glong        offset)
{
  const guchar *cursor = (const guchar *) str;

  for (; offset >= 4; offset -= 4)
    {
      if (*cursor < 192)
        cursor++;
      else
        cursor += g_utf8_skip[*cursor];

      if (*cursor < 192)
        cursor++;
      else
        cursor += g_utf8_skip[*cursor];

      if (*cursor < 192)
        cursor++;
      else
        cursor += g_utf8_skip[*cursor];

      if (*cursor < 192)
        cursor++;
      else
        cursor += g_utf8_skip[*cursor];
    }

  for (; offset != 0; offset--)
    {
      if (*cursor < 192)
        cursor++;
      else
        cursor += g_utf8_skip[*cursor];
    }

  return (gchar *) cursor;
}

/* Integer type used by the implementations below that inspect several bytes at once. */
#define WORDTYPE guint
/* Replicates the byte value x into every byte of a WORDTYPE:
 * (WORDTYPE)-1 / 0xFF is 0x0101...01, so REPEAT(0x80) is 0x8080...80. */
#define REPEAT(x) (((WORDTYPE)-1 / 0xFF) * (x))

/*
 * Unrolled "increment when the byte is below 192" variant with an added
 * four-bytes-at-once shortcut: the next four bytes are read as one 32-bit
 * value, and when none of them has either of its top two bits set they are
 * skipped together as four single-byte characters.
 *
 * str:    start of a UTF-8 string.
 * offset: number of characters to advance; negative values are not handled.
 *
 * Returns: pointer to the character `offset` characters after str.
 *
 * NOTE(review): the 0xC0 mask means the shortcut is taken only when all four
 * bytes are below 0x40, which excludes ASCII 0x40-0x7F (most letters); a
 * 0x80 mask looks sufficient for correctness -- confirm the intent.
 * NOTE(review): the guint32 read is done at whatever address s currently
 * holds, which is presumably not always 4-byte aligned -- confirm this is
 * acceptable on the target platforms.
 */
gchar *
pvanhoof_unwrapped2_utf8_offset_to_pointer  (const gchar *str,
                           glong        offset)    
{
  const guchar *s = (const guchar *)str;
  while (offset>=4) {
    offset -= 4;
    /* Any byte with bit 7 or bit 6 set forces the per-character path. */
    if (*(guint32 *)s & REPEAT(0xC0)) {
      if (*s<192) s++;
      else s = g_utf8_next_char (s);
      if (*s<192) s++;
      else s = g_utf8_next_char (s);
      if (*s<192) s++;
      else s = g_utf8_next_char (s);
      if (*s<192) s++;
      else s = g_utf8_next_char (s);
    } else 
      s += 4;
  }
  /* Fewer than four characters left: finish one at a time. */
  while (offset--)
    if (*s<192) s++;
    else s = g_utf8_next_char (s);
  
  return (gchar *)s;
}

/*
 * Word-at-a-time implementation.
 *
 * While at least sizeof (WORDTYPE) characters remain, a whole word is read
 * and the number of continuation bytes (10xxxxxx) in it is counted with bit
 * arithmetic.  Every byte of the word that is not a continuation byte starts
 * a character, so the remaining character count drops by
 * sizeof (WORDTYPE) minus that number.  Afterwards any continuation bytes
 * belonging to the last character entered are skipped and the rest is
 * walked with the skip table.
 *
 * str:    start of a UTF-8 string.
 * offset: number of characters to advance; negative values are not handled.
 *
 * Returns: pointer to the character `offset` characters after str.
 *
 * NOTE(review): words are read at whatever alignment str has, through a
 * pointer cast -- confirm this is acceptable on the target platforms.
 */
gchar*
behdad_utf8_offset_to_pointer (const gchar *str, glong        offset)
{
  const WORDTYPE *ws;
  const gchar *s;

  ws = (const WORDTYPE *)str;
  /* Compare as signed so offset is not silently converted to unsigned. */
  while (offset >= (glong) sizeof (WORDTYPE)) {
    register WORDTYPE w = *ws++;
    offset -= (glong) sizeof (WORDTYPE);
    w &= ~(w << 1);       /* bit 7 of a byte survives only if its bit 6 is clear */
    w &= REPEAT(0x80);    /* keep just bit 7 of every byte: set <=> 10xxxxxx */
    w >>=7;               /* each byte is now 0 or 1 */
    w *= REPEAT(1);       /* the top byte accumulates the sum of all bytes */
    w >>= (sizeof (WORDTYPE) - 1) * 8;
    offset += (glong) w;  /* continuation bytes did not start a character */
  }

  s = (const gchar *)ws;
  /* The last word may have ended in the middle of a character. */
  while ((*(const guchar*)s)>>6==2)
    s++;
  while (offset--)
    s = g_utf8_next_char (s);

  /* Explicit cast: the result type is non-const, as in the other variants. */
  return (gchar *)s;
}

/*
 * Byte-scanning implementation: moves forward one byte at a time and counts
 * a character each time the byte arrived at is not a continuation byte
 * (top two bits 10).  The byte at str itself is never examined.
 *
 * str:    start of a UTF-8 string.
 * offset: number of characters to advance; negative values are not handled.
 *
 * Returns: pointer to the character `offset` characters after str.
 */
gchar*
luis_utf8_offset_to_pointer (const gchar *str, glong        offset)
{
        const guchar *cursor = (const guchar *) str;

        while (offset != 0)
        {
                cursor++;
                if ((*cursor & 0xC0) != 0x80)
                        offset--;
        }

        return (gchar *) cursor;
}

/*
 * Hand-unrolled form of the byte-scanning implementation: four bytes are
 * examined per iteration while at least four characters remain, then the
 * rest one byte at a time.  A character is counted each time the byte
 * arrived at is not a continuation byte (top two bits 10).
 *
 * str:    start of a UTF-8 string.
 * offset: number of characters to advance; negative values are not handled.
 *
 * Returns: pointer to the character `offset` characters after str.
 */
gchar*
luis_unwrapped_utf8_offset_to_pointer (const gchar *str, glong        offset)
{
        const guchar *cursor = (const guchar *) str;

        while (offset >= 4)
        {
                cursor++;
                if ((*cursor & 0xC0) != 0x80)
                        offset--;
                cursor++;
                if ((*cursor & 0xC0) != 0x80)
                        offset--;
                cursor++;
                if ((*cursor & 0xC0) != 0x80)
                        offset--;
                cursor++;
                if ((*cursor & 0xC0) != 0x80)
                        offset--;
        }

        while (offset != 0)
        {
                cursor++;
                if ((*cursor & 0xC0) != 0x80)
                        offset--;
        }

        return (gchar *) cursor;
}

/*
 * Pattern-matching implementation: reads four bytes at a time as a 32-bit
 * value and recognizes three common layouts before falling back to the
 * skip table:
 *   - no byte has its high bit set: four single-byte characters;
 *   - two 2-byte sequences: two characters in four bytes;
 *   - a 3-byte lead, two high-bit bytes and another 3-byte lead: two
 *     characters occupying six bytes.
 *
 * str:    start of a UTF-8 string.
 * offset: number of characters to advance; negative values are not handled.
 *
 * Returns: pointer to the character `offset` characters after str.
 *
 * NOTE(review): the 32-bit reads go through a pointer that aliases the
 * byte pointer and may not be 4-byte aligned -- confirm this is acceptable
 * on the target platforms.
 */
gchar*
hpj_utf8_offset_to_pointer (const gchar *str, glong        offset)
{
  union
  {
    const guint32 *p32;
    const gchar   *p8;
  }
  pt;
 
  pt.p8 = str;
 
  while (offset >= 4)
    {
      guint32 seg    = *pt.p32;
      guint32 seg_hi = seg & 0x80808080;
 
      if (!seg_hi)
        {
          /* Four bytes without the high bit: four one-byte characters. */
          pt.p32++;
          offset -= 4;
          continue;
        }
      else if G_LIKELY (seg_hi == 0x80808080)
        {
          /* Lead-of-2, high-bit byte, lead-of-2, high-bit byte. */
          if ((seg & 0x80e080e0) == 0x80c080c0)
            {
              pt.p32++;
              offset -= 2;
              continue;
            }
 
          /* Lead-of-3, two high-bit bytes, lead-of-3: the second sequence
           * extends two bytes past the word that was read. */
          if ((seg & 0xf08080f0) == 0xe08080e0)
            {
              pt.p8 += 6;
              offset -= 2;
              continue;
            }
        }
 
      /* General case: step over one character.  The lead byte is read
       * through the byte pointer instead of being taken from the low byte
       * of seg, which is the first byte in memory only on little-endian
       * hosts. */
      pt.p8 += g_utf8_skip [*(const guchar *) pt.p8];
      offset--;
    }
 
  for ( ; offset; offset--)
    pt.p8 = g_utf8_next_char (pt.p8);
 
  return (gchar *) pt.p8;
}



/****************************/

/* Token-pasting helper; use JOIN so that macro arguments are expanded first. */
#define JOIN_(a,b) a##b
/* Pastes a and b into one identifier after expanding both (e.g. JOIN(WHICH,_x)). */
#define JOIN(a,b) JOIN_(a,b)

/*
 * Times the implementation selected at compile time with -DWHICH=<prefix>.
 *
 * Makes num_iters passes over the set.  For every string flagged valid the
 * selected <prefix>_utf8_offset_to_pointer() is asked for the position
 * num_chars characters in, which must be the string's terminating NUL
 * (str + num_bytes).  A different answer aborts the program through
 * error_and_exit().
 *
 * set:       strings to process.
 * results:   overwritten with the user CPU time spent and the number of
 *            strings and characters processed over all passes.
 * num_iters: number of passes over the whole set.
 */
static void
measure_strings (StringSet *set, LanguageResults *results, int num_iters)
{
        int i;
        gsize j;
        UserTimer *utimer;
        gchar*  (*utf8_offset_to_pointer) (const gchar *str, glong offset);

        /* choose implementation */
        utf8_offset_to_pointer = JOIN(WHICH,_utf8_offset_to_pointer);

        utimer = user_timer_new ();

        results->elapsed = 0.0;
        results->total_strings = 0;
        results->total_chars = 0;

        for (i = 0; i < num_iters; i++)
                for (j = 0; j < set->num_strings; j++) {
                        const String *entry = &set->strings[j];
                        gchar *expected;
                        gchar *p;

                        if (!entry->valid)
                                continue;

                        /*
                         * Alternative measurement, kept for reference: ask for
                         * the last character instead of the end of the string.
                         *
                         *   offset = entry->num_chars - 1;
                         *   if (offset < 0)
                         *     continue;
                         *   expected = g_utf8_find_prev_char (entry->str,
                         *                                     entry->str + entry->num_bytes);
                         */
                        expected = entry->str + entry->num_bytes;

                        p = utf8_offset_to_pointer (entry->str, entry->num_chars);
                        if (p != expected) {
                                /* Pointer differences are not int: print them as long. */
                                error_and_exit ("ERROR: expected %ld, got %ld\n",
                                                (long) (expected - entry->str),
                                                (long) (p - entry->str));
                        }

                        results->total_strings++;
                        results->total_chars += entry->num_chars;
                }

        results->elapsed = user_timer_elapsed (utimer);
        user_timer_destroy (utimer);
}

/* Command-line settings; the initializers are the defaults. */
static char **option_langs;                                  /* --lang values, NULL if none given */
static char *option_data_dir = DEFAULT_DATA_DIR;             /* --data-dir */
static char *option_name = DEFAULT_BENCHMARK_NAME;           /* --name */
static char *option_output;                                  /* --output, NULL means stdout */
static int option_num_iterations = DEFAULT_NUM_ITERATIONS;   /* --iterations */

/* Stream the XML report is written to; set up in main(). */
static FILE *output_file;

/* Option table handed to the GOption parser. */
static GOptionEntry option_entries[] = {
        { "lang", 'l', 0, G_OPTION_ARG_STRING_ARRAY, &option_langs,
          "Specify language name (e.g. \"es\" for Spanish), or \""
          ALL_LANGUAGES "\"", "string" },
        { "data-dir", 'd', 0, G_OPTION_ARG_FILENAME, &option_data_dir,
          "Directory where .dat files live", "dirname" },
        { "name", 'n', 0, G_OPTION_ARG_STRING, &option_name,
          "Name for benchmark", "string" },
        { "output", 'o', 0, G_OPTION_ARG_FILENAME, &option_output,
          "Output filename.  If not specified, standard output will be used.",
          "filename" },
        /* Without this entry option_num_iterations could never be changed. */
        { "iterations", 'i', 0, G_OPTION_ARG_INT, &option_num_iterations,
          "Number of passes over each string set", "count" },
        { NULL, 0, 0, 0, NULL, NULL, NULL }
};

/*
 * Benchmarks one data file and appends a <language> element with the
 * results to output_file.  A progress line is printed to stderr first.
 *
 * lang_name: text placed inside the <name> element.
 * filename:  path of the data file to load.
 */
static void
run_one_language (const char *lang_name, const char *filename)
{
        LanguageResults res;
        StringSet *strings;

        fprintf (stderr, "Processing %s\n", filename);

        strings = string_set_read (filename);
        measure_strings (strings, &res, option_num_iterations);
        string_set_free (strings);

        fputs ("  <language>\n", output_file);
        fprintf (output_file, "    <name>%s</name>\n", lang_name);
        fprintf (output_file, "    <elapsed>%f</elapsed>\n", res.elapsed);
        fprintf (output_file, "    <total_strings>%ld</total_strings>\n",
                 res.total_strings);
        fprintf (output_file, "    <total_chars>%ld</total_chars>\n",
                 res.total_chars);
        fputs ("  </language>\n", output_file);
}

/*
 * Benchmarks every file in option_data_dir whose name ends in ".dat".
 * The language name reported for a file is its name without that suffix.
 * Exits through error_and_exit() if the directory cannot be opened.
 */
static void
run_all_languages (void)
{
        GError *err = NULL;
        GDir *data_dir = g_dir_open (option_data_dir, 0, &err);
        const char *name;

        if (data_dir == NULL)
                error_and_exit ("Could not open directory: %s", err->message);

        for (name = g_dir_read_name (data_dir);
             name != NULL;
             name = g_dir_read_name (data_dir)) {
                char *lang;
                char *path;

                if (!g_str_has_suffix (name, ".dat"))
                        continue;

                /* Drop the four characters of ".dat" to get the language name. */
                lang = g_strndup (name, strlen (name) - 4);
                path = g_build_filename (option_data_dir, name, NULL);

                run_one_language (lang, path);

                g_free (lang);
                g_free (path);
        }

        g_dir_close (data_dir);
}

/*
 * Benchmarks each language listed in option_langs, loading
 * <option_data_dir>/<lang>.dat for every entry.
 */
static void
run_some_languages (void)
{
        int i;

        for (i = 0; option_langs[i] != NULL; i++) {
                char *lang = option_langs[i];
                char *dat_name = g_strconcat (lang, ".dat", NULL);
                char *path = g_build_filename (option_data_dir, dat_name, NULL);

                g_free (dat_name);

                run_one_language (lang, path);
                g_free (path);
        }
}

static gboolean
have_all_languages (char **langs)
{
        for (; *langs; langs++)
                if (strcmp (*langs, "ALL") == 0)
                        return TRUE;

        return FALSE;
}

int
main (int argc, char **argv)
{
        GOptionContext *option_ctx;
        
        option_ctx = g_option_context_new ("Options");
        g_option_context_add_main_entries (option_ctx, option_entries, NULL);
        if (!g_option_context_parse (option_ctx, &argc, &argv, NULL)) {
                fprintf (stderr, "Invalid usage; type \"%s --help\" for 
instructions.\n", argv[0]);
                exit (EXIT_FAILURE);
        }

        if (option_output) {
                output_file = fopen (option_output, "w");
                if (!output_file)
                        error_and_exit ("Could not create output file %s", 
option_output);
        } else
                output_file = stdout;

        fputs ("<?xml version=\"1.0\"?>\n", output_file);
        fputs ("<pango-benchmark>\n", output_file);
        fprintf (output_file, "  <name>%s</name>\n", option_name);

        if (option_langs == NULL || have_all_languages (option_langs))
                run_all_languages ();
        else
                run_some_languages ();

        fputs ("</pango-benchmark>\n", output_file);

        if (output_file != stdout)
                fclose (output_file);

        return 0;
}
_______________________________________________
Performance-list mailing list
[email protected]
http://mail.gnome.org/mailman/listinfo/performance-list

Reply via email to