Such version with macros and includes can look like this:

#ifdef MULTIBYTE
#define NEXT_X (x+= char_lens[i-1])
#define NEXT_Y (y+= y_char_len)
#define CMP (char_cmp(x, char_lens[i-1], y, y_char_len))
#else
#define NEXT_X (x++)
#define NEXT_Y (y++)
#define CMP (*x == *y)
#endif

static int
levenshtein_internal(text *s, text *t, int ins_c, int del_c, int sub_c)
{
    int            m,
                n,
                d;
    int           *prev;
    int           *curr;
    int            i,
                j;
    const char *x;
    const char *y;
#ifdef MULTIBYTE
    int           *char_lens;
    int        y_char_len;
#endif

    /*
     * We should calculate number of characters for multibyte encodings
     */
    m = pg_mbstrlen_with_len(VARDATA_ANY(s), VARSIZE_ANY_EXHDR(s));
    n = pg_mbstrlen_with_len(VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));

    /*
     * We can transform an empty s into t with n insertions, or a non-empty
t
     * into an empty s with m deletions.
     */
    if (m == 0)
        return n * ins_c;
    if (n == 0)
        return m * del_c;

    /*
     * For security concerns, restrict excessive CPU+RAM usage. (This
     * implementation uses O(m) memory and has O(mn) complexity.)
     */
    if (m > MAX_LEVENSHTEIN_STRLEN ||
        n > MAX_LEVENSHTEIN_STRLEN)
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                 errmsg("argument exceeds the maximum length of %d bytes",
                        MAX_LEVENSHTEIN_STRLEN)));

    /* One more cell for initialization column and row. */
    ++m;
    ++n;

    /*
     * Instead of building an (m+1)x(n+1) array, we'll use two different
     * arrays of size m+1 for storing accumulated values. At each step one
     * represents the "previous" row and one is the "current" row of the
     * notional large array.
     * For multibyte encoding we'll also store array of lengths of
     * characters of first string in order to avoid great number of
      * pg_mblen calls.
     */
#ifdef MULTIBYTE
    prev = (int *) palloc(3 * m * sizeof(int));
    curr = prev + m;
    char_lens = prev + 2 * m;
    for (i = 0, x = VARDATA_ANY(s); i < m - 1; i++)
    {
        char_lens[i] = pg_mblen(x);
        x += char_lens[i];
    }
    char_lens[i] = 0;
#else
    prev = (int *) palloc(2 * m * sizeof(int));
    curr = prev + m;
#endif

    /* Initialize the "previous" row to 0..cols */
    for (i = 0; i < m; i++)
        prev[i] = i * del_c;

    /*
     * For multibyte encoding we should increase x and y pointers by the
     * results of pg_mblen function. Also we should use CHAR_CMP macros
     * for character comparison.
     */
    for (y = VARDATA_ANY(t), j = 1; j < n; NEXT_Y, j++)
    {
        int           *temp;
#ifdef MULTIBYTE
        y_char_len = pg_mblen(y);
#endif

        curr[0] = j * ins_c;

        for (x = VARDATA_ANY(s), i = 1; i < m; NEXT_X, i++)
        {
            int            ins;
            int            del;
            int            sub;

            ins = prev[i] + ins_c;
            del = curr[i - 1] + del_c;
            sub = prev[i - 1] + (CMP ? 0 : sub_c);
            curr[i] = Min(ins, del);
            curr[i] = Min(curr[i], sub);
        }

        temp = curr;
        curr = prev;
        prev = temp;
    }

    d = prev[m - 1];

    /*
     * Because the final value was swapped from the previous row to the
     * current row, that's where we'll find it.
     */
    return d;
}

What do you thing about it?

----
With best regards,
Alexander Korotkov.

Reply via email to