On Fri, Nov 7, 2014 at 12:57 PM, Robert Haas <robertmh...@gmail.com> wrote: > Based on this review from a month ago, I'm going to mark this Waiting > on Author. If nobody updates the patch in a few days, I'll mark it > Returned with Feedback. Thanks.
Attached revision fixes the compiler warning that Heikki complained about. I maintain SQL-callable stub functions from within contrib, rather than follow Michael's approach. In other words, very little has changed from my revision from July last [1]. Reminder: I maintain a slight preference for only offering one suggestion per relation RTE, which is what this revision does (so no change there). If a committer who picks this up wants me to alter that, I don't mind doing so; since only Michael spoke up on this, I've kept things my way. This is not a completion mechanism; it is supposed to work on *complete* column references with slight misspellings (e.g. incorrect use of plurals, or column references with an omitted underscore character). Weighing Tom's concerns about suggestions that are of absolute low quality is what makes me conclude that this is the thing to do. [1] http://www.postgresql.org/message-id/CAM3SWZTzQO=oy4jmfb-65iefie8ihukderk-0oljetm8dsr...@mail.gmail.com -- Peter Geoghegan
From 830bf9f668972ba6b531df5d4fcbd73db3472434 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan <p...@heroku.com> Date: Sat, 30 Nov 2013 23:15:00 -0800 Subject: [PATCH] Levenshtein distance column HINT Add a new HINT -- a guess as to what column the user might have intended to reference, to be shown in various contexts where an ERRCODE_UNDEFINED_COLUMN error is raised. The user will see this HINT when he or she fat-fingers a column reference in his or her ad-hoc SQL query, or incorrectly pluralizes or fails to pluralize a column reference. The HINT suggests a column in the range table with the lowest Levenshtein distance, or the tied-for-best pair of matching columns in the event of there being exactly two equally likely candidates (iff each candidate column comes from a separate RTE). Limiting the cases where multiple equally likely suggestions are all offered at once is a measure against suggestions that are of low quality in an absolute sense. A further, final measure is taken against suggestions that are of low absolute quality: If the distance exceeds a normalized distance threshold, no suggestion is given. The contrib Levenshtein distance implementation is moved from /contrib to core. However, the SQL-callable functions may only be used with the fuzzystmatch extension installed, just as before -- the fuzzystmatch definitions become mere forwarding stubs. --- contrib/fuzzystrmatch/Makefile | 3 - contrib/fuzzystrmatch/fuzzystrmatch.c | 81 ++++-- contrib/fuzzystrmatch/levenshtein.c | 403 ------------------------------ src/backend/parser/parse_expr.c | 9 +- src/backend/parser/parse_func.c | 2 +- src/backend/parser/parse_relation.c | 319 ++++++++++++++++++++--- src/backend/utils/adt/Makefile | 2 + src/backend/utils/adt/levenshtein.c | 393 +++++++++++++++++++++++++++++ src/backend/utils/adt/varlena.c | 25 ++ src/include/parser/parse_relation.h | 3 +- src/include/utils/builtins.h | 5 + src/test/regress/expected/alter_table.out | 8 + src/test/regress/expected/join.out | 39 +++ src/test/regress/expected/plpgsql.out | 1 + src/test/regress/expected/rowtypes.out | 1 + src/test/regress/expected/rules.out | 1 + src/test/regress/expected/without_oid.out | 1 + src/test/regress/sql/join.sql | 24 ++ 18 files changed, 849 insertions(+), 471 deletions(-) delete mode 100644 contrib/fuzzystrmatch/levenshtein.c create mode 100644 src/backend/utils/adt/levenshtein.c diff --git a/contrib/fuzzystrmatch/Makefile b/contrib/fuzzystrmatch/Makefile index 024265d..0327d95 100644 --- a/contrib/fuzzystrmatch/Makefile +++ b/contrib/fuzzystrmatch/Makefile @@ -17,6 +17,3 @@ top_builddir = ../.. include $(top_builddir)/src/Makefile.global include $(top_srcdir)/contrib/contrib-global.mk endif - -# levenshtein.c is #included by fuzzystrmatch.c -fuzzystrmatch.o: fuzzystrmatch.c levenshtein.c diff --git a/contrib/fuzzystrmatch/fuzzystrmatch.c b/contrib/fuzzystrmatch/fuzzystrmatch.c index 7a53d8a..62e650f 100644 --- a/contrib/fuzzystrmatch/fuzzystrmatch.c +++ b/contrib/fuzzystrmatch/fuzzystrmatch.c @@ -154,23 +154,6 @@ getcode(char c) /* These prevent GH from becoming F */ #define NOGHTOF(c) (getcode(c) & 16) /* BDH */ -/* Faster than memcmp(), for this use case. */ -static inline bool -rest_of_char_same(const char *s1, const char *s2, int len) -{ - while (len > 0) - { - len--; - if (s1[len] != s2[len]) - return false; - } - return true; -} - -#include "levenshtein.c" -#define LEVENSHTEIN_LESS_EQUAL -#include "levenshtein.c" - PG_FUNCTION_INFO_V1(levenshtein_with_costs); Datum levenshtein_with_costs(PG_FUNCTION_ARGS) @@ -180,8 +163,20 @@ levenshtein_with_costs(PG_FUNCTION_ARGS) int ins_c = PG_GETARG_INT32(2); int del_c = PG_GETARG_INT32(3); int sub_c = PG_GETARG_INT32(4); - - PG_RETURN_INT32(levenshtein_internal(src, dst, ins_c, del_c, sub_c)); + const char *s_data; + const char *t_data; + int s_bytes, + t_bytes; + + /* Extract a pointer to the actual character data */ + s_data = VARDATA_ANY(src); + t_data = VARDATA_ANY(dst); + /* Determine length of each string in bytes and characters */ + s_bytes = VARSIZE_ANY_EXHDR(src); + t_bytes = VARSIZE_ANY_EXHDR(dst); + + PG_RETURN_INT32(varstr_leven(s_data, s_bytes, t_data, t_bytes, ins_c, + del_c, sub_c)); } @@ -191,8 +186,20 @@ levenshtein(PG_FUNCTION_ARGS) { text *src = PG_GETARG_TEXT_PP(0); text *dst = PG_GETARG_TEXT_PP(1); - - PG_RETURN_INT32(levenshtein_internal(src, dst, 1, 1, 1)); + const char *s_data; + const char *t_data; + int s_bytes, + t_bytes; + + /* Extract a pointer to the actual character data */ + s_data = VARDATA_ANY(src); + t_data = VARDATA_ANY(dst); + /* Determine length of each string in bytes and characters */ + s_bytes = VARSIZE_ANY_EXHDR(src); + t_bytes = VARSIZE_ANY_EXHDR(dst); + + PG_RETURN_INT32(varstr_leven(s_data, s_bytes, t_data, t_bytes, 1, 1, + 1)); } @@ -206,8 +213,20 @@ levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS) int del_c = PG_GETARG_INT32(3); int sub_c = PG_GETARG_INT32(4); int max_d = PG_GETARG_INT32(5); - - PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, ins_c, del_c, sub_c, max_d)); + const char *s_data; + const char *t_data; + int s_bytes, + t_bytes; + + /* Extract a pointer to the actual character data */ + s_data = VARDATA_ANY(src); + t_data = VARDATA_ANY(dst); + /* Determine length of each string in bytes and characters */ + s_bytes = VARSIZE_ANY_EXHDR(src); + t_bytes = VARSIZE_ANY_EXHDR(dst); + + PG_RETURN_INT32(varstr_leven_less_equal(s_data, s_bytes, t_data, t_bytes, + ins_c, del_c, sub_c, max_d)); } @@ -218,8 +237,20 @@ levenshtein_less_equal(PG_FUNCTION_ARGS) text *src = PG_GETARG_TEXT_PP(0); text *dst = PG_GETARG_TEXT_PP(1); int max_d = PG_GETARG_INT32(2); - - PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, 1, 1, 1, max_d)); + const char *s_data; + const char *t_data; + int s_bytes, + t_bytes; + + /* Extract a pointer to the actual character data */ + s_data = VARDATA_ANY(src); + t_data = VARDATA_ANY(dst); + /* Determine length of each string in bytes and characters */ + s_bytes = VARSIZE_ANY_EXHDR(src); + t_bytes = VARSIZE_ANY_EXHDR(dst); + + PG_RETURN_INT32(varstr_leven_less_equal(s_data, s_bytes, t_data, t_bytes, + 1, 1, 1, max_d)); } diff --git a/contrib/fuzzystrmatch/levenshtein.c b/contrib/fuzzystrmatch/levenshtein.c deleted file mode 100644 index 4f37a54..0000000 --- a/contrib/fuzzystrmatch/levenshtein.c +++ /dev/null @@ -1,403 +0,0 @@ -/* - * levenshtein.c - * - * Functions for "fuzzy" comparison of strings - * - * Joe Conway <m...@joeconway.com> - * - * Copyright (c) 2001-2014, PostgreSQL Global Development Group - * ALL RIGHTS RESERVED; - * - * levenshtein() - * ------------- - * Written based on a description of the algorithm by Michael Gilleland - * found at http://www.merriampark.com/ld.htm - * Also looked at levenshtein.c in the PHP 4.0.6 distribution for - * inspiration. - * Configurable penalty costs extension is introduced by Volkan - * YAZICI <volkan.yaz...@gmail.com>. - */ - -/* - * External declarations for exported functions - */ -#ifdef LEVENSHTEIN_LESS_EQUAL -static int levenshtein_less_equal_internal(text *s, text *t, - int ins_c, int del_c, int sub_c, int max_d); -#else -static int levenshtein_internal(text *s, text *t, - int ins_c, int del_c, int sub_c); -#endif - -#define MAX_LEVENSHTEIN_STRLEN 255 - - -/* - * Calculates Levenshtein distance metric between supplied strings. Generally - * (1, 1, 1) penalty costs suffices for common cases, but your mileage may - * vary. - * - * One way to compute Levenshtein distance is to incrementally construct - * an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number - * of operations required to transform the first i characters of s into - * the first j characters of t. The last column of the final row is the - * answer. - * - * We use that algorithm here with some modification. In lieu of holding - * the entire array in memory at once, we'll just use two arrays of size - * m+1 for storing accumulated values. At each step one array represents - * the "previous" row and one is the "current" row of the notional large - * array. - * - * If max_d >= 0, we only need to provide an accurate answer when that answer - * is less than or equal to the bound. From any cell in the matrix, there is - * theoretical "minimum residual distance" from that cell to the last column - * of the final row. This minimum residual distance is zero when the - * untransformed portions of the strings are of equal length (because we might - * get lucky and find all the remaining characters matching) and is otherwise - * based on the minimum number of insertions or deletions needed to make them - * equal length. The residual distance grows as we move toward the upper - * right or lower left corners of the matrix. When the max_d bound is - * usefully tight, we can use this property to avoid computing the entirety - * of each row; instead, we maintain a start_column and stop_column that - * identify the portion of the matrix close to the diagonal which can still - * affect the final answer. - */ -static int -#ifdef LEVENSHTEIN_LESS_EQUAL -levenshtein_less_equal_internal(text *s, text *t, - int ins_c, int del_c, int sub_c, int max_d) -#else -levenshtein_internal(text *s, text *t, - int ins_c, int del_c, int sub_c) -#endif -{ - int m, - n, - s_bytes, - t_bytes; - int *prev; - int *curr; - int *s_char_len = NULL; - int i, - j; - const char *s_data; - const char *t_data; - const char *y; - - /* - * For levenshtein_less_equal_internal, we have real variables called - * start_column and stop_column; otherwise it's just short-hand for 0 and - * m. - */ -#ifdef LEVENSHTEIN_LESS_EQUAL - int start_column, - stop_column; - -#undef START_COLUMN -#undef STOP_COLUMN -#define START_COLUMN start_column -#define STOP_COLUMN stop_column -#else -#undef START_COLUMN -#undef STOP_COLUMN -#define START_COLUMN 0 -#define STOP_COLUMN m -#endif - - /* Extract a pointer to the actual character data. */ - s_data = VARDATA_ANY(s); - t_data = VARDATA_ANY(t); - - /* Determine length of each string in bytes and characters. */ - s_bytes = VARSIZE_ANY_EXHDR(s); - t_bytes = VARSIZE_ANY_EXHDR(t); - m = pg_mbstrlen_with_len(s_data, s_bytes); - n = pg_mbstrlen_with_len(t_data, t_bytes); - - /* - * We can transform an empty s into t with n insertions, or a non-empty t - * into an empty s with m deletions. - */ - if (!m) - return n * ins_c; - if (!n) - return m * del_c; - - /* - * For security concerns, restrict excessive CPU+RAM usage. (This - * implementation uses O(m) memory and has O(mn) complexity.) - */ - if (m > MAX_LEVENSHTEIN_STRLEN || - n > MAX_LEVENSHTEIN_STRLEN) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("argument exceeds the maximum length of %d bytes", - MAX_LEVENSHTEIN_STRLEN))); - -#ifdef LEVENSHTEIN_LESS_EQUAL - /* Initialize start and stop columns. */ - start_column = 0; - stop_column = m + 1; - - /* - * If max_d >= 0, determine whether the bound is impossibly tight. If so, - * return max_d + 1 immediately. Otherwise, determine whether it's tight - * enough to limit the computation we must perform. If so, figure out - * initial stop column. - */ - if (max_d >= 0) - { - int min_theo_d; /* Theoretical minimum distance. */ - int max_theo_d; /* Theoretical maximum distance. */ - int net_inserts = n - m; - - min_theo_d = net_inserts < 0 ? - -net_inserts * del_c : net_inserts * ins_c; - if (min_theo_d > max_d) - return max_d + 1; - if (ins_c + del_c < sub_c) - sub_c = ins_c + del_c; - max_theo_d = min_theo_d + sub_c * Min(m, n); - if (max_d >= max_theo_d) - max_d = -1; - else if (ins_c + del_c > 0) - { - /* - * Figure out how much of the first row of the notional matrix we - * need to fill in. If the string is growing, the theoretical - * minimum distance already incorporates the cost of deleting the - * number of characters necessary to make the two strings equal in - * length. Each additional deletion forces another insertion, so - * the best-case total cost increases by ins_c + del_c. If the - * string is shrinking, the minimum theoretical cost assumes no - * excess deletions; that is, we're starting no further right than - * column n - m. If we do start further right, the best-case - * total cost increases by ins_c + del_c for each move right. - */ - int slack_d = max_d - min_theo_d; - int best_column = net_inserts < 0 ? -net_inserts : 0; - - stop_column = best_column + (slack_d / (ins_c + del_c)) + 1; - if (stop_column > m) - stop_column = m + 1; - } - } -#endif - - /* - * In order to avoid calling pg_mblen() repeatedly on each character in s, - * we cache all the lengths before starting the main loop -- but if all - * the characters in both strings are single byte, then we skip this and - * use a fast-path in the main loop. If only one string contains - * multi-byte characters, we still build the array, so that the fast-path - * needn't deal with the case where the array hasn't been initialized. - */ - if (m != s_bytes || n != t_bytes) - { - int i; - const char *cp = s_data; - - s_char_len = (int *) palloc((m + 1) * sizeof(int)); - for (i = 0; i < m; ++i) - { - s_char_len[i] = pg_mblen(cp); - cp += s_char_len[i]; - } - s_char_len[i] = 0; - } - - /* One more cell for initialization column and row. */ - ++m; - ++n; - - /* Previous and current rows of notional array. */ - prev = (int *) palloc(2 * m * sizeof(int)); - curr = prev + m; - - /* - * To transform the first i characters of s into the first 0 characters of - * t, we must perform i deletions. - */ - for (i = START_COLUMN; i < STOP_COLUMN; i++) - prev[i] = i * del_c; - - /* Loop through rows of the notional array */ - for (y = t_data, j = 1; j < n; j++) - { - int *temp; - const char *x = s_data; - int y_char_len = n != t_bytes + 1 ? pg_mblen(y) : 1; - -#ifdef LEVENSHTEIN_LESS_EQUAL - - /* - * In the best case, values percolate down the diagonal unchanged, so - * we must increment stop_column unless it's already on the right end - * of the array. The inner loop will read prev[stop_column], so we - * have to initialize it even though it shouldn't affect the result. - */ - if (stop_column < m) - { - prev[stop_column] = max_d + 1; - ++stop_column; - } - - /* - * The main loop fills in curr, but curr[0] needs a special case: to - * transform the first 0 characters of s into the first j characters - * of t, we must perform j insertions. However, if start_column > 0, - * this special case does not apply. - */ - if (start_column == 0) - { - curr[0] = j * ins_c; - i = 1; - } - else - i = start_column; -#else - curr[0] = j * ins_c; - i = 1; -#endif - - /* - * This inner loop is critical to performance, so we include a - * fast-path to handle the (fairly common) case where no multibyte - * characters are in the mix. The fast-path is entitled to assume - * that if s_char_len is not initialized then BOTH strings contain - * only single-byte characters. - */ - if (s_char_len != NULL) - { - for (; i < STOP_COLUMN; i++) - { - int ins; - int del; - int sub; - int x_char_len = s_char_len[i - 1]; - - /* - * Calculate costs for insertion, deletion, and substitution. - * - * When calculating cost for substitution, we compare the last - * character of each possibly-multibyte character first, - * because that's enough to rule out most mis-matches. If we - * get past that test, then we compare the lengths and the - * remaining bytes. - */ - ins = prev[i] + ins_c; - del = curr[i - 1] + del_c; - if (x[x_char_len - 1] == y[y_char_len - 1] - && x_char_len == y_char_len && - (x_char_len == 1 || rest_of_char_same(x, y, x_char_len))) - sub = prev[i - 1]; - else - sub = prev[i - 1] + sub_c; - - /* Take the one with minimum cost. */ - curr[i] = Min(ins, del); - curr[i] = Min(curr[i], sub); - - /* Point to next character. */ - x += x_char_len; - } - } - else - { - for (; i < STOP_COLUMN; i++) - { - int ins; - int del; - int sub; - - /* Calculate costs for insertion, deletion, and substitution. */ - ins = prev[i] + ins_c; - del = curr[i - 1] + del_c; - sub = prev[i - 1] + ((*x == *y) ? 0 : sub_c); - - /* Take the one with minimum cost. */ - curr[i] = Min(ins, del); - curr[i] = Min(curr[i], sub); - - /* Point to next character. */ - x++; - } - } - - /* Swap current row with previous row. */ - temp = curr; - curr = prev; - prev = temp; - - /* Point to next character. */ - y += y_char_len; - -#ifdef LEVENSHTEIN_LESS_EQUAL - - /* - * This chunk of code represents a significant performance hit if used - * in the case where there is no max_d bound. This is probably not - * because the max_d >= 0 test itself is expensive, but rather because - * the possibility of needing to execute this code prevents tight - * optimization of the loop as a whole. - */ - if (max_d >= 0) - { - /* - * The "zero point" is the column of the current row where the - * remaining portions of the strings are of equal length. There - * are (n - 1) characters in the target string, of which j have - * been transformed. There are (m - 1) characters in the source - * string, so we want to find the value for zp where (n - 1) - j = - * (m - 1) - zp. - */ - int zp = j - (n - m); - - /* Check whether the stop column can slide left. */ - while (stop_column > 0) - { - int ii = stop_column - 1; - int net_inserts = ii - zp; - - if (prev[ii] + (net_inserts > 0 ? net_inserts * ins_c : - -net_inserts * del_c) <= max_d) - break; - stop_column--; - } - - /* Check whether the start column can slide right. */ - while (start_column < stop_column) - { - int net_inserts = start_column - zp; - - if (prev[start_column] + - (net_inserts > 0 ? net_inserts * ins_c : - -net_inserts * del_c) <= max_d) - break; - - /* - * We'll never again update these values, so we must make sure - * there's nothing here that could confuse any future - * iteration of the outer loop. - */ - prev[start_column] = max_d + 1; - curr[start_column] = max_d + 1; - if (start_column != 0) - s_data += (s_char_len != NULL) ? s_char_len[start_column - 1] : 1; - start_column++; - } - - /* If they cross, we're going to exceed the bound. */ - if (start_column >= stop_column) - return max_d + 1; - } -#endif - } - - /* - * Because the final value was swapped from the previous row to the - * current row, that's where we'll find it. - */ - return prev[m - 1]; -} diff --git a/src/backend/parser/parse_expr.c b/src/backend/parser/parse_expr.c index 4a8aaf6..9866198 100644 --- a/src/backend/parser/parse_expr.c +++ b/src/backend/parser/parse_expr.c @@ -621,7 +621,8 @@ transformColumnRef(ParseState *pstate, ColumnRef *cref) colname = strVal(field2); /* Try to identify as a column of the RTE */ - node = scanRTEForColumn(pstate, rte, colname, cref->location); + node = scanRTEForColumn(pstate, rte, colname, cref->location, + NULL, NULL); if (node == NULL) { /* Try it as a function call on the whole row */ @@ -666,7 +667,8 @@ transformColumnRef(ParseState *pstate, ColumnRef *cref) colname = strVal(field3); /* Try to identify as a column of the RTE */ - node = scanRTEForColumn(pstate, rte, colname, cref->location); + node = scanRTEForColumn(pstate, rte, colname, cref->location, + NULL, NULL); if (node == NULL) { /* Try it as a function call on the whole row */ @@ -724,7 +726,8 @@ transformColumnRef(ParseState *pstate, ColumnRef *cref) colname = strVal(field4); /* Try to identify as a column of the RTE */ - node = scanRTEForColumn(pstate, rte, colname, cref->location); + node = scanRTEForColumn(pstate, rte, colname, cref->location, + NULL, NULL); if (node == NULL) { /* Try it as a function call on the whole row */ diff --git a/src/backend/parser/parse_func.c b/src/backend/parser/parse_func.c index 9ebd3fd..e128adf 100644 --- a/src/backend/parser/parse_func.c +++ b/src/backend/parser/parse_func.c @@ -1779,7 +1779,7 @@ ParseComplexProjection(ParseState *pstate, char *funcname, Node *first_arg, ((Var *) first_arg)->varno, ((Var *) first_arg)->varlevelsup); /* Return a Var if funcname matches a column, else NULL */ - return scanRTEForColumn(pstate, rte, funcname, location); + return scanRTEForColumn(pstate, rte, funcname, location, NULL, NULL); } /* diff --git a/src/backend/parser/parse_relation.c b/src/backend/parser/parse_relation.c index 478584d..1697b77 100644 --- a/src/backend/parser/parse_relation.c +++ b/src/backend/parser/parse_relation.c @@ -15,6 +15,7 @@ #include "postgres.h" #include <ctype.h> +#include <limits.h> #include "access/htup_details.h" #include "access/sysattr.h" @@ -520,6 +521,22 @@ GetCTEForRTE(ParseState *pstate, RangeTblEntry *rte, int rtelevelsup) } /* + * distanceName + * Return Levenshtein distance between an actual column name and possible + * partial match. + */ +static int +distanceName(const char *actual, const char *match, int max) +{ + int len = strlen(actual), + match_len = strlen(match); + + /* Charge half as much per deletion as per insertion or per substitution */ + return varstr_leven_less_equal(actual, len, match, match_len, + 2, 1, 2, max); +} + +/* * scanRTEForColumn * Search the column names of a single RTE for the given name. * If found, return an appropriate Var node, else return NULL. @@ -527,10 +544,24 @@ GetCTEForRTE(ParseState *pstate, RangeTblEntry *rte, int rtelevelsup) * * Side effect: if we find a match, mark the RTE as requiring read access * for the column. + * + * For those callers that will settle for a fuzzy match (for the purposes of + * building diagnostic messages), we match the column attribute whose name has + * the lowest Levenshtein distance from colname, setting *closest and + * *distance. Such callers should not rely on the return value (even when + * there is an exact match), nor should they expect the usual side effect + * (unless there is an exact match). This hardly matters in practice, since an + * error is imminent. + * + * If there are two or more attributes in the range table entry tied for + * closest, accurately report the shortest distance found overall, while not + * setting a "closest" attribute on the assumption that only a per-entry single + * closest match is useful. Note that we never consider system column names + * when performing fuzzy matching. */ Node * scanRTEForColumn(ParseState *pstate, RangeTblEntry *rte, char *colname, - int location) + int location, AttrNumber *closest, int *distance) { Node *result = NULL; int attnum = 0; @@ -548,12 +579,16 @@ scanRTEForColumn(ParseState *pstate, RangeTblEntry *rte, char *colname, * Should this somehow go wrong and we try to access a dropped column, * we'll still catch it by virtue of the checks in * get_rte_attribute_type(), which is called by make_var(). That routine - * has to do a cache lookup anyway, so the check there is cheap. + * has to do a cache lookup anyway, so the check there is cheap. Callers + * interested in finding match with shortest distance need to defend + * against this directly, though. */ foreach(c, rte->eref->colnames) { + const char *attcolname = strVal(lfirst(c)); + attnum++; - if (strcmp(strVal(lfirst(c)), colname) == 0) + if (strcmp(attcolname, colname) == 0) { if (result) ereport(ERROR, @@ -566,6 +601,39 @@ scanRTEForColumn(ParseState *pstate, RangeTblEntry *rte, char *colname, markVarForSelectPriv(pstate, var, rte); result = (Node *) var; } + + if (distance && *distance != 0) + { + if (result) + { + /* Exact match just found */ + *distance = 0; + } + else + { + int lowestdistance = *distance; + int thisdistance = distanceName(attcolname, colname, + lowestdistance); + + if (thisdistance >= lowestdistance) + { + /* + * This match distance may equal a prior match within this + * same range table. When that happens, the prior match is + * discarded as worthless, since a single best match is + * required within a RTE. + */ + if (thisdistance == lowestdistance) + *closest = InvalidAttrNumber; + + continue; + } + + /* Store new lowest observed distance for RT */ + *distance = thisdistance; + } + *closest = attnum; + } } /* @@ -642,7 +710,8 @@ colNameToVar(ParseState *pstate, char *colname, bool localonly, continue; /* use orig_pstate here to get the right sublevels_up */ - newresult = scanRTEForColumn(orig_pstate, rte, colname, location); + newresult = scanRTEForColumn(orig_pstate, rte, colname, location, + NULL, NULL); if (newresult) { @@ -668,8 +737,14 @@ colNameToVar(ParseState *pstate, char *colname, bool localonly, /* * searchRangeTableForCol - * See if any RangeTblEntry could possibly provide the given column name. - * If so, return a pointer to the RangeTblEntry; else return NULL. + * See if any RangeTblEntry could possibly provide the given column name (or + * find the best match available). Returns a list of equally likely + * candidates, or NIL in the event of no plausible candidate. + * + * Column name may be matched fuzzily; we provide the closet columns if there + * was not an exact match. Caller can depend on passed closest array to find + * right attribute within corresponding (first and second) returned list RTEs. + * If closest attributes are InvalidAttrNumber, that indicates an exact match. * * This is different from colNameToVar in that it considers every entry in * the ParseState's rangetable(s), not only those that are currently visible @@ -678,26 +753,145 @@ colNameToVar(ParseState *pstate, char *colname, bool localonly, * matches, but only one will be returned). This must be used ONLY as a * heuristic in giving suitable error messages. See errorMissingColumn. */ -static RangeTblEntry * -searchRangeTableForCol(ParseState *pstate, char *colname, int location) +static List * +searchRangeTableForCol(ParseState *pstate, const char *alias, char *colname, + int location, AttrNumber closest[2]) { - ParseState *orig_pstate = pstate; + ParseState *orig_pstate = pstate; + int distance = INT_MAX; + List *matchedrte = NIL; + ListCell *l; + int i; while (pstate != NULL) { - ListCell *l; - foreach(l, pstate->p_rtable) { - RangeTblEntry *rte = (RangeTblEntry *) lfirst(l); + RangeTblEntry *rte = (RangeTblEntry *) lfirst(l); + AttrNumber rteclosest = InvalidAttrNumber; + int rtdistance = INT_MAX; + bool wrongalias; - if (scanRTEForColumn(orig_pstate, rte, colname, location)) - return rte; + /* + * Get single best match from each RTE, or no match for RTE if + * there is a tie for best match within a given RTE + */ + scanRTEForColumn(orig_pstate, rte, colname, location, &rteclosest, + &rtdistance); + + /* Was alias provided by user that does not match entry's alias? */ + wrongalias = (alias && strcmp(alias, rte->eref->aliasname) != 0); + + if (rtdistance == 0) + { + /* Exact match (for "wrong alias" or "wrong level" cases) */ + closest[0] = wrongalias? rteclosest : InvalidAttrNumber; + + /* + * Any exact match is always the uncontested best match. It + * doesn't seem worth considering the case where there are + * multiple exact matches, so we're done. + */ + matchedrte = lappend(NIL, rte); + return matchedrte; + } + + /* + * Charge extra (for inexact matches only) when an alias was + * specified that differs from what might have been used to + * correctly qualify this RTE's closest column + */ + if (wrongalias) + rtdistance += 3; + + if (rteclosest != InvalidAttrNumber) + { + if (rtdistance >= distance) + { + /* + * Perhaps record this attribute as being just as close in + * distance to closest attribute observed so far across + * entire range table. Iff this distance is ultimately the + * lowest distance observed overall, it may end up as the + * second match. + */ + if (rtdistance == distance) + { + closest[1] = rteclosest; + matchedrte = lappend(matchedrte, rte); + } + + continue; + } + + /* + * One best match (better than any others in previous RTEs) was + * found within this RTE + */ + distance = rtdistance; + /* New uncontested best match */ + matchedrte = lappend(NIL, rte); + closest[0] = rteclosest; + } + else + { + /* + * Even though there were perhaps multiple joint-best matches + * within this RTE (implying that there can be no attribute + * suggestion from it), the shortest distance should still + * serve as the distance for later RTEs to beat (but naturally + * only if it happens to be the lowest so far across the entire + * range table). + */ + distance = Min(distance, rtdistance); + } } pstate = pstate->parentParseState; } - return NULL; + + /* + * Too many equally close partial matches found? + * + * It's useful to provide two matches for the common case where two range + * tables each have one equally distant candidate column, as when an + * unqualified (and therefore would-be ambiguous) column name is specified + * which is also misspelled by the user. It seems unhelpful to show no + * hint when this occurs, since in practice one attribute probably + * references the other in a foreign key relationship. However, when there + * are more than 2 range tables with equally distant matches that's + * probably because the matches are not useful, so don't suggest anything. + */ + if (list_length(matchedrte) > 2) + return NIL; + + /* + * Handle dropped columns, which can appear here as empty colnames per + * remarks within scanRTEForColumn(). If either the first or second + * suggested attributes are dropped, do not provide any suggestion. + */ + i = 0; + foreach(l, matchedrte) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(l); + char *closestcol; + + closestcol = strVal(list_nth(rte->eref->colnames, closest[i++] - 1)); + + if (strcmp(closestcol, "") == 0) + return NIL; + } + + /* + * Distance must be less than a normalized threshold in order to avoid + * completely ludicrous suggestions. Note that a distance of 6 will be + * seen when 6 deletions are required against actual attribute name, or 3 + * insertions/substitutions. + */ + if (distance > 6 && distance > strlen(colname) / 2) + return NIL; + + return matchedrte; } /* @@ -2856,40 +3050,95 @@ errorMissingRTE(ParseState *pstate, RangeVar *relation) * Generate a suitable error about a missing column. * * Since this is a very common type of error, we work rather hard to - * produce a helpful message. + * produce a helpful message, going so far as to guess user's intent + * when a missing column name is probably intended to reference one of + * two would-be ambiguous attributes (when no alias/qualification was + * provided). */ void errorMissingColumn(ParseState *pstate, char *relname, char *colname, int location) { - RangeTblEntry *rte; + List *matchedrte; + AttrNumber closest[2]; + RangeTblEntry *rte1 = NULL, + *rte2 = NULL; + char *closestcol1 = NULL; + char *closestcol2 = NULL; /* - * If relname was given, just play dumb and report it. (In practice, a - * bad qualification name should end up at errorMissingRTE, not here, so - * no need to work hard on this case.) + * closest[0] will remain InvalidAttrNumber in event of exact match, and in + * the event of an exact match there is only ever one suggestion */ - if (relname) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_COLUMN), - errmsg("column %s.%s does not exist", relname, colname), - parser_errposition(pstate, location))); + closest[0] = closest[1] = InvalidAttrNumber; /* - * Otherwise, search the entire rtable looking for possible matches. If - * we find one, emit a hint about it. + * Search the entire rtable looking for possible matches. If we find one, + * emit a hint about it. * * TODO: improve this code (and also errorMissingRTE) to mention using * LATERAL if appropriate. */ - rte = searchRangeTableForCol(pstate, colname, location); - - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_COLUMN), - errmsg("column \"%s\" does not exist", colname), - rte ? errhint("There is a column named \"%s\" in table \"%s\", but it cannot be referenced from this part of the query.", - colname, rte->eref->aliasname) : 0, - parser_errposition(pstate, location))); + matchedrte = searchRangeTableForCol(pstate, relname, colname, location, + closest); + + /* + * In practice a bad qualification name should end up at errorMissingRTE, + * not here, so no need to work hard on this case. + * + * Extract RTEs for best match, if any, and joint best match, if any. + */ + if (matchedrte) + { + rte1 = (RangeTblEntry *) lfirst(list_head(matchedrte)); + + if (list_length(matchedrte) > 1) + rte2 = (RangeTblEntry *) lsecond(matchedrte); + + if (rte1 && closest[0] != InvalidAttrNumber) + closestcol1 = strVal(list_nth(rte1->eref->colnames, closest[0] - 1)); + + if (rte2 && closest[1] != InvalidAttrNumber) + closestcol2 = strVal(list_nth(rte2->eref->colnames, closest[1] - 1)); + } + + if (!rte2) + { + /* + * Handle case where there is zero or one column suggestions to hint, + * including exact matches referenced but not visible. + * + * Infer an exact match referenced despite not being visible from the + * fact that an attribute number was not passed back. + */ + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + relname? + errmsg("column %s.%s does not exist", relname, colname): + errmsg("column \"%s\" does not exist", colname), + rte1? closest[0] != InvalidAttrNumber? + errhint("Perhaps you meant to reference the column \"%s\".\"%s\".", + rte1->eref->aliasname, closestcol1): + errhint("There is a column named \"%s\" in table \"%s\", but it cannot be referenced from this part of the query.", + colname, rte1->eref->aliasname): 0, + parser_errposition(pstate, location))); + } + else + { + /* + * Handle case where there are two equally useful column hints, each + * from a different RTE + */ + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + relname? + errmsg("column %s.%s does not exist", relname, colname): + errmsg("column \"%s\" does not exist", colname), + errhint("Perhaps you meant to reference the column \"%s\".\"%s\" or the column \"%s\".\"%s\".", + rte1->eref->aliasname, closestcol1, + rte2->eref->aliasname, closestcol2), + parser_errposition(pstate, location))); + } } diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile index 7b4391b..3ea9bf4 100644 --- a/src/backend/utils/adt/Makefile +++ b/src/backend/utils/adt/Makefile @@ -38,4 +38,6 @@ OBJS = acl.o arrayfuncs.o array_selfuncs.o array_typanalyze.o \ like.o: like.c like_match.c +varlena.o: varlena.c levenshtein.c + include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/utils/adt/levenshtein.c b/src/backend/utils/adt/levenshtein.c new file mode 100644 index 0000000..bb8b7bf --- /dev/null +++ b/src/backend/utils/adt/levenshtein.c @@ -0,0 +1,393 @@ +/*------------------------------------------------------------------------- + * + * levenshtein.c + * Levenshtein distance implementation. + * + * Original author: Joe Conway <m...@joeconway.com> + * + * This file is included by varlena.c twice, to provide matching code for (1) + * Levenshtein distance with custom costings, and (2) Levenshtein distance with + * custom costings and a "max" value above which exact distances are not + * interesting. Before the inclusion, we rely on the presence of the inline + * function rest_of_char_same(). + * + * Written based on a description of the algorithm by Michael Gilleland found + * at http://www.merriampark.com/ld.htm. Also looked at levenshtein.c in the + * PHP 4.0.6 distribution for inspiration. Configurable penalty costs + * extension is introduced by Volkan YAZICI <volkan.yaz...@gmail.com. + * + * Copyright (c) 2001-2014, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/utils/adt/levenshtein.c + * + *------------------------------------------------------------------------- + */ +#define MAX_LEVENSHTEIN_STRLEN 255 + +/* + * Calculates Levenshtein distance metric between supplied csrings, which are + * not necessarily null-terminated. Generally (1, 1, 1) penalty costs suffices + * for common cases, but your mileage may vary. + * + * One way to compute Levenshtein distance is to incrementally construct + * an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number + * of operations required to transform the first i characters of s into + * the first j characters of t. The last column of the final row is the + * answer. + * + * We use that algorithm here with some modification. In lieu of holding + * the entire array in memory at once, we'll just use two arrays of size + * m+1 for storing accumulated values. At each step one array represents + * the "previous" row and one is the "current" row of the notional large + * array. + * + * If max_d >= 0, we only need to provide an accurate answer when that answer + * is less than or equal to the bound. From any cell in the matrix, there is + * theoretical "minimum residual distance" from that cell to the last column + * of the final row. This minimum residual distance is zero when the + * untransformed portions of the strings are of equal length (because we might + * get lucky and find all the remaining characters matching) and is otherwise + * based on the minimum number of insertions or deletions needed to make them + * equal length. The residual distance grows as we move toward the upper + * right or lower left corners of the matrix. When the max_d bound is + * usefully tight, we can use this property to avoid computing the entirety + * of each row; instead, we maintain a start_column and stop_column that + * identify the portion of the matrix close to the diagonal which can still + * affect the final answer. + */ +int +#ifdef LEVENSHTEIN_LESS_EQUAL +varstr_leven_less_equal(const char *source, int slen, const char *target, + int tlen, int ins_c, int del_c, int sub_c, int max_d) +#else +varstr_leven(const char *source, int slen, const char *target, int tlen, + int ins_c, int del_c, int sub_c) +#endif +{ + int m, + n; + int *prev; + int *curr; + int *s_char_len = NULL; + int i, + j; + const char *y; + + /* + * For varstr_levenshtein_less_equal, we have real variables called + * start_column and stop_column; otherwise it's just short-hand for 0 and + * m. + */ +#ifdef LEVENSHTEIN_LESS_EQUAL + int start_column, + stop_column; + +#undef START_COLUMN +#undef STOP_COLUMN +#define START_COLUMN start_column +#define STOP_COLUMN stop_column +#else +#undef START_COLUMN +#undef STOP_COLUMN +#define START_COLUMN 0 +#define STOP_COLUMN m +#endif + + m = pg_mbstrlen_with_len(source, slen); + n = pg_mbstrlen_with_len(target, tlen); + + /* + * We can transform an empty s into t with n insertions, or a non-empty t + * into an empty s with m deletions. + */ + if (!m) + return n * ins_c; + if (!n) + return m * del_c; + + /* + * A common use for Levenshtein distance is to match column names. + * Therefore, restrict the size of MAX_LEVENSHTEIN_STRLEN such that this is + * guaranteed to work. + */ + StaticAssertStmt(NAMEDATALEN <= MAX_LEVENSHTEIN_STRLEN, + "Levenshtein hinting mechanism restricts NAMEDATALEN"); + + /* + * For security concerns, restrict excessive CPU+RAM usage. (This + * implementation uses O(m) memory and has O(mn) complexity.) + */ + if (m > MAX_LEVENSHTEIN_STRLEN || + n > MAX_LEVENSHTEIN_STRLEN) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("argument exceeds the maximum length of %d bytes", + MAX_LEVENSHTEIN_STRLEN))); + +#ifdef LEVENSHTEIN_LESS_EQUAL + /* Initialize start and stop columns. */ + start_column = 0; + stop_column = m + 1; + + /* + * If max_d >= 0, determine whether the bound is impossibly tight. If so, + * return max_d + 1 immediately. Otherwise, determine whether it's tight + * enough to limit the computation we must perform. If so, figure out + * initial stop column. + */ + if (max_d >= 0) + { + int min_theo_d; /* Theoretical minimum distance. */ + int max_theo_d; /* Theoretical maximum distance. */ + int net_inserts = n - m; + + min_theo_d = net_inserts < 0 ? + -net_inserts * del_c : net_inserts * ins_c; + if (min_theo_d > max_d) + return max_d + 1; + if (ins_c + del_c < sub_c) + sub_c = ins_c + del_c; + max_theo_d = min_theo_d + sub_c * Min(m, n); + if (max_d >= max_theo_d) + max_d = -1; + else if (ins_c + del_c > 0) + { + /* + * Figure out how much of the first row of the notional matrix we + * need to fill in. If the string is growing, the theoretical + * minimum distance already incorporates the cost of deleting the + * number of characters necessary to make the two strings equal in + * length. Each additional deletion forces another insertion, so + * the best-case total cost increases by ins_c + del_c. If the + * string is shrinking, the minimum theoretical cost assumes no + * excess deletions; that is, we're starting no further right than + * column n - m. If we do start further right, the best-case + * total cost increases by ins_c + del_c for each move right. + */ + int slack_d = max_d - min_theo_d; + int best_column = net_inserts < 0 ? -net_inserts : 0; + + stop_column = best_column + (slack_d / (ins_c + del_c)) + 1; + if (stop_column > m) + stop_column = m + 1; + } + } +#endif + + /* + * In order to avoid calling pg_mblen() repeatedly on each character in s, + * we cache all the lengths before starting the main loop -- but if all + * the characters in both strings are single byte, then we skip this and + * use a fast-path in the main loop. If only one string contains + * multi-byte characters, we still build the array, so that the fast-path + * needn't deal with the case where the array hasn't been initialized. + */ + if (m != slen || n != tlen) + { + int i; + const char *cp = source; + + s_char_len = (int *) palloc((m + 1) * sizeof(int)); + for (i = 0; i < m; ++i) + { + s_char_len[i] = pg_mblen(cp); + cp += s_char_len[i]; + } + s_char_len[i] = 0; + } + + /* One more cell for initialization column and row. */ + ++m; + ++n; + + /* Previous and current rows of notional array. */ + prev = (int *) palloc(2 * m * sizeof(int)); + curr = prev + m; + + /* + * To transform the first i characters of s into the first 0 characters of + * t, we must perform i deletions. + */ + for (i = START_COLUMN; i < STOP_COLUMN; i++) + prev[i] = i * del_c; + + /* Loop through rows of the notional array */ + for (y = target, j = 1; j < n; j++) + { + int *temp; + const char *x = source; + int y_char_len = n != tlen + 1 ? pg_mblen(y) : 1; + +#ifdef LEVENSHTEIN_LESS_EQUAL + + /* + * In the best case, values percolate down the diagonal unchanged, so + * we must increment stop_column unless it's already on the right end + * of the array. The inner loop will read prev[stop_column], so we + * have to initialize it even though it shouldn't affect the result. + */ + if (stop_column < m) + { + prev[stop_column] = max_d + 1; + ++stop_column; + } + + /* + * The main loop fills in curr, but curr[0] needs a special case: to + * transform the first 0 characters of s into the first j characters + * of t, we must perform j insertions. However, if start_column > 0, + * this special case does not apply. + */ + if (start_column == 0) + { + curr[0] = j * ins_c; + i = 1; + } + else + i = start_column; +#else + curr[0] = j * ins_c; + i = 1; +#endif + + /* + * This inner loop is critical to performance, so we include a + * fast-path to handle the (fairly common) case where no multibyte + * characters are in the mix. The fast-path is entitled to assume + * that if s_char_len is not initialized then BOTH strings contain + * only single-byte characters. + */ + if (s_char_len != NULL) + { + for (; i < STOP_COLUMN; i++) + { + int ins; + int del; + int sub; + int x_char_len = s_char_len[i - 1]; + + /* + * Calculate costs for insertion, deletion, and substitution. + * + * When calculating cost for substitution, we compare the last + * character of each possibly-multibyte character first, + * because that's enough to rule out most mis-matches. If we + * get past that test, then we compare the lengths and the + * remaining bytes. + */ + ins = prev[i] + ins_c; + del = curr[i - 1] + del_c; + if (x[x_char_len - 1] == y[y_char_len - 1] + && x_char_len == y_char_len && + (x_char_len == 1 || rest_of_char_same(x, y, x_char_len))) + sub = prev[i - 1]; + else + sub = prev[i - 1] + sub_c; + + /* Take the one with minimum cost. */ + curr[i] = Min(ins, del); + curr[i] = Min(curr[i], sub); + + /* Point to next character. */ + x += x_char_len; + } + } + else + { + for (; i < STOP_COLUMN; i++) + { + int ins; + int del; + int sub; + + /* Calculate costs for insertion, deletion, and substitution. */ + ins = prev[i] + ins_c; + del = curr[i - 1] + del_c; + sub = prev[i - 1] + ((*x == *y) ? 0 : sub_c); + + /* Take the one with minimum cost. */ + curr[i] = Min(ins, del); + curr[i] = Min(curr[i], sub); + + /* Point to next character. */ + x++; + } + } + + /* Swap current row with previous row. */ + temp = curr; + curr = prev; + prev = temp; + + /* Point to next character. */ + y += y_char_len; + +#ifdef LEVENSHTEIN_LESS_EQUAL + + /* + * This chunk of code represents a significant performance hit if used + * in the case where there is no max_d bound. This is probably not + * because the max_d >= 0 test itself is expensive, but rather because + * the possibility of needing to execute this code prevents tight + * optimization of the loop as a whole. + */ + if (max_d >= 0) + { + /* + * The "zero point" is the column of the current row where the + * remaining portions of the strings are of equal length. There + * are (n - 1) characters in the target string, of which j have + * been transformed. There are (m - 1) characters in the source + * string, so we want to find the value for zp where (n - 1) - j = + * (m - 1) - zp. + */ + int zp = j - (n - m); + + /* Check whether the stop column can slide left. */ + while (stop_column > 0) + { + int ii = stop_column - 1; + int net_inserts = ii - zp; + + if (prev[ii] + (net_inserts > 0 ? net_inserts * ins_c : + -net_inserts * del_c) <= max_d) + break; + stop_column--; + } + + /* Check whether the start column can slide right. */ + while (start_column < stop_column) + { + int net_inserts = start_column - zp; + + if (prev[start_column] + + (net_inserts > 0 ? net_inserts * ins_c : + -net_inserts * del_c) <= max_d) + break; + + /* + * We'll never again update these values, so we must make sure + * there's nothing here that could confuse any future + * iteration of the outer loop. + */ + prev[start_column] = max_d + 1; + curr[start_column] = max_d + 1; + if (start_column != 0) + source += (s_char_len != NULL) ? s_char_len[start_column - 1] : 1; + start_column++; + } + + /* If they cross, we're going to exceed the bound. */ + if (start_column >= stop_column) + return max_d + 1; + } +#endif + } + + /* + * Because the final value was swapped from the previous row to the + * current row, that's where we'll find it. + */ + return prev[m - 1]; +} diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index c3171b5..4b9e62a 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -1546,6 +1546,31 @@ varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid) return result; } +/* + * varstr_leven() + * varstr_leven_less_equal() + * Levenshtein distance functions. All arguments should be strlen(s) <= 255. + * Guaranteed to work with Name datatype's cstrings. + * For full details see levenshtein.c. + * + * Helper function. Faster than memcmp(), for this use case. + */ +static inline bool +rest_of_char_same(const char *s1, const char *s2, int len) +{ + while (len > 0) + { + len--; + if (s1[len] != s2[len]) + return false; + } + return true; +} +/* Expand each Levenshtein distance variant */ +#include "levenshtein.c" +#define LEVENSHTEIN_LESS_EQUAL +#include "levenshtein.c" +#undef LEVENSHTEIN_LESS_EQUAL /* text_cmp() * Internal comparison function for text strings. diff --git a/src/include/parser/parse_relation.h b/src/include/parser/parse_relation.h index d8b9493..c18157a 100644 --- a/src/include/parser/parse_relation.h +++ b/src/include/parser/parse_relation.h @@ -35,7 +35,8 @@ extern RangeTblEntry *GetRTEByRangeTablePosn(ParseState *pstate, extern CommonTableExpr *GetCTEForRTE(ParseState *pstate, RangeTblEntry *rte, int rtelevelsup); extern Node *scanRTEForColumn(ParseState *pstate, RangeTblEntry *rte, - char *colname, int location); + char *colname, int location, AttrNumber *matchedatt, + int *distance); extern Node *colNameToVar(ParseState *pstate, char *colname, bool localonly, int location); extern void markVarForSelectPriv(ParseState *pstate, Var *var, diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index 4e74d85..0abe9bf 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -786,6 +786,11 @@ extern Datum textoverlay_no_len(PG_FUNCTION_ARGS); extern Datum name_text(PG_FUNCTION_ARGS); extern Datum text_name(PG_FUNCTION_ARGS); extern int varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid); +extern int varstr_leven(const char *source, int slen, const char *target, + int tlen, int ins_c, int del_c, int sub_c); +extern int varstr_leven_less_equal(const char *source, int slen, + const char *target, int tlen, int ins_c, + int del_c, int sub_c, int max_d); extern List *textToQualifiedNameList(text *textval); extern bool SplitIdentifierString(char *rawstring, char separator, List **namelist); diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index d233710..b24fa43 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -536,6 +536,7 @@ create table atacc1 ( test int ); -- add a check constraint (fails) alter table atacc1 add constraint atacc_test1 check (test1>3); ERROR: column "test1" does not exist +HINT: Perhaps you meant to reference the column "atacc1"."test". drop table atacc1; -- something a little more complicated create table atacc1 ( test int, test2 int, test3 int); @@ -1342,6 +1343,7 @@ select f1 from c1; ERROR: column "f1" does not exist LINE 1: select f1 from c1; ^ +HINT: Perhaps you meant to reference the column "c1"."f2". drop table p1 cascade; NOTICE: drop cascades to table c1 create table p1 (f1 int, f2 int); @@ -1355,6 +1357,7 @@ select f1 from c1; ERROR: column "f1" does not exist LINE 1: select f1 from c1; ^ +HINT: Perhaps you meant to reference the column "c1"."f2". drop table p1 cascade; NOTICE: drop cascades to table c1 create table p1 (f1 int, f2 int); @@ -1479,6 +1482,7 @@ select oid > 0, * from altstartwith; -- fails ERROR: column "oid" does not exist LINE 1: select oid > 0, * from altstartwith; ^ +HINT: Perhaps you meant to reference the column "altstartwith"."col". select * from altstartwith; col ----- @@ -1515,10 +1519,12 @@ select oid > 0, * from altwithoid; -- fails ERROR: column "oid" does not exist LINE 1: select oid > 0, * from altwithoid; ^ +HINT: Perhaps you meant to reference the column "altwithoid"."col". select oid > 0, * from altinhoid; -- fails ERROR: column "oid" does not exist LINE 1: select oid > 0, * from altinhoid; ^ +HINT: Perhaps you meant to reference the column "altinhoid"."col". select * from altwithoid; col ----- @@ -1554,6 +1560,7 @@ select oid > 0, * from altwithoid; -- fails ERROR: column "oid" does not exist LINE 1: select oid > 0, * from altwithoid; ^ +HINT: Perhaps you meant to reference the column "altwithoid"."col". select oid > 0, * from altinhoid; ?column? | col ----------+----- @@ -1580,6 +1587,7 @@ select oid > 0, * from altwithoid; -- fails ERROR: column "oid" does not exist LINE 1: select oid > 0, * from altwithoid; ^ +HINT: Perhaps you meant to reference the column "altwithoid"."col". select oid > 0, * from altinhoid; ?column? | col ----------+----- diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index 2501184..3ef5580 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -2222,6 +2222,12 @@ select * from t1 left join t2 on (t1.a = t2.a); 200 | 1000 | 200 | 2001 (5 rows) +-- Test matching of column name with wrong alias +select t1.x from t1 join t3 on (t1.a = t3.x); +ERROR: column t1.x does not exist +LINE 1: select t1.x from t1 join t3 on (t1.a = t3.x); + ^ +HINT: Perhaps you meant to reference the column "t3"."x". -- -- regression test for 8.1 merge right join bug -- @@ -3415,6 +3421,39 @@ select * from (0 rows) -- +-- Test hints given on incorrect column references are useful +-- +select t1.uunique1 from + tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, prefer "t1" suggestipn +ERROR: column t1.uunique1 does not exist +LINE 1: select t1.uunique1 from + ^ +HINT: Perhaps you meant to reference the column "t1"."unique1". +select t2.uunique1 from + tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, prefer "t2" suggestion +ERROR: column t2.uunique1 does not exist +LINE 1: select t2.uunique1 from + ^ +HINT: Perhaps you meant to reference the column "t2"."unique1". +select uunique1 from + tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, suggest both at once +ERROR: column "uunique1" does not exist +LINE 1: select uunique1 from + ^ +HINT: Perhaps you meant to reference the column "t1"."unique1" or the column "t2"."unique1". +-- +-- Take care to reference the correct RTE +-- +select atts.relid::regclass, s.* from pg_stats s join + pg_attribute a on s.attname = a.attname and s.tablename = + a.attrelid::regclass::text join (select unnest(indkey) attnum, + indexrelid from pg_index i) atts on atts.attnum = a.attnum where + schemaname != 'pg_catalog'; +ERROR: column atts.relid does not exist +LINE 1: select atts.relid::regclass, s.* from pg_stats s join + ^ +HINT: Perhaps you meant to reference the column "atts"."indexrelid". +-- -- Test LATERAL -- select unique2, x.* diff --git a/src/test/regress/expected/plpgsql.out b/src/test/regress/expected/plpgsql.out index 983f1b8..fb4abe6 100644 --- a/src/test/regress/expected/plpgsql.out +++ b/src/test/regress/expected/plpgsql.out @@ -4782,6 +4782,7 @@ END$$; ERROR: column "foo" does not exist LINE 1: SELECT rtrim(roomno) AS roomno, foo FROM Room ORDER BY roomn... ^ +HINT: Perhaps you meant to reference the column "room"."roomno". QUERY: SELECT rtrim(roomno) AS roomno, foo FROM Room ORDER BY roomno CONTEXT: PL/pgSQL function inline_code_block line 4 at FOR over SELECT rows -- Check handling of errors thrown from/into anonymous code blocks. diff --git a/src/test/regress/expected/rowtypes.out b/src/test/regress/expected/rowtypes.out index 88e7bfa..19a6e98 100644 --- a/src/test/regress/expected/rowtypes.out +++ b/src/test/regress/expected/rowtypes.out @@ -452,6 +452,7 @@ select fullname.text from fullname; -- error ERROR: column fullname.text does not exist LINE 1: select fullname.text from fullname; ^ +HINT: Perhaps you meant to reference the column "fullname"."last". -- same, but RECORD instead of named composite type: select cast (row('Jim', 'Beam') as text); row diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index c79b45c..01c80af 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -2396,6 +2396,7 @@ select xmin, * from fooview; -- fail, views don't have such a column ERROR: column "xmin" does not exist LINE 1: select xmin, * from fooview; ^ +HINT: Perhaps you meant to reference the column "fooview"."x". select reltoastrelid, relkind, relfrozenxid from pg_class where oid = 'fooview'::regclass; reltoastrelid | relkind | relfrozenxid diff --git a/src/test/regress/expected/without_oid.out b/src/test/regress/expected/without_oid.out index cb2c0c0..fbff011 100644 --- a/src/test/regress/expected/without_oid.out +++ b/src/test/regress/expected/without_oid.out @@ -46,6 +46,7 @@ SELECT count(oid) FROM wo; ERROR: column "oid" does not exist LINE 1: SELECT count(oid) FROM wo; ^ +HINT: Perhaps you meant to reference the column "wo"."i". VACUUM ANALYZE wi; VACUUM ANALYZE wo; SELECT min(relpages) < max(relpages), min(reltuples) - max(reltuples) diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index 718e1d9..ca7f966 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -397,6 +397,10 @@ insert into t2a values (200, 2001); select * from t1 left join t2 on (t1.a = t2.a); +-- Test matching of column name with wrong alias + +select t1.x from t1 join t3 on (t1.a = t3.x); + -- -- regression test for 8.1 merge right join bug -- @@ -1051,6 +1055,26 @@ select * from int8_tbl x join (int4_tbl x cross join int4_tbl y(ff)) j on q1 = f1; -- ok -- +-- Test hints given on incorrect column references are useful +-- + +select t1.uunique1 from + tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, prefer "t1" suggestipn +select t2.uunique1 from + tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, prefer "t2" suggestion +select uunique1 from + tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, suggest both at once + +-- +-- Take care to reference the correct RTE +-- + +select atts.relid::regclass, s.* from pg_stats s join + pg_attribute a on s.attname = a.attname and s.tablename = + a.attrelid::regclass::text join (select unnest(indkey) attnum, + indexrelid from pg_index i) atts on atts.attnum = a.attnum where + schemaname != 'pg_catalog'; +-- -- Test LATERAL -- -- 1.9.1
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers