Re: [HACKERS] proposal: function parse_ident

Jim Nasby Tue, 16 Feb 2016 16:39:06 -0800

On 2/11/16 1:27 AM, Pavel Stehule wrote:

    I editorialized the docs and some of the comments. In particular, I
    documented behavior of not truncating, and recommended casting to
    name[] if user cares about that. Added a unit test to verify that
    works. BTW, I saw mention in the thread about not truncated spaces,
    but the function *does* truncate them, unless they're inside quotes,
    where they're legitimate.

ok


I missed some of my edits. Updated patch with those in place attached.

    Also added test for invalid characters.

    I think "strict" would be more in line with other uses in code.
    There are currently no other occurrences of 'strictmode' in the
    code. There are loads of references to 'strict', but I didn't go
    through all of them to see if any were used as externally visible
    function parameter names.


I am sorry, I don't understand to this point. You unlike the name of
parameter "strictmode" ? Have you any proposal? Maybe "restrictive" ?


I would just call it strict. There's precedent for that in the code.

The almost all code +/- is related to human readable error messages. We
can move some code to separate static functions - read_quoted_ident,
read_unquoted_ident, but there will be some magic about parameters, and
the code will not be much better, than it is now.

What I'm saying is that most places that need to do de-quoting orsimilar just run a simple while loop and use an in_quote variable totrack whether they're inside a quote or not. Seebackend/utils/adt/rowtypes.c line 199 for an example.

As I said, I don't have a strong opinion on it, so if you prefer it thisway that's fine with me.

--
Jim Nasby, Data Architect, Blue Treble Consulting, Austin TX
Experts in Analytics, Data Architecture and PostgreSQL
Data in Trouble? Get it in Treble! http://BlueTreble.com

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 139aa2b..b4a2898 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -1778,6 +1778,27 @@
       <row>
        <entry>
         <indexterm>
+         <primary>parse_ident</primary>
+        </indexterm>
+        <literal><function>parse_ident(<parameter>str</parameter> 
<type>text</type>,
+           [ <parameter>strictmode</parameter> <type>boolean</type> DEFAULT 
true ] )</function></literal>
+       </entry>
+       <entry><type>text[]</type></entry>
+       <entry>Split <parameter>qualified identifier</parameter> into array 
<parameter>parts</parameter>.
+       When <parameter>strictmode</parameter> is false, extra characters after 
the identifier are ignored.
+       This is useful for parsing identifiers for objects like functions and 
arrays that may have trailing
+       characters. By default, extra characters after the last identifier are 
considered an error.
+       second parameter is false, then chars after last identifier are 
ignored. Note that this function
+       does not truncate quoted identifiers. If you care about that you should 
cast the result of this
+          function to name[].
+       </entry>
+       <entry><literal>parse_ident('"SomeSchema".someTable')</literal></entry>
+       <entry><literal>"SomeSchema,sometable"</literal></entry>
+      </row>
+
+      <row>
+       <entry>
+        <indexterm>
          <primary>pg_client_encoding</primary>
         </indexterm>
         <literal><function>pg_client_encoding()</function></literal>
diff --git a/src/backend/catalog/system_views.sql 
b/src/backend/catalog/system_views.sql
index 923fe58..61d5b80 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -965,3 +965,10 @@ RETURNS jsonb
 LANGUAGE INTERNAL
 STRICT IMMUTABLE
 AS 'jsonb_set';
+
+CREATE OR REPLACE FUNCTION
+  parse_ident(str text, strictmode boolean DEFAULT true)
+RETURNS text[]
+LANGUAGE INTERNAL
+STRICT IMMUTABLE
+AS 'parse_ident';
diff --git a/src/backend/parser/scansup.c b/src/backend/parser/scansup.c
index 2b4ab20..7aa5b76 100644
--- a/src/backend/parser/scansup.c
+++ b/src/backend/parser/scansup.c
@@ -130,6 +130,15 @@ scanstr(const char *s)
 char *
 downcase_truncate_identifier(const char *ident, int len, bool warn)
 {
+       return downcase_identifier(ident, len, warn, true);
+}
+
+/*
+ * a workhorse for downcase_truncate_identifier
+ */
+char *
+downcase_identifier(const char *ident, int len, bool warn, bool truncate)
+{
        char       *result;
        int                     i;
        bool            enc_is_single_byte;
@@ -158,12 +167,13 @@ downcase_truncate_identifier(const char *ident, int len, 
bool warn)
        }
        result[i] = '\0';
 
-       if (i >= NAMEDATALEN)
+       if (i >= NAMEDATALEN && truncate)
                truncate_identifier(result, i, warn);
 
        return result;
 }
 
+
 /*
  * truncate_identifier() --- truncate an identifier to NAMEDATALEN-1 bytes.
  *
diff --git a/src/backend/utils/adt/misc.c b/src/backend/utils/adt/misc.c
index 6a306f3..f7d60b1 100644
--- a/src/backend/utils/adt/misc.c
+++ b/src/backend/utils/adt/misc.c
@@ -21,12 +21,15 @@
 #include <unistd.h>
 
 #include "access/sysattr.h"
+#include "access/htup_details.h"
 #include "catalog/catalog.h"
+#include "catalog/namespace.h"
 #include "catalog/pg_tablespace.h"
 #include "catalog/pg_type.h"
 #include "commands/dbcommands.h"
 #include "funcapi.h"
 #include "miscadmin.h"
+#include "parser/scansup.h"
 #include "parser/keywords.h"
 #include "postmaster/syslogger.h"
 #include "rewrite/rewriteHandler.h"
@@ -38,6 +41,7 @@
 #include "utils/ruleutils.h"
 #include "tcop/tcopprot.h"
 #include "utils/acl.h"
+#include "utils/array.h"
 #include "utils/builtins.h"
 #include "utils/timestamp.h"
 
@@ -598,3 +602,174 @@ pg_column_is_updatable(PG_FUNCTION_ARGS)
 
        PG_RETURN_BOOL((events & REQ_EVENTS) == REQ_EVENTS);
 }
+
+
+/*
+ * This simple parser utility are compatible with lexer implementation,
+ * used only in parse_ident function
+ */
+static bool
+is_ident_start(unsigned char c)
+{
+       if (c == '_')
+               return true;
+       if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
+               return true;
+
+       if (c >= 0200 && c <= 0377)
+               return true;
+
+       return false;
+}
+
+static bool
+is_ident_cont(unsigned char c)
+{
+       if (c >= '0' && c <= '9')
+               return true;
+
+       return is_ident_start(c);
+}
+
+/*
+ * parse_ident - parse SQL composed identifier to separate identifiers.
+ * When strict mode is active (second parameter), then any chars after
+ * last identifiers are disallowed.
+ */
+Datum
+parse_ident(PG_FUNCTION_ARGS)
+{
+       text            *qualname;
+       char            *qualname_str;
+       bool            strict_mode;
+       ArrayBuildState *astate = NULL;
+       char    *nextp;
+       bool            after_dot = false;
+
+       qualname = PG_GETARG_TEXT_PP(0);
+       qualname_str = text_to_cstring(qualname);
+       strict_mode = PG_GETARG_BOOL(1);
+
+       nextp = qualname_str;
+
+       /* skip leading whitespace */
+       while (isspace((unsigned char) *nextp))
+               nextp++;
+
+       for (;;)
+       {
+               char            *curname;
+               char            *endp;
+               bool            missing_ident;
+
+               missing_ident = true;
+
+               if (*nextp == '\"')
+               {
+                       curname = nextp + 1;
+                       for (;;)
+                       {
+                               endp = strchr(nextp + 1, '\"');
+                               if (endp == NULL)
+                                       ereport(ERROR,
+                                               
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                                errmsg("unclosed double 
quotes"),
+                                                errdetail("string \"%s\" is 
not valid identifier",
+                                                                               
                        qualname_str)));
+                               if (endp[1] != '\"')
+                                       break;
+                               memmove(endp, endp + 1, strlen(endp));
+                               nextp = endp;
+                       }
+                       nextp = endp + 1;
+                       *endp = '\0';
+
+                       if (endp - curname == 0)
+                               ereport(ERROR,
+                                       
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                        errmsg("identifier should not be 
empty: \"%s\"",
+                                                                               
                qualname_str)));
+
+                       astate = accumArrayResult(astate,
+                               CStringGetTextDatum(curname), false,
+                                                   TEXTOID, 
CurrentMemoryContext);
+                       missing_ident = false;
+               }
+               else
+               {
+                       if (is_ident_start((unsigned char) *nextp))
+                       {
+                               char *downname;
+                               int     len;
+                               text    *part;
+
+                               curname = nextp++;
+                               while (is_ident_cont((unsigned char) *nextp))
+                                       nextp++;
+
+                               len = nextp - curname;
+
+                               /*
+                                * Unlike name, we don't implicitly truncate 
identifiers. This
+                                * is useful for allowing the user to check for 
specific parts
+                                * of the identifier being too long. It's easy 
enough for the
+                                * user to get the truncated names by casting 
our output to
+                                * name[].
+                                */
+                               downname = downcase_identifier(curname, len, 
false, false);
+                               part = cstring_to_text_with_len(downname, len);
+                               astate = accumArrayResult(astate,
+                                       PointerGetDatum(part), false,
+                                                           TEXTOID, 
CurrentMemoryContext);
+                               missing_ident = false;
+                       }
+               }
+
+               if (missing_ident)
+               {
+                       /* Different error messages based on where we failed. */
+                       if (*nextp == '.')
+                               ereport(ERROR,
+                                       
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                        errmsg("missing identifier before 
\".\" symbol: \"%s\"",
+                                                                               
                qualname_str)));
+                       else if (after_dot)
+                               ereport(ERROR,
+                                       
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                        errmsg("missing identifier after \".\" 
symbol: \"%s\"",
+                                                                               
                qualname_str)));
+                       else
+                               ereport(ERROR,
+                                       
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                        errmsg("missing identifier: \"%s\"",
+                                                                               
                qualname_str)));
+               }
+
+               while (isspace((unsigned char) *nextp))
+                       nextp++;
+
+               if (*nextp == '.')
+               {
+                       after_dot = true;
+                       nextp++;
+                       while (isspace((unsigned char) *nextp))
+                               nextp++;
+                       continue;
+               }
+               else if (*nextp == '\0')
+               {
+                       break;
+               }
+               else
+               {
+                       if (strict_mode)
+                               ereport(ERROR,
+                                       
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                        errmsg("identifier contains disallowed 
characters: \"%s\"",
+                                                                               
                qualname_str)));
+                       break;
+               }
+       }
+
+       PG_RETURN_DATUM(makeArrayResult(astate, CurrentMemoryContext));
+}
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index 5ded13e..d7c09a7 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -3451,6 +3451,9 @@ DESCR("I/O");
 DATA(insert OID = 4086 (  to_regnamespace      PGNSP PGUID 12 1 0 0 0 f f f f 
t f s s 1 0 4089 "25" _null_ _null_ _null_ _null_ _null_ to_regnamespace _null_ 
_null_ _null_ ));
 DESCR("convert namespace name to regnamespace");
 
+DATA(insert OID = 3318 (  parse_ident          PGNSP PGUID 12 1 0 0 0 f f f f 
t f i s 2 0 1009 "25 16" _null_ _null_ "{str,strictmode}" _null_ _null_ 
parse_ident _null_ _null_ _null_ ));
+DESCR("parse qualified identifier to array of identifiers");
+
 DATA(insert OID = 2246 ( fmgr_internal_validator PGNSP PGUID 12 1 0 0 0 f f f 
f t f s s 1 0 2278 "26" _null_ _null_ _null_ _null_ _null_ 
fmgr_internal_validator _null_ _null_ _null_ ));
 DESCR("(internal)");
 DATA(insert OID = 2247 ( fmgr_c_validator      PGNSP PGUID 12 1 0 0 0 f f f f 
t f s s 1 0 2278 "26" _null_ _null_ _null_ _null_ _null_ fmgr_c_validator 
_null_ _null_ _null_ ));
diff --git a/src/include/parser/scansup.h b/src/include/parser/scansup.h
index 4f4164b..4f95c81 100644
--- a/src/include/parser/scansup.h
+++ b/src/include/parser/scansup.h
@@ -20,6 +20,9 @@ extern char *scanstr(const char *s);
 extern char *downcase_truncate_identifier(const char *ident, int len,
                                                         bool warn);
 
+extern char *downcase_identifier(const char *ident, int len,
+                                                        bool warn, bool 
truncate);
+
 extern void truncate_identifier(char *ident, int len, bool warn);
 
 extern bool scanner_isspace(char ch);
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h
index 5e8e832..b1b6ef6 100644
--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@@ -504,6 +504,7 @@ extern Datum pg_typeof(PG_FUNCTION_ARGS);
 extern Datum pg_collation_for(PG_FUNCTION_ARGS);
 extern Datum pg_relation_is_updatable(PG_FUNCTION_ARGS);
 extern Datum pg_column_is_updatable(PG_FUNCTION_ARGS);
+extern Datum parse_ident(PG_FUNCTION_ARGS);
 
 /* oid.c */
 extern Datum oidin(PG_FUNCTION_ARGS);
diff --git a/src/test/regress/expected/name.out 
b/src/test/regress/expected/name.out
index b359d52..66569f4 100644
--- a/src/test/regress/expected/name.out
+++ b/src/test/regress/expected/name.out
@@ -124,3 +124,55 @@ SELECT '' AS two, c.f1 FROM NAME_TBL c WHERE c.f1 ~ 
'.*asdf.*';
 (2 rows)
 
 DROP TABLE NAME_TBL;
+DO $$
+DECLARE r text[];
+BEGIN
+  r := parse_ident('Schemax.Tabley');
+  RAISE NOTICE '%', format('%I.%I', r[1], r[2]);
+  r := parse_ident('"SchemaX"."TableY"');
+  RAISE NOTICE '%', format('%I.%I', r[1], r[2]);
+END;
+$$;
+NOTICE:  schemax.tabley
+NOTICE:  "SchemaX"."TableY"
+SELECT parse_ident('foo.boo');
+ parse_ident 
+-------------
+ {foo,boo}
+(1 row)
+
+SELECT parse_ident('foo.boo[]'); -- should fail
+ERROR:  identifier contains disallowed characters: "foo.boo[]"
+SELECT parse_ident('foo.boo[]', strictmode => false); -- ok
+ parse_ident 
+-------------
+ {foo,boo}
+(1 row)
+
+-- should fail
+SELECT parse_ident(' ');
+ERROR:  missing identifier: " "
+SELECT parse_ident(' .aaa');
+ERROR:  missing identifier before "." symbol: " .aaa"
+SELECT parse_ident(' aaa . ');
+ERROR:  missing identifier after "." symbol: " aaa . "
+SELECT parse_ident('aaa.a%b');
+ERROR:  identifier contains disallowed characters: "aaa.a%b"
+SELECT length(a[1]), length(a[2]) from 
parse_ident('"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx".yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy')
 as a ;
+ length | length 
+--------+--------
+    414 |    289
+(1 row)
+
+SELECT parse_ident(' first . "  second  " ."   third   ". "  ' || 
repeat('x',66) || '"');
+                                                parse_ident                    
                            
+-----------------------------------------------------------------------------------------------------------
+ {first,"  second  ","   third   ","  
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"}
+(1 row)
+
+SELECT parse_ident(' first . "  second  " ."   third   ". "  ' || 
repeat('x',66) || '"')::name[];
+                                             parse_ident                       
                       
+------------------------------------------------------------------------------------------------------
+ {first,"  second  ","   third   ","  
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"}
+(1 row)
+
diff --git a/src/test/regress/sql/name.sql b/src/test/regress/sql/name.sql
index 1c7a671..2e70e4e 100644
--- a/src/test/regress/sql/name.sql
+++ b/src/test/regress/sql/name.sql
@@ -52,3 +52,28 @@ SELECT '' AS three, c.f1 FROM NAME_TBL c WHERE c.f1 ~ 
'[0-9]';
 SELECT '' AS two, c.f1 FROM NAME_TBL c WHERE c.f1 ~ '.*asdf.*';
 
 DROP TABLE NAME_TBL;
+
+DO $$
+DECLARE r text[];
+BEGIN
+  r := parse_ident('Schemax.Tabley');
+  RAISE NOTICE '%', format('%I.%I', r[1], r[2]);
+  r := parse_ident('"SchemaX"."TableY"');
+  RAISE NOTICE '%', format('%I.%I', r[1], r[2]);
+END;
+$$;
+
+SELECT parse_ident('foo.boo');
+SELECT parse_ident('foo.boo[]'); -- should fail
+SELECT parse_ident('foo.boo[]', strictmode => false); -- ok
+
+-- should fail
+SELECT parse_ident(' ');
+SELECT parse_ident(' .aaa');
+SELECT parse_ident(' aaa . ');
+SELECT parse_ident('aaa.a%b');
+
+SELECT length(a[1]), length(a[2]) from 
parse_ident('"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx".yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy')
 as a ;
+
+SELECT parse_ident(' first . "  second  " ."   third   ". "  ' || 
repeat('x',66) || '"');
+SELECT parse_ident(' first . "  second  " ."   third   ". "  ' || 
repeat('x',66) || '"')::name[];

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] proposal: function parse_ident

Reply via email to