Re: Do we want a hashset type?

jian he Mon, 26 Jun 2023 04:06:52 -0700

On Mon, Jun 26, 2023 at 4:36 AM Joel Jacobson <[email protected]> wrote:
>
> On Sun, Jun 25, 2023, at 11:42, Joel Jacobson wrote:
> >     SELECT hashset_contains('{}'::int4hashset, NULL::int);
> >
> > would be False, according to the General Rules.
> >
> ...
> > Applying the same rules, we'd have to return Unknown (which we represent as
> > null) for:
> >
> >     SELECT hashset_contains('{null}'::int4hashset, NULL::int);
> >
>
> Aha! I just discovered to my surprise that the corresponding array
> queries give the same result:
>
> SELECT NULL = ANY(ARRAY[]::int[]);
>  ?column?
> ----------
>  f
> (1 row)
>
> SELECT NULL = ANY(ARRAY[NULL]::int[]);
>  ?column?
> ----------
>
> (1 row)
>
> I have no more objections; let's stick to the same null semantics as
> arrays and multisets.
>
> /Joel


Can you try to glue the attached code into the hashset data type's input
function? The attached code parses a cstring whose elements may or may not be
double-quoted, so '{1,2,3}' == '{"1","2","3"}'. Quoting preserves the inner
string as is. Currently int4hashset input is delimited by commas, so if you
want to deal with ranges, you need to escape the comma.

/*

gcc -I/home/jian/postgres/2023_05_25_beta5421/include/server -fPIC -c /home/jian/Desktop/regress_pgsql/input_validate.c
gcc -shared  -o /home/jian/Desktop/regress_pgsql/input_validate.so /home/jian/Desktop/regress_pgsql/input_validate.o

CREATE OR REPLACE FUNCTION str_delim_count_validate(cstring) RETURNS BOOL SET search_path from current
        AS '/home/jian/Desktop/regress_pgsql/input_validate', 'str_delim_count_validate'
        LANGUAGE C IMMUTABLE;

select str_delim_count_validate('{"23890","2","3",  "a",1,2,3,4,NULL,2022-01-01,"[1,2]"}');
select str_delim_count_validate('{"3 ", }'); --fail
select str_delim_count_validate('{"3 " }'); --ok
select str_delim_count_validate('{"""23890"}'); --fail.
select str_delim_count_validate('{}'); --ok
select str_delim_count_validate('}');   --fail.
select str_delim_count_validate('{');      --fail.
select str_delim_count_validate('{{}}');      --fail.
select str_delim_count_validate('{{}}');      --fail.
select str_delim_count_validate('{"22022-01-01,[1,2]}'); --fail.
select str_delim_count_validate('{" 2022-01-01 "}'); --ok
select str_delim_count_validate('{ 2022-01-01            }'); --ok
select str_delim_count_validate('{  2022-01-01  ,"[1,2]"}      '); --ok
select str_delim_count_validate('{ 2023-06-26 16:45:02.454293+08       ,"2","3"}'); --ok.
select str_delim_count_validate('{"\\t"}');    --ok
*/
#include "postgres.h"
#include "access/htup_details.h"
#include "catalog/pg_type.h"
#include "utils/builtins.h"
#include "utils/numeric.h"
#include "funcapi.h"
#include "utils/lsyscache.h"
#include "utils/fmgrprotos.h"
#include "common/hashfn.h"
PG_MODULE_MAGIC;

PG_FUNCTION_INFO_V1(str_delim_count_validate);
static	int SetCount(const char *str, char typdelim, Node *escontext);
static bool ReadSetStr(char *arrayStr,const char *origStr, char typdelim, Node *escontext); 
static bool set_isspace(char ch);

Datum
str_delim_count_validate(PG_FUNCTION_ARGS)
{   
    char    *string    = PG_GETARG_CSTRING(0);
    char    *string_save;
    char    *p;
	/* Make a modifiable copy of the input */
	string_save = pstrdup(string);
    char    typdelim   = ',';
	p = string_save;
    int nitems;
    nitems  = SetCount(p,typdelim, fcinfo->context);
    
    if (!ReadSetStr(p, string,typdelim,fcinfo->context))
        elog(INFO,"delimuite str failed");

    elog(INFO,"line %d nitems=%d",__LINE__,nitems);
    PG_RETURN_BOOL(true);
}


/*
 * set_isspace() --- a non-locale-dependent isspace()
 *
 * Set literals are parsed with a fixed notion of whitespace rather than the
 * C library's isspace(), so that a value cannot be interpreted differently
 * depending on the locale setting.  Only the six traditional ASCII
 * whitespace characters are recognized.
 */
static bool
set_isspace(char ch)
{
	switch (ch)
	{
		case ' ':
		case '\t':
		case '\n':
		case '\r':
		case '\v':
		case '\f':
			return true;
		default:
			return false;
	}
}

/*
 * ReadSetStr
 *		Split a set literal into its elements, de-quoting each one in place.
 *
 * arrayStr is a modifiable copy of the literal; it is overwritten as the
 * scan proceeds.  origStr is the untouched literal, used only in error
 * messages.  typdelim is the element delimiter.  escontext is handed to
 * ereturn() for error reporting.
 *
 * The " and \ characters are removed to produce a clean item value, and each
 * item is overwritten in place within arrayStr.  srcptr is the current scan
 * point and dstptr is where we are copying to.
 *
 * Leading and trailing unquoted whitespace is suppressed.  The leadingspace
 * flag suppresses leading space; trailing space is handled by having
 * dstendptr point just past the last significant output character, which is
 * where the item's terminating NUL is written.
 *
 * Every item found is reported with elog(INFO).  No NULL-marker recognition
 * is performed here.
 *
 * The error checking in this routine is mostly pro-forma, since we expect
 * that SetCount() already validated the string.  So we don't bother with
 * errdetail messages.
 *
 * Returns true on success; failures go through ereturn() with a result of
 * false.
 */
static bool
ReadSetStr(char *arrayStr, const char *origStr, char typdelim, Node *escontext)
{
	char	   *srcptr;
	bool		in_quotes = false;
	bool		eoArray = false;

	srcptr = arrayStr;
	while (!eoArray)
	{
		bool		itemdone = false;
		bool		leadingspace = true;
		char	   *itemstart;
		char	   *dstptr;
		char	   *dstendptr;

		/* The cleaned-up item is written over the text it was read from. */
		itemstart = dstptr = dstendptr = srcptr;

		while (!itemdone)
		{
			switch (*srcptr)
			{
				case '\0':
					/* Signal a premature end of the string */
					ereturn(escontext, false,
							(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
							 errmsg("malformed array literal: \"%s\"",
									origStr)));
					break;
				case '\\':
					/* Skip backslash, copy next character as-is. */
					srcptr++;
					if (*srcptr == '\0')
						ereturn(escontext, false,
								(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
								 errmsg("malformed array literal: \"%s\"",
										origStr)));
					*dstptr++ = *srcptr++;
					/* Treat the escaped character as non-whitespace */
					leadingspace = false;
					dstendptr = dstptr;
					break;
				case '"':
					in_quotes = !in_quotes;
					if (in_quotes)
						leadingspace = false;
					else
					{
						/*
						 * Advance dstendptr when we exit in_quotes; this
						 * saves having to do it in all the other in_quotes
						 * cases.
						 */
						dstendptr = dstptr;
					}
					/* The quote character itself is never copied. */
					srcptr++;
					break;
				case '{':
					/* An unquoted left brace is skipped, a quoted one is data. */
					if (!in_quotes)
						srcptr++;
					else
						*dstptr++ = *srcptr++;
					break;
				case '}':
					/* An unquoted right brace ends the last item and the scan. */
					if (!in_quotes)
					{
						eoArray = itemdone = true;
						srcptr++;
					}
					else
						*dstptr++ = *srcptr++;
					break;
				default:
					if (in_quotes)
						*dstptr++ = *srcptr++;
					else if (*srcptr == typdelim)
					{
						/* Unquoted delimiter: the current item is complete. */
						itemdone = true;
						srcptr++;
					}
					else if (set_isspace(*srcptr))
					{
						/*
						 * If leading space, drop it immediately.  Else, copy
						 * but don't advance dstendptr.
						 */
						if (leadingspace)
							srcptr++;
						else
							*dstptr++ = *srcptr++;
					}
					else
					{
						*dstptr++ = *srcptr++;
						leadingspace = false;
						dstendptr = dstptr;
					}
					break;
			}
		}

		/* The terminator (delimiter or brace) was consumed but never copied. */
		Assert(dstptr < srcptr);
		*dstendptr = '\0';
		elog(INFO,"line [%04d] itemstart:|%s|",__LINE__,itemstart);
	}
	return true;
}

/*
 * Parser states used by SetCount() while validating a set literal.  Each
 * state records what was most recently seen, which determines which
 * characters may legally follow.
 */
typedef enum
{
	SET_NO_LEVEL = 0,			/* initial state, nothing seen yet */
	SET_LEVEL_STARTED,			/* just saw an unquoted '{' */
	SET_ELEM_STARTED,			/* inside an unquoted element */
	SET_ELEM_COMPLETED,			/* accepted before '}' or a delimiter */
	SET_QUOTED_ELEM_STARTED,	/* just saw an opening '"' */
	SET_QUOTED_ELEM_COMPLETED,	/* just saw a closing '"' */
	SET_ELEM_DELIMITED,			/* just saw a delimiter after an element */
	SET_LEVEL_COMPLETED,		/* just saw an unquoted '}' */
	SET_LEVEL_DELIMITED			/* just saw a delimiter after a '}' */
} SetParseState;

/*
 * SetCount
 *	 Validates the syntax of a set literal and counts its elements.
 *
 * str is the literal to check and typdelim is the element delimiter.  Only a
 * single brace level is accepted: an unquoted '{' seen while a level is
 * already open is rejected.  After the closing brace only whitespace may
 * follow.
 *
 * Returns the number of elements as function result: 0 for an empty set,
 * otherwise one more than the number of unquoted delimiters seen.
 *
 * If we detect an error, fill *escontext with error details and return -1
 * (unless escontext isn't provided, in which case errors will be thrown).
 */

static	int
SetCount(const char *str, char typdelim, Node *escontext)
{
	int			nest_level = 0,
				nelems  = 1;
	bool		in_quotes = false;
	bool		eoArray = false;
	bool		empty_array = true;
	const char *ptr;
	SetParseState parse_state = SET_NO_LEVEL;

	ptr = str;
	while (!eoArray)
	{
		bool		itemdone = false;

		while (!itemdone)
		{
			/* Once any element has begun, the set is known to be non-empty. */
			if (parse_state == SET_ELEM_STARTED ||
				parse_state == SET_QUOTED_ELEM_STARTED)
				empty_array = false;

			switch (*ptr)
			{
				case '\0':
					/* Signal a premature end of the string */
					ereturn(escontext, -1,
							(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
							 errmsg("malformed array literal: \"%s\"", str),
							 errdetail("Unexpected end of input.")));
				case '\\':

					/*
					 * An escape must be after a level start, after an element
					 * start, or after an element delimiter. In any case we
					 * now must be past an element start.
					 */
					if (parse_state != SET_LEVEL_STARTED &&
						parse_state != SET_ELEM_STARTED &&
						parse_state != SET_QUOTED_ELEM_STARTED &&
						parse_state != SET_ELEM_DELIMITED)
						ereturn(escontext, -1,
								(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
								 errmsg("malformed array literal: \"%s\"", str),
								 errdetail("Unexpected \"%c\" character.",
										   '\\')));
					if (parse_state != SET_QUOTED_ELEM_STARTED)
						parse_state = SET_ELEM_STARTED;
					/* skip the escaped character */
					if (*(ptr + 1))
						ptr++;
					else
						ereturn(escontext, -1,
								(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
								 errmsg("malformed array literal: \"%s\"", str),
								 errdetail("Unexpected end of input.")));
					break;
				case '"':

					/*
					 * A quote must be after a level start, after a quoted
					 * element start, or after an element delimiter. In any
					 * case we now must be past an element start.
					 */
					if (parse_state != SET_LEVEL_STARTED &&
						parse_state != SET_QUOTED_ELEM_STARTED &&
						parse_state != SET_ELEM_DELIMITED)
						ereturn(escontext, -1,
								(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
								 errmsg("malformed array literal: \"%s\"", str),
								 errdetail("Unexpected array element.")));
					in_quotes = !in_quotes;
					if (in_quotes)
						parse_state = SET_QUOTED_ELEM_STARTED;
					else
						parse_state = SET_QUOTED_ELEM_COMPLETED;
					break;
				case '{':
					if (!in_quotes)
					{
						/*
						 * A left brace can occur if no nesting has occurred
						 * yet, after a level start, or after a level
						 * delimiter.
						 */
						if (parse_state != SET_NO_LEVEL &&
							parse_state != SET_LEVEL_STARTED &&
							parse_state != SET_LEVEL_DELIMITED)
							ereturn(escontext, -1,
									(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
									 errmsg("malformed array literal: \"%s\"", str),
									 errdetail("Unexpected \"%c\" character.",
											   '{')));
						parse_state = SET_LEVEL_STARTED;
						/* A set has exactly one level; reject any nesting. */
						if (nest_level >= 1)
							ereturn(escontext, -1,
									(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
									 errmsg("number of array dimensions (%d) exceeds the maximum allowed (%d)",
											nest_level + 1, 1)));
						nest_level++;
					}
					break;
				case '}':
					if (!in_quotes)
					{
						/*
						 * A right brace can occur after an element start, an
						 * element completion, a quoted element completion, or
						 * a level completion.
						 */
						if (parse_state != SET_ELEM_STARTED &&
							parse_state != SET_ELEM_COMPLETED &&
							parse_state != SET_QUOTED_ELEM_COMPLETED &&
							parse_state != SET_LEVEL_COMPLETED &&
							!(nest_level == 1 && parse_state == SET_LEVEL_STARTED))
							ereturn(escontext, -1,
									(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
									 errmsg("malformed array literal: \"%s\"", str),
									 errdetail("Unexpected \"%c\" character.",
											   '}')));
						parse_state = SET_LEVEL_COMPLETED;
						if (nest_level == 0)
							ereturn(escontext, -1,
									(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
									 errmsg("malformed array literal: \"%s\"", str),
									 errdetail("Unmatched \"%c\" character.", '}')));
						nest_level--;

						/* Closing the outermost level ends the scan. */
						if (nest_level == 0)
							eoArray = itemdone = true;
					}
					break;
				default:
					if (!in_quotes)
					{
						if (*ptr == typdelim)
						{
							/*
							 * Delimiters can occur after an element start, an
							 * element completion, a quoted element
							 * completion, or a level completion.
							 */
							if (parse_state != SET_ELEM_STARTED &&
								parse_state != SET_ELEM_COMPLETED &&
								parse_state != SET_QUOTED_ELEM_COMPLETED &&
								parse_state != SET_LEVEL_COMPLETED)
								ereturn(escontext, -1,
										(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
										 errmsg("malformed array literal: \"%s\"", str),
										 errdetail("Unexpected \"%c\" character.",
												   typdelim)));
							if (parse_state == SET_LEVEL_COMPLETED)
								parse_state = SET_LEVEL_DELIMITED;
							else
								parse_state = SET_ELEM_DELIMITED;
							itemdone = true;
							nelems++;
						}
						else if (!set_isspace(*ptr))
						{
							/*
							 * Other non-space characters must be after a
							 * level start, after an element start, or after
							 * an element delimiter. In any case we now must
							 * be past an element start.
							 */
							if (parse_state != SET_LEVEL_STARTED &&
								parse_state != SET_ELEM_STARTED &&
								parse_state != SET_ELEM_DELIMITED)
								ereturn(escontext, -1,
										(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
										 errmsg("malformed array literal: \"%s\"", str),
										 errdetail("Unexpected array element.")));
							parse_state = SET_ELEM_STARTED;
						}
					}
					break;
			}
			if (!itemdone)
				ptr++;
		}
		/* Step over the delimiter or closing brace that ended this item. */
		ptr++;
	}

	/* only whitespace is allowed after the closing brace */
	while (*ptr)
	{
		if (!set_isspace(*ptr++))
			ereturn(escontext, -1,
					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
					 errmsg("malformed array literal: \"%s\"", str),
					 errdetail("Junk after closing right brace.")));
	}
    
	/* special case for an empty array */
	if (empty_array)
		return 0;
    else
        return nelems;
}

Re: Do we want a hashset type?

Reply via email to