>From 934b234dfc48a9ed44ebca149a0db1518c6975c8 Mon Sep 17 00:00:00 2001
From: Robert Dubner <[email protected]>
Date: Thu, 26 Feb 2026 14:42:51 -0500
Subject: [PATCH] cobol: Fix FUNCTION TRIM.
The FUNCTION TRIM now works properly with UTF16 inputs.
According to the ISO specification, the return type of a number of
intrinsic functions is defined by the variable type of their first
parameter. A number of changes here cause more functions to honor that
requirement.
gcc/cobol/ChangeLog:
* parse.y: BASECONVERT and TRIM take their type from their first
parameter.
* parse_util.h (intrinsic_return_field): The function_descrs[] is
adjusted so that a number of functions take their return type from
their first calling parameter. intrinsic_return_field() has been
refined.
* symbols.cc (new_alphanumeric): Use set_explicit() instead of
set() in support of refined intrinsic function return type.
libgcobol/ChangeLog:
* intrinsic.cc (__gg__trim): Rewritten to work properly, and
avoid
unnecessary variable codeset encoding translation.
---
gcc/cobol/parse.y | 4 +-
gcc/cobol/parse_util.h | 86 +++++++++++++++++++++---------
gcc/cobol/symbols.cc | 22 +++++---
libgcobol/intrinsic.cc | 115 ++++++++++++++++++-----------------------
4 files changed, 128 insertions(+), 99 deletions(-)
diff --git a/gcc/cobol/parse.y b/gcc/cobol/parse.y
index 3ab0daa4c18..df7f29f9ce8 100644
--- a/gcc/cobol/parse.y
+++ b/gcc/cobol/parse.y
@@ -10892,7 +10892,7 @@ intrinsic: function_udf
| BASECONVERT '(' varg[r1] varg[r2] varg[r3] ')' {
location_set(@1);
- $$ = new_alphanumeric("BASECONVERT");
+ $$ = new_alphanumeric("BASECONVERT",
$r1->field->codeset.encoding);
cbl_unimplemented("BASECONVERT");
if( ! intrinsic_call_3($$, BASECONVERT, $r1, $r2, $r3
)) YYERROR;
}
@@ -11223,7 +11223,7 @@ intrinsic: function_udf
YYERROR;
break;
}
- $$ = new_alphanumeric("TRIM");
+ $$ = new_alphanumeric("TRIM",
$r1->field->codeset.encoding);
cbl_refer_t * how = new_reference($trim_trailing);
if( ! intrinsic_call_2($$, TRIM, $r1, how) ) YYERROR;
}
diff --git a/gcc/cobol/parse_util.h b/gcc/cobol/parse_util.h
index e3bcc79a68f..0537c60b5fd 100644
--- a/gcc/cobol/parse_util.h
+++ b/gcc/cobol/parse_util.h
@@ -46,15 +46,22 @@
* n variadic
* We use just A, I, N, or X, choosing the most general for each
parameter.
*
- * When FldInvalid is shown as the return type, it indicates that the
type
- * of the function is determined by the type of the first parameter.
+ * When FldInvalid is shown as the return type, it indicates that the
Integer
+ * vs. Numeric type of the function is determined by the type of the
first
+ * parameter.
+ *
+ * FldGroup is used when the first argument determines the encoding of
the
+ * temporary. This is for functions that can be Alphanumeric or
National.
*
* We use FldNumericBin5 for functions of type "Integer", and FldFloat
for
* functions of type "Numeric",
*/
+ #define IntOrNum FldInvalid
+ #define AnumOrNat FldGroup
+
static const function_descr_t function_descrs[] = {
{ ABS, "ABS",
- "__gg__abs", "N", {}, FldInvalid },
+ "__gg__abs", "N", {}, IntOrNum },
{ ACOS, "ACOS",
"__gg__acos", "N", {}, FldFloat },
{ ANNUITY, "ANNUITY",
@@ -63,7 +70,7 @@ static const function_descr_t function_descrs[] = {
"__gg__asin", "N", {}, FldFloat },
{ ATAN, "ATAN",
"__gg__atan", "N", {}, FldFloat },
- { BASECONVERT, "BASECONVERT",
+ { BASECONVERT, "BASECONVERT", // See
parse.y
"__gg__baseconvert", "XII", {}, FldAlphanumeric
},
{ BIT_OF, "BIT-OF",
"__gg__bit_of", "X", {}, FldAlphanumeric },
@@ -81,9 +88,9 @@ static const function_descr_t function_descrs[] = {
{ COMBINED_DATETIME, "COMBINED-DATETIME",
"__gg__combined_datetime", "IN", {}, FldFloat },
{ CONCAT, "CONCAT",
- "__gg__concat", "n", {}, FldAlphanumeric },
+ "__gg__concat", "n", {}, AnumOrNat },
{ CONVERT, "CONVERT",
- "__gg__convert", "XII", {}, FldAlphanumeric
},
+ "__gg__convert", "XII", {}, AnumOrNat },
{ COS, "COS",
"__gg__cos", "N", {}, FldFloat },
{ CURRENT_DATE, "CURRENT-DATE",
@@ -121,13 +128,13 @@ static const function_descr_t function_descrs[] = {
{ FIND_STRING, "FIND-STRING",
"__gg__find_string", "AXI", {}, FldNumericBin5
},
{ FORMATTED_CURRENT_DATE, "FORMATTED-CURRENT-DATE",
- "__gg__formatted_current_date", "X", {}, FldAlphanumeric },
+ "__gg__formatted_current_date", "X", {}, AnumOrNat },
{ FORMATTED_DATE, "FORMATTED-DATE",
- "__gg__formatted_date", "XX", {}, FldAlphanumeric },
+ "__gg__formatted_date", "XX", {}, AnumOrNat },
{ FORMATTED_DATETIME, "FORMATTED-DATETIME",
- "__gg__formatted_datetime", "XINI", {}, FldAlphanumeric },
+ "__gg__formatted_datetime", "XINI", {}, AnumOrNat },
{ FORMATTED_TIME, "FORMATTED-TIME",
- "__gg__formatted_time", "INI", {}, FldAlphanumeric },
+ "__gg__formatted_time", "INI", {}, AnumOrNat },
{ FRACTION_PART, "FRACTION-PART",
"__gg__fraction_part", "N", {}, FldFloat },
{ HEX_OF, "HEX-OF",
@@ -135,7 +142,7 @@ static const function_descr_t function_descrs[] = {
{ HEX_TO_CHAR, "HEX-TO-CHAR",
"__gg__hex_to_char", "X", {}, FldAlphanumeric },
{ HIGHEST_ALGEBRAIC, "HIGHEST-ALGEBRAIC",
- "__gg__highest_algebraic", "N", {}, FldInvalid },
+ "__gg__highest_algebraic", "N", {}, IntOrNum },
{ INTEGER, "INTEGER",
"__gg__integer", "N", {}, FldNumericBin5 },
// requires FldBoolean
@@ -164,11 +171,11 @@ static const function_descr_t function_descrs[] = {
{ LOG10, "LOG10",
"__gg__log10", "N", {}, FldFloat },
{ LOWER_CASE, "LOWER-CASE",
- "__gg__lower_case", "X", {}, FldAlphanumeric },
+ "__gg__lower_case", "X", {}, AnumOrNat },
{ LOWEST_ALGEBRAIC, "LOWEST-ALGEBRAIC",
- "__gg__lowest_algebraic", "N", {}, FldInvalid },
+ "__gg__lowest_algebraic", "N", {}, IntOrNum },
{ MAXX, "MAX",
- "__gg__max", "n", {}, FldInvalid },
+ "__gg__max", "n", {}, IntOrNum },
{ MEAN, "MEAN",
"__gg__mean", "n", {}, FldFloat },
{ MEDIAN, "MEDIAN",
@@ -176,7 +183,7 @@ static const function_descr_t function_descrs[] = {
{ MIDRANGE, "MIDRANGE",
"__gg__midrange", "n", {}, FldFloat },
{ MINN, "MIN",
- "__gg__min", "n", {}, FldInvalid },
+ "__gg__min", "n", {}, IntOrNum },
{ MOD, "MOD",
"__gg__mod", "IN", {}, FldNumericBin5 },
{ MODULE_NAME, "MODULE-NAME",
@@ -202,11 +209,11 @@ static const function_descr_t function_descrs[] = {
{ RANDOM, "RANDOM",
"__gg__random", "I", {}, FldFloat },
{ RANGE, "RANGE",
- "__gg__range", "n", {}, FldInvalid },
+ "__gg__range", "n", {}, IntOrNum },
{ REM, "REM",
"__gg__rem", "NN", {}, FldFloat },
{ REVERSE, "REVERSE",
- "__gg__reverse", "X", {}, FldAlphanumeric },
+ "__gg__reverse", "X", {}, AnumOrNat },
{ SECONDS_FROM_FORMATTED_TIME, "SECONDS-FROM-FORMATTED-TIME",
"__gg__seconds_from_formatted_time", "XX", {}, FldFloat },
{ SECONDS_PAST_MIDNIGHT, "SECONDS_PAST_MIDNIGHT",
@@ -216,7 +223,7 @@ static const function_descr_t function_descrs[] = {
{ SIN, "SIN",
"__gg__sin", "N", {}, FldFloat },
{ SMALLEST_ALGEBRAIC, "SMALLEST-ALGEBRAIC",
- "__gg__smallest_algebraic", "N", {}, FldInvalid },
+ "__gg__smallest_algebraic", "N", {}, IntOrNum },
{ SQRT, "SQRT",
"__gg__sqrt", "N", {}, FldFloat },
{ STANDARD_COMPARE, "STANDARD-COMPARE",
@@ -224,9 +231,9 @@ static const function_descr_t function_descrs[] = {
{ STANDARD_DEVIATION, "STANDARD-DEVIATION",
"__gg__standard_deviation", "n", {}, FldFloat },
{ SUBSTITUTE, "SUBSTITUTE",
- "__gg__substitute", "XXX", {}, FldAlphanumeric
},
+ "__gg__substitute", "XXX", {}, AnumOrNat },
{ SUM, "SUM",
- "__gg__sum", "n", {}, FldInvalid },
+ "__gg__sum", "n", {}, IntOrNum },
{ TAN, "TAN",
"__gg__tan", "N", {}, FldFloat },
{ TEST_DATE_YYYYMMDD, "TEST-DATE-YYYYMMDD",
@@ -241,8 +248,8 @@ static const function_descr_t function_descrs[] = {
"__gg__test_numval_c", "XXU", {}, FldNumericBin5 },
{ TEST_NUMVAL_F, "TEST-NUMVAL-F",
"__gg__test_numval_f", "X", {}, FldNumericBin5 },
- { TRIM, "TRIM",
- "__gg__trim", "XI", {}, FldNumericBin5 },
+ { TRIM, "TRIM", // See
parse.y
+ "__gg__trim", "XI", {}, FldAlphanumeric },
{ ULENGTH, "ULENGTH",
"__gg__ulength", "X", {}, FldAlphanumeric },
{ UPOS, "UPOS",
@@ -342,7 +349,8 @@ intrinsic_return_field(int token,
std::vector<cbl_refer_t> args)
retval = new_tempnumeric_float();
break;
case FldInvalid:
- // This is a flag that a function takes the type of its first input
+ // This is a flag that a function takes the Numeric vs Int type of
its
+ // first argument
assert( args.size() );
switch(args[0].field->type)
{
@@ -350,7 +358,7 @@ intrinsic_return_field(int token,
std::vector<cbl_refer_t> args)
case FldAlphanumeric:
case FldAlphaEdited:
case FldLiteralA:
- retval = new_alphanumeric();
+ retval = new_alphanumeric(NULL,
args[0].field->codeset.encoding);
break;
case FldNumericBinary:
case FldPacked:
@@ -370,6 +378,36 @@ intrinsic_return_field(int token,
std::vector<cbl_refer_t> args)
break;
}
break;
+
+ case FldGroup:
+ // This is a flag that an alphanumeric function takes the encoding
of the
+ // first argument
+ assert( args.size() );
+ switch(args[0].field->type)
+ {
+ case FldGroup:
+ case FldAlphanumeric:
+ case FldAlphaEdited:
+ case FldLiteralA:
+ case FldNumericBinary:
+ case FldPacked:
+ case FldNumericDisplay:
+ case FldNumericBin5:
+ case FldLiteralN:
+ case FldIndex:
+ case FldPointer:
+ retval = new_alphanumeric(NULL,
args[0].field->codeset.encoding);
+ break;
+ case FldFloat:
+ retval = new_tempnumeric_float();
+ break;
+ default:
+ retval = NULL;
+ gcc_unreachable();
+ break;
+ }
+ break;
+
default:
retval = NULL;
gcc_unreachable();
diff --git a/gcc/cobol/symbols.cc b/gcc/cobol/symbols.cc
index 99d638e8c16..55c40ffa5ca 100644
--- a/gcc/cobol/symbols.cc
+++ b/gcc/cobol/symbols.cc
@@ -3783,14 +3783,22 @@ symbol_temporaries_free() {
cbl_field_t *
new_alphanumeric( const cbl_name_t name, cbl_encoding_t encoding ) {
cbl_field_t * field = new_temporary_impl(FldAlphanumeric, name);
+//// if( encoding != no_encoding_e ) {
+//// field->codeset.set(encoding);
+//// }
+//// //// Dubner hacking away: If name is non-null, then assume this is
a
+//// //// function definition, and force the codeset, which otherwise
will have
+//// //// defaulted to current_encoding('A'), and the valid() test in
codeset.set
+//// //// will have prevented it from being changed.
+//// if( name && encoding != no_encoding_e ) {
+//// field->codeset.set_explicit(encoding);
+//// }
+ /* Jim's original code was hedged with protections apparently intended
to
+ prevent encodings from changing. This proved unsatisfactor,
especially
+ when I started implementing setting the temporary return type of
functions
+ that take on the characteristics of their first parameter. So, I
went
+ from codeset.set_encoding() to codeset.set_explicit(). */
if( encoding != no_encoding_e ) {
- field->codeset.set(encoding);
- }
- //// Dubner hacking away: If name is non-null, then assume this is a
- //// function definition, and force the codeset, which otherwise will
have
- //// defaulted to current_encoding('A'), and the valid() test in
codeset.set
- //// will have prevented it from being changed.
- if( name && encoding != no_encoding_e ) {
field->codeset.set_explicit(encoding);
}
temporaries.add(field);
diff --git a/libgcobol/intrinsic.cc b/libgcobol/intrinsic.cc
index d3206b89679..8c1119e1dee 100644
--- a/libgcobol/intrinsic.cc
+++ b/libgcobol/intrinsic.cc
@@ -3527,9 +3527,11 @@ __gg__trim( cblc_field_t *dest,
size_t arg2_offset,
size_t arg2_size)
{
- cbl_encoding_t from = arg1->encoding;
- cbl_encoding_t to = dest->encoding;
- charmap_t *charmap = __gg__get_charmap(to);
+ // We assume that dest is an intermediate_e with the same encoding as
arg1.
+ assert( dest->type == FldAlphanumeric
+ && (dest->attr & intermediate_e)
+ && dest->encoding == arg1->encoding );
+ charmap_t *charmap = __gg__get_charmap(arg1->encoding);
int stride = charmap->stride();
cbl_char_t mapped_space = charmap->mapped_character(ascii_space);
@@ -3539,80 +3541,61 @@ __gg__trim( cblc_field_t *dest,
arg2_offset,
arg2_size);
//static const int BOTH = 0;
- static const int LEADING = 1; // Remove leading spaces
- static const int TRAILING = 2; // Remove trailing spaces
-
- if( dest->type != FldAlphanumeric ||
- !(dest->attr & intermediate_e) )
- {
- fprintf(stderr,
- "We expect the target of a FUNCTION TRIM to "
- "be an intermediate alphanumeric\n");
- abort();
- }
+ #define LEADING 1 // Remove leading spaces
+ #define TRAILING 2 // Remove trailing spaces
- // What is this all about?
- dest->capacity = dest->offset;
-
- // Make a copy of the input:
- char *copy = static_cast<char *>(malloc(arg1_size));
- massert(copy);
- memcpy(copy, arg1->data+arg1_offset, arg1_size);
-
- // Convert it to the destination encoding
- __gg__convert_encoding_length(copy, arg1_size, from, to);
-
- // No matter what, we want to find the leftmost non-space and the
- // rightmost non-space:
-
- char *left = copy;
- char *right = left + arg1_size-stride;
-
- // Find left and right: the first and last non-spaces
- while( left <= right )
+ char *left = reinterpret_cast<char *>(arg1->data) + arg1_offset;
+ char *right = left + arg1_size-stride; // Points AT the character, not
beyond
+ switch(type)
{
- cbl_char_t cleft = charmap->getch(left, (size_t)0);
- cbl_char_t cright = charmap->getch(right, (size_t)0);
-
- if( cleft != mapped_space && cright != mapped_space )
- {
+ case 0: // Strip off leading and trailing spaces
+ while(left <= right)
+ {
+ if( charmap->getch(left, (size_t)0) != mapped_space )
+ {
+ break;
+ }
+ left += stride;
+ }
+ while(left <= right)
+ {
+ if( charmap->getch(right, (size_t)0) != mapped_space )
+ {
+ break;
+ }
+ right -= stride;
+ }
break;
- }
- if( cleft == mapped_space )
+
+ case LEADING: // Just leading
{
- left += stride;
+ while(left <= right)
+ {
+ if( charmap->getch(left, (size_t)0) != mapped_space )
+ {
+ break;
+ }
+ left += stride;
+ }
+ break;
}
- if( cright == mapped_space )
+
+ case TRAILING: // Just trailing
{
- right -= stride;
+ while(left <= right)
+ {
+ if( charmap->getch(right, (size_t)0) != mapped_space )
+ {
+ break;
+ }
+ right -= stride;
+ }
+ break;
}
}
- if( type == LEADING )
- {
- // We want to leave any trailing spaces, so we return 'right' to its
- // original value:
- right = copy + arg1_size-1;
- }
- else if( type == TRAILING )
- {
- // We want to leave any leading spaces, so we return 'left' to its
- // original value:
- left = copy;
- }
-
- if( left > right )
- {
- // When the arg1 input string was empty, we want left to be right+1.
- // The left/right loop can sometimes end up with left equal to
right+2.
- // That needs to be fixed:
- left = right+stride;
- }
-
size_t ncount = right+stride - left;
__gg__adjust_dest_size(dest, ncount);
-
memmove(dest->data, left, ncount);
- free(copy);
}
#if HAVE_INITSTATE_R && HAVE_SRANDOM_R && HAVE_RANDOM_R
--
2.34.1