From c3b3ad5d352920d2383122cb915e49e3d7d64cc7 Mon Sep 17 00:00:00 2001
From: Grisha Levit <grishalevit@gmail.com>
Date: Wed, 28 Jun 2023 17:57:03 -0400
Subject: [PATCH 1/2] <<# indent-stripping heredoc

command.h,{copy,dispose,make,print}_cmd.c,parse.y,redir.c
- rename r_deblank_reading_until to r_detab_reading_until. Reflects
  change in `<<-' functionality made back in bash-1.11

command.h
- enum r_instruction: new value r_unindent_reading_until for <<# heredoc

parse.y
- add new LESS_LESS_HASH token, produced by <<# redirection

make_cmd.c
- make_here_document: kill_leading now takes on value of 2 for <<# doc.
  When reading <<# doc body, count columns of leading blanks on first
  line (tabs are treated as moving up to the next 8-char-wide tab stop).
  For subsequent lines, strip up to that num cols of leading whitespace.

{copy,dispose,make,print}_cmd.c,redir.c
- handle r_unindent_reading_until

doc/{bash.1,bashref.texi}
- document `<<#'

tests/heredoc{10.sub,.tests,.right}
- new test script for <<# heredoc. Produces same output as ksh93
---
 command.h           |   4 +-
 copy_cmd.c          |   3 +-
 dispose_cmd.c       |   3 +-
 doc/bash.1          |   8 ++-
 doc/bashref.texi    |   8 ++-
 make_cmd.c          |  49 ++++++++++++++----
 parse.y             |  34 ++++++++++--
 print_cmd.c         |  24 +++++----
 redir.c             |   6 ++-
 tests/heredoc.right | 121 ++++++++++++++++++++++++++++++++++++++++++-
 tests/heredoc.tests |   3 ++
 tests/heredoc10.sub | 122 ++++++++++++++++++++++++++++++++++++++++++++
 12 files changed, 351 insertions(+), 34 deletions(-)
 create mode 100755 tests/heredoc10.sub

diff --git a/command.h b/command.h
index 025a18c4..4d8a560a 100644
--- a/command.h
+++ b/command.h
@@ -28,11 +28,11 @@
 enum r_instruction {
   r_output_direction, r_input_direction, r_inputa_direction,
   r_appending_to, r_reading_until, r_reading_string,
-  r_duplicating_input, r_duplicating_output, r_deblank_reading_until,
+  r_duplicating_input, r_duplicating_output, r_detab_reading_until,
   r_close_this, r_err_and_out, r_input_output, r_output_force,
   r_duplicating_input_word, r_duplicating_output_word,
   r_move_input, r_move_output, r_move_input_word, r_move_output_word,
-  r_append_err_and_out
+  r_append_err_and_out, r_unindent_reading_until
 };
 
 /* Redirection flags; values for rflags */
diff --git a/copy_cmd.c b/copy_cmd.c
index 6426c016..ffae0409 100644
--- a/copy_cmd.c
+++ b/copy_cmd.c
@@ -124,7 +124,8 @@ copy_redirect (REDIRECT *redirect)
   switch (redirect->instruction)
     {
     case r_reading_until:
-    case r_deblank_reading_until:
+    case r_detab_reading_until:
+    case r_unindent_reading_until:
       new_redirect->here_doc_eof = redirect->here_doc_eof ? savestring (redirect->here_doc_eof) : 0;
       /*FALLTHROUGH*/
     case r_reading_string:
diff --git a/dispose_cmd.c b/dispose_cmd.c
index d2821fed..0c857502 100644
--- a/dispose_cmd.c
+++ b/dispose_cmd.c
@@ -319,7 +319,8 @@ dispose_redirects (REDIRECT *list)
       switch (t->instruction)
 	{
 	case r_reading_until:
-	case r_deblank_reading_until:
+	case r_detab_reading_until:
+	case r_unindent_reading_until:
 	  free (t->here_doc_eof);
 	/*FALLTHROUGH*/
 	case r_reading_string:
diff --git a/doc/bash.1 b/doc/bash.1
index 09e25ecf..d76b959e 100644
--- a/doc/bash.1
+++ b/doc/bash.1
@@ -4407,7 +4407,13 @@ If the redirection operator is
 then all leading tab characters are stripped from input lines and the
 line containing
 .IR delimiter .
-This allows
+If the redirection operator is
+.BR <<# ,
+then all leading blank characters are stripped from the first input
+line and indentation of at most that width is stripped from subsequent
+input lines and the line containing
+.IR delimiter .
+These allow
 here-documents within shell scripts to be indented in a
 natural fashion.
 .SS "Here Strings"
diff --git a/doc/bashref.texi b/doc/bashref.texi
index 9f91dccb..a2c90dd3 100644
--- a/doc/bashref.texi
+++ b/doc/bashref.texi
@@ -3226,7 +3226,7 @@ input (or file descriptor @var{n} if @var{n} is specified) for a command.
 
 The format of here-documents is:
 @example
-[@var{n}]<<[@minus{}]@var{word}
+[@var{n}]<<[@minus{}|#]@var{word}
         @var{here-document}
 @var{delimiter}
 @end example
@@ -3249,7 +3249,11 @@ must be used to quote the characters
 If the redirection operator is @samp{<<-},
 then all leading tab characters are stripped from input lines and the
 line containing @var{delimiter}.
-This allows here-documents within shell scripts to be indented in a
+If the redirection operator is @samp{<<#},
+then all leading blank characters are stripped from the first input
+line and indentation of at most that width is stripped from subsequent
+input lines and the line containing @var{delimiter}.
+These allow here-documents within shell scripts to be indented in a
 natural fashion.
 
 @subsection Here Strings
diff --git a/make_cmd.c b/make_cmd.c
index ea2e18ba..9f774e3f 100644
--- a/make_cmd.c
+++ b/make_cmd.c
@@ -504,10 +504,11 @@ make_simple_command (ELEMENT element, COMMAND *command)
 }
 
 /* Because we are Bourne compatible, we read the input for this
-   << or <<- redirection now, from wherever input is coming from.
+   << or <<[-#] redirection now, from wherever input is coming from.
    We store the input read into a WORD_DESC.  Replace the text of
    the redirectee.word with the new input text.  If <<- is on,
-   then remove leading TABS from each line. */
+   then remove leading TABS from each line.  If <<# is on, then
+   remove indentation from each line. */
 void
 make_here_document (REDIRECT *temp, int lineno)
 {
@@ -516,16 +517,25 @@ make_here_document (REDIRECT *temp, int lineno)
   char *redir_word, *document, *full_line;
   int document_index, delim_unquoted;
   size_t document_size;
+  ssize_t kill_cols;
 
-  if (temp->instruction != r_deblank_reading_until &&
-      temp->instruction != r_reading_until)
+  switch (temp->instruction)
     {
+    case r_reading_until:
+      kill_leading = 0;
+      break;
+    case r_detab_reading_until:
+      kill_leading = 1;		/* remove leading tabs */
+      break;
+    case r_unindent_reading_until:
+      kill_leading = 2;		/* remove indentation */
+      kill_cols = -1;		/* width of indentation to remove */
+      break;
+    default:
       internal_error (_("make_here_document: bad instruction type %d"), temp->instruction);
       return;
     }
 
-  kill_leading = temp->instruction == r_deblank_reading_until;
-
   full_line = document = (char *)NULL;
   document_index = 0;
   document_size = 0;
@@ -558,7 +568,7 @@ make_here_document (REDIRECT *temp, int lineno)
 
   /* Read lines from wherever lines are coming from.
      For each line read, if kill_leading, then kill the
-     leading tab characters.
+     leading tab characters or indentation.
      If the line matches redir_word exactly, then we have
      manufactured the document.  Otherwise, add the line to the
      list of lines in the document. */
@@ -580,16 +590,34 @@ make_here_document (REDIRECT *temp, int lineno)
       if (echo_input_at_read)
 	fprintf (stderr, "%s", line);
 
-      if (kill_leading && *line)
+      if (kill_leading == 1 && *line)
 	{
 	  /* Hack:  To be compatible with some Bourne shells, we
-	     check the word before stripping the whitespace.  This
+	     check the word before stripping the tabs.  This
 	     is a hack, though. */
 	  if (STREQN (line, redir_word, redir_len) && line[redir_len] == '\n')
 	    break;
 
 	  while (*line == '\t')
 	    line++;
+        }
+      else if (kill_leading == 2)
+	{
+	  size_t cols;
+
+	  /* If kill_cols == -1, we are reading the first line and counting how
+             many columns of indentation are formed by the leading blanks.
+	     Otherwise, kill_cols is the width of the indentation of the first
+             line, so skip leading blanks totalling at most kill_cols width. */
+	  for (cols = 0; shellblank (*line); line++)
+	    {
+	      cols += *line == '\t' ? (8 - cols % 8) : 1;
+	      if (kill_cols != -1 && cols > kill_cols)
+		break;
+	    }
+
+	  if (kill_cols == -1)
+	    kill_cols = cols;
 	}
 
       if (*line == 0)
@@ -679,7 +707,8 @@ make_redirection (REDIRECTEE source, enum r_instruction instruction, REDIRECTEE
       temp->flags = O_RDWR | O_CREAT;
       break;
 
-    case r_deblank_reading_until: 	/* <<-foo */
+    case r_detab_reading_until: 	/* <<-foo */
+    case r_unindent_reading_until: 	/* <<#foo */
     case r_reading_until:		/* << foo */
     case r_reading_string:		/* <<< foo */
     case r_close_this:			/* <&- */
diff --git a/parse.y b/parse.y
index 6701d596..223b846a 100644
--- a/parse.y
+++ b/parse.y
@@ -382,7 +382,7 @@ static FILE *yyerrstream;
 %token AND_AND OR_OR GREATER_GREATER LESS_LESS LESS_AND LESS_LESS_LESS
 %token GREATER_AND SEMI_SEMI SEMI_AND SEMI_SEMI_AND
 %token LESS_LESS_MINUS AND_GREATER AND_GREATER_GREATER LESS_GREATER
-%token GREATER_BAR BAR_AND
+%token GREATER_BAR BAR_AND LESS_LESS_HASH
 
 /* Special; never created by yylex; only set by parse_comsub and xparse_dolparen */
 %token DOLPAREN
@@ -629,21 +629,42 @@ redirection:	'>' WORD
 			{
 			  source.dest = 0;
 			  redir.filename = $2;
-			  $$ = make_redirection (source, r_deblank_reading_until, redir, 0);
+			  $$ = make_redirection (source, r_detab_reading_until, redir, 0);
 			  push_heredoc ($$);
 			}
 	|	NUMBER LESS_LESS_MINUS WORD
 			{
 			  source.dest = $1;
 			  redir.filename = $3;
-			  $$ = make_redirection (source, r_deblank_reading_until, redir, 0);
+			  $$ = make_redirection (source, r_detab_reading_until, redir, 0);
 			  push_heredoc ($$);
 			}
 	|	REDIR_WORD  LESS_LESS_MINUS WORD
 			{
 			  source.filename = $1;
 			  redir.filename = $3;
-			  $$ = make_redirection (source, r_deblank_reading_until, redir, REDIR_VARASSIGN);
+			  $$ = make_redirection (source, r_detab_reading_until, redir, REDIR_VARASSIGN);
+			  push_heredoc ($$);
+			}
+	|	LESS_LESS_HASH WORD
+			{
+			  source.dest = 0;
+			  redir.filename = $2;
+			  $$ = make_redirection (source, r_unindent_reading_until, redir, 0);
+			  push_heredoc ($$);
+			}
+	|	NUMBER LESS_LESS_HASH WORD
+			{
+			  source.dest = $1;
+			  redir.filename = $3;
+			  $$ = make_redirection (source, r_unindent_reading_until, redir, 0);
+			  push_heredoc ($$);
+			}
+	|	REDIR_WORD  LESS_LESS_HASH WORD
+			{
+			  source.filename = $1;
+			  redir.filename = $3;
+			  $$ = make_redirection (source, r_unindent_reading_until, redir, REDIR_VARASSIGN);
 			  push_heredoc ($$);
 			}
 	|	LESS_LESS_LESS WORD
@@ -2292,6 +2313,7 @@ STRING_INT_ALIST other_token_alist[] = {
   { ";&", SEMI_AND },
   { ";;&", SEMI_SEMI_AND },
   { "<<-", LESS_LESS_MINUS },
+  { "<<#", LESS_LESS_HASH },
   { "<<<", LESS_LESS_LESS },
   { "&>", AND_GREATER },
   { "&>>", AND_GREATER_GREATER },
@@ -3030,7 +3052,7 @@ static int open_brace_count;
   (token == '<' || token == '>' || \
    token == GREATER_GREATER || token == GREATER_BAR || \
    token == LESS_GREATER || token == LESS_LESS_MINUS || \
-   token == LESS_LESS || token == LESS_LESS_LESS || \
+   token == LESS_LESS || token == LESS_LESS_LESS || token == LESS_LESS_HASH || \
    token == LESS_AND || token == GREATER_AND || token == AND_GREATER)
 
 /* Is `token' one that will allow a WORD to be read in a command position?
@@ -3564,6 +3586,8 @@ read_token (int command)
 	      peek_char = shell_getc (1);
 	      if MBTEST(peek_char == '-')
 		return (LESS_LESS_MINUS);
+	      else if MBTEST(peek_char == '#')
+		return (LESS_LESS_HASH);
 	      else if MBTEST(peek_char == '<')
 		return (LESS_LESS_LESS);
 	      else
diff --git a/print_cmd.c b/print_cmd.c
index 30e354d3..ac18109b 100644
--- a/print_cmd.c
+++ b/print_cmd.c
@@ -1039,7 +1039,8 @@ print_redirection_list (REDIRECT *redirects)
     {
       /* Defer printing the here document bodies until we've printed the rest of the
          redirections, but print the headers in the order they're given.  */
-      if (redirects->instruction == r_reading_until || redirects->instruction == r_deblank_reading_until)
+      if (redirects->instruction == r_reading_until || redirects->instruction == r_detab_reading_until
+          || redirects->instruction == r_unindent_reading_until)
 	{
 	  newredir = copy_redirect (redirects);
 	  newredir->next = (REDIRECT *)NULL;
@@ -1077,26 +1078,30 @@ print_redirection_list (REDIRECT *redirects)
 static void
 print_heredoc_header (REDIRECT *redirect)
 {
-  int kill_leading;
-  char *x;
-
-  kill_leading = redirect->instruction == r_deblank_reading_until;
-
   /* Here doc header */
   if (redirect->rflags & REDIR_VARASSIGN)
     cprintf ("{%s}", redirect->redirector.filename->word);
   else if (redirect->redirector.dest != 0)
     cprintf ("%d", redirect->redirector.dest);
 
+  if (redirect->instruction == r_detab_reading_until)
+    cprintf ("<<-");
+  else if (redirect->instruction == r_unindent_reading_until)
+    cprintf ("<<#");
+  else
+    cprintf ("<<");
+
   /* If the here document delimiter is quoted, single-quote it. */
   if (redirect->redirectee.filename->flags & W_QUOTED)
     {
+      char *x;
+
       x = sh_single_quote (redirect->here_doc_eof);
-      cprintf ("<<%s%s", kill_leading ? "-" : "", x);
+      cprintf ("%s", x);
       free (x);
     }
   else
-    cprintf ("<<%s%s", kill_leading ? "-" : "", redirect->here_doc_eof);
+    cprintf ("%s", redirect->here_doc_eof);
 }
 
 static void
@@ -1164,8 +1169,9 @@ print_redirection (REDIRECT *redirect)
       cprintf ("<> %s", redirectee->word);
       break;
 
-    case r_deblank_reading_until:
     case r_reading_until:
+    case r_detab_reading_until:
+    case r_unindent_reading_until:
       print_heredoc_header (redirect);
       cprintf ("\n");
       print_heredoc_body (redirect);
diff --git a/redir.c b/redir.c
index 267a9fc8..afc73585 100644
--- a/redir.c
+++ b/redir.c
@@ -1010,7 +1010,8 @@ do_redirection_internal (REDIRECT *redirect, int flags, char **fnp)
       break;
 
     case r_reading_until:
-    case r_deblank_reading_until:
+    case r_detab_reading_until:
+    case r_unindent_reading_until:
     case r_reading_string:
       /* REDIRECTEE is a pointer to a WORD_DESC containing the text of
 	 the new input.  Place it in a temporary file. */
@@ -1353,7 +1354,8 @@ stdin_redirection (enum r_instruction ri, int redirector)
     case r_inputa_direction:
     case r_input_output:
     case r_reading_until:
-    case r_deblank_reading_until:
+    case r_detab_reading_until:
+    case r_unindent_reading_until:
     case r_reading_string:
       return (1);
     case r_duplicating_input:
diff --git a/tests/heredoc.right b/tests/heredoc.right
index b7042ff9..ee72b755 100644
--- a/tests/heredoc.right
+++ b/tests/heredoc.right
@@ -149,7 +149,126 @@ HERE
         echo 3 4;
     done
 }
+4SP
+4SP
+	TAB
+    8SP
+	1SP TAB
+	 TAB 1SP
+     9SP
+	 1SP TAB 1SP
+		TAB TAB
+	 	TAB 1SP TAB
+...
+TAB
+4SP
+TAB
+8SP
+1SP TAB
+ TAB 1SP
+ 9SP
+ 1SP TAB 1SP
+	TAB TAB
+ 	TAB 1SP TAB
+...
+8SP
+4SP
+TAB
+8SP
+1SP TAB
+ TAB 1SP
+ 9SP
+ 1SP TAB 1SP
+	TAB TAB
+ 	TAB 1SP TAB
+...
+1SP TAB
+4SP
+TAB
+8SP
+1SP TAB
+ TAB 1SP
+ 9SP
+ 1SP TAB 1SP
+	TAB TAB
+ 	TAB 1SP TAB
+...
+TAB 1SP
+4SP
+TAB
+8SP
+1SP TAB
+TAB 1SP
+9SP
+1SP TAB 1SP
+	TAB TAB
+	TAB 1SP TAB
+...
+9SP
+4SP
+TAB
+8SP
+1SP TAB
+TAB 1SP
+9SP
+1SP TAB 1SP
+	TAB TAB
+	TAB 1SP TAB
+...
+1SP TAB 1SP
+4SP
+TAB
+8SP
+1SP TAB
+TAB 1SP
+9SP
+1SP TAB 1SP
+	TAB TAB
+	TAB 1SP TAB
+...
+TAB TAB
+4SP
+TAB
+8SP
+1SP TAB
+TAB 1SP
+9SP
+1SP TAB 1SP
+TAB TAB
+TAB 1SP TAB
+...
+TAB 1SP TAB
+4SP
+TAB
+8SP
+1SP TAB
+TAB 1SP
+9SP
+1SP TAB 1SP
+TAB TAB
+TAB 1SP TAB
+...
+...
+--
+ E2
+...
+--
+...
+--
+...
+S1
+...
+S2
+  S2
+...
+S3
+...
+S4
+...
+S5
+S5
+...
 comsub here-string
-./heredoc.tests: line 159: warning: here-document at line 157 delimited by end-of-file (wanted `EOF')
+./heredoc.tests: line 162: warning: here-document at line 160 delimited by end-of-file (wanted `EOF')
 hi
 there
diff --git a/tests/heredoc.tests b/tests/heredoc.tests
index d3da798d..eb45144e 100644
--- a/tests/heredoc.tests
+++ b/tests/heredoc.tests
@@ -147,6 +147,9 @@ ${THIS_SH} ./heredoc8.sub
 # various tests for printing here-documents in function bodies
 ${THIS_SH} ./heredoc9.sub
 
+# tests for <<# whitespace-skipping
+${THIS_SH} ./heredoc10.sub
+
 echo $(
 	cat <<< "comsub here-string"
 )
diff --git a/tests/heredoc10.sub b/tests/heredoc10.sub
new file mode 100755
index 00000000..c28f1db9
--- /dev/null
+++ b/tests/heredoc10.sub
@@ -0,0 +1,122 @@
+#!/bin/bash
+
+### This script produces the same output for bash and ksh (93u+m/1.0.6)
+
+
+### Tests for stripping leading blanks and respecting tabstop alignment
+
+chkdoc() {
+eval "
+cat <<# EOF
+$1
+    4SP
+	TAB
+        8SP
+ 	1SP TAB
+	 TAB 1SP
+         9SP
+ 	 1SP TAB 1SP
+		TAB TAB
+	 	TAB 1SP TAB
+EOF
+echo ..."
+}
+
+# check various indentation of first heredoc line
+chkdoc '    4SP'
+chkdoc '	TAB'
+chkdoc '        8SP'
+chkdoc ' 	1SP TAB'
+chkdoc '	 TAB 1SP'
+chkdoc '         9SP'
+chkdoc ' 	 1SP TAB 1SP'
+chkdoc '		TAB TAB'
+chkdoc '	 	TAB 1SP TAB'
+
+
+### Tests for recognizing delimiter
+
+# recognize delim on first line after stripping any indentation
+eval '
+cat <<# E1
+ E1
+echo ...
+'
+
+# do not recognize delim on a line with more indentation than on first line
+eval '
+cat <<# E2
+--
+ E2
+E2
+echo ...
+'
+
+# recognize delim on a line with same indentation as on first line
+eval '
+cat <<# E3
+ --
+ E3
+echo ...
+'
+
+# recognize delim on a line with less indentation than on first line
+eval '
+cat <<# E4
+  --
+ E4
+echo ...
+'
+
+### Tests for delimiter with leading blanks
+
+# If the delim has leading blanks, interpretation of the first line of the
+# heredoc is ambiguous in the case that it ends with the delim.
+# Like ksh93, treat any leading blanks on the first line as specifying the
+# heredoc's indentation, not as part of the delim.
+
+# recognize delim w/ leading blanks if first line has no indentation
+eval '
+cat <<# " S1"
+S1
+ S1
+echo ...
+'
+
+# do not recognize delim w/ leading blanks on a line with more indentation than
+# on first line
+eval '
+cat <<# " S2"
+S2
+  S2
+ S2
+echo ...
+'
+
+# recognize delim w/ leading blanks on subsequent lines after stripping
+# indentation found on first line
+eval '
+cat <<# " S3"
+  S3
+   S3
+echo ...
+'
+
+# do not recognize delim w/ leading blanks on first line even if first line
+# exactly matches the delim
+eval '
+cat <<# " S4"
+ S4
+  S4
+echo ...
+'
+
+# if first line had indentation, do not recognize delim w/ leading blanks on
+# subsequent lines that exactly match delim before stripping indentation
+eval '
+cat <<# " S5"
+ S5
+ S5
+  S5
+echo ...
+'
-- 
2.41.0

