Module Name:    src
Committed By:   rillig
Date:           Sun Mar  7 11:32:06 UTC 2021

Modified Files:
        src/usr.bin/indent: indent.h lexi.c parse.c

Log Message:
indent: in debug mode, output detailed token information

The main ingredients for understanding how indent works are the tokenizer
and the 4 buffers in which the text is collected.

Inspecting this debug log for the test comment-line-end makes it obvious
why indent messes up code that contains '//' comments.  The cause is
that indent interprets '//' as an operator, just like '&&' or '||'.  The
sequence '/////' is interpreted as a single operator as well, by the
way.

Since '//' is interpreted as an ordinary operator, any words following
it are plain identifiers, usually several of them in a row, which is a
syntax error.  Depending on the context, the operator '//' is either a
unary operator (no space around) or a binary operator (space around).
This explains why the word 'line-end' is expanded to 'line - end'.

No functional change outside of debug mode.


To generate a diff of this commit:
cvs rdiff -u -r1.4 -r1.5 src/usr.bin/indent/indent.h
cvs rdiff -u -r1.19 -r1.20 src/usr.bin/indent/lexi.c
cvs rdiff -u -r1.13 -r1.14 src/usr.bin/indent/parse.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/usr.bin/indent/indent.h
diff -u src/usr.bin/indent/indent.h:1.4 src/usr.bin/indent/indent.h:1.5
--- src/usr.bin/indent/indent.h:1.4	Sun Mar  7 10:56:18 2021
+++ src/usr.bin/indent/indent.h	Sun Mar  7 11:32:05 2021
@@ -1,4 +1,4 @@
-/*	$NetBSD: indent.h,v 1.4 2021/03/07 10:56:18 rillig Exp $	*/
+/*	$NetBSD: indent.h,v 1.5 2021/03/07 11:32:05 rillig Exp $	*/
 
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
@@ -30,7 +30,7 @@
 
 #if 0
 #if defined(__NetBSD__)
-__RCSID("$NetBSD: indent.h,v 1.4 2021/03/07 10:56:18 rillig Exp $");
+__RCSID("$NetBSD: indent.h,v 1.5 2021/03/07 11:32:05 rillig Exp $");
 #elif defined(__FreeBSD__)
 __FBSDID("$FreeBSD: head/usr.bin/indent/indent.h 336333 2018-07-16 05:46:50Z pstef $");
 #endif
@@ -50,6 +50,9 @@ int	compute_label_target(void);
 int	count_spaces(int, char *);
 int	count_spaces_until(int, char *, char *);
 void	init_constant_tt(void);
+#ifdef debug
+const char *token_type_name(token_type);
+#endif
 token_type lexi(struct parser_state *);
 void	diag(int, const char *, ...) __printflike(2, 3);
 void	dump_line(void);

Index: src/usr.bin/indent/lexi.c
diff -u src/usr.bin/indent/lexi.c:1.19 src/usr.bin/indent/lexi.c:1.20
--- src/usr.bin/indent/lexi.c:1.19	Sun Mar  7 10:56:18 2021
+++ src/usr.bin/indent/lexi.c	Sun Mar  7 11:32:05 2021
@@ -1,4 +1,4 @@
-/*	$NetBSD: lexi.c,v 1.19 2021/03/07 10:56:18 rillig Exp $	*/
+/*	$NetBSD: lexi.c,v 1.20 2021/03/07 11:32:05 rillig Exp $	*/
 
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
@@ -46,7 +46,7 @@ static char sccsid[] = "@(#)lexi.c	8.1 (
 #include <sys/cdefs.h>
 #ifndef lint
 #if defined(__NetBSD__)
-__RCSID("$NetBSD: lexi.c,v 1.19 2021/03/07 10:56:18 rillig Exp $");
+__RCSID("$NetBSD: lexi.c,v 1.20 2021/03/07 11:32:05 rillig Exp $");
 #elif defined(__FreeBSD__)
 __FBSDID("$FreeBSD: head/usr.bin/indent/lexi.c 337862 2018-08-15 18:19:45Z pstef $");
 #endif
@@ -58,6 +58,7 @@ __FBSDID("$FreeBSD: head/usr.bin/indent/
  * of token scanned.
  */
 
+#include <assert.h>
 #include <err.h>
 #include <stdio.h>
 #include <ctype.h>
@@ -176,6 +177,62 @@ strcmp_type(const void *e1, const void *
     return (strcmp(e1, *(const char * const *)e2));
 }
 
+#ifdef debug
+const char *
+token_type_name(token_type tk)
+{
+    static const char *const name[] = {
+	"end_of_file", "newline", "lparen", "rparen", "unary_op",
+	"binary_op", "postop", "question", "casestmt", "colon",
+	"semicolon", "lbrace", "rbrace", "ident", "comma",
+	"comment", "swstmt", "preesc", "form_feed", "decl",
+	"sp_paren", "sp_nparen", "ifstmt", "whilestmt", "forstmt",
+	"stmt", "stmtl", "elselit", "dolit", "dohead",
+	"ifhead", "elsehead", "period", "strpfx", "storage",
+	"funcname", "type_def", "structure"
+    };
+    /* Debug-only: look up the printable name of tk in the table above. */
+    assert(0 <= tk && tk < sizeof name / sizeof name[0]);
+    /* NOTE(review): table order presumably mirrors enum token_type; confirm. */
+    return name[tk];
+}
+
+static void
+print_buf(const char *name, const char *s, const char *e)
+{
+    if (s == e)
+	return;			/* an empty buffer produces no output at all */
+    /* Print [s, e) as ' name "..."'; only plain printable bytes go unescaped. */
+    printf(" %s \"", name);
+    for (const char *p = s; p < e; p++) {
+	if (isprint((unsigned char)*p) && *p != '\\' && *p != '"')
+	    printf("%c", *p);
+	else if (*p == '\n')
+	    printf("\\n");
+	else if (*p == '\t')
+	    printf("\\t");
+	else			/* cast: a negative char would print as \xffffffXX */
+	    printf("\\x%02x", (unsigned char)*p);
+    }
+    printf("\"");
+}
+
+static token_type
+lexi_end(token_type code)
+{
+    printf("in line %d, lexi returns '%s'", line_no, token_type_name(code));
+    print_buf("token", s_token, e_token);
+    print_buf("label", s_lab, e_lab);
+    print_buf("code", s_code, e_code);
+    print_buf("comment", s_com, e_com);
+    printf("\n");
+    /* Debug trace: the token type and 4 buffers were logged; code is unchanged. */
+    return code;
+}
+#else
+#  define lexi_end(tk) (tk)
+#endif
+
 token_type
 lexi(struct parser_state *state)
 {
@@ -250,7 +307,7 @@ lexi(struct parser_state *state)
 
 	if (s_token[0] == 'L' && s_token[1] == '\0' &&
 	      (*buf_ptr == '"' || *buf_ptr == '\''))
-	    return (strpfx);
+	    return lexi_end(strpfx);
 
 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
 	    if (++buf_ptr >= buf_end)
@@ -262,7 +319,7 @@ lexi(struct parser_state *state)
 				 * in parentheses, then this token
 				 * should be treated as a declaration */
 	    state->last_u_d = true;
-	    return (decl);
+	    return lexi_end(decl);
 	}
 	/*
 	 * Operator after identifier is binary unless last token was 'struct'
@@ -291,9 +348,9 @@ lexi(struct parser_state *state)
 	    state->last_u_d = true;
 	    switch (p->rwcode) {
 	    case 7:		/* it is a switch */
-		return (swstmt);
+		return lexi_end(swstmt);
 	    case 8:		/* a case or default */
-		return (casestmt);
+		return lexi_end(casestmt);
 
 	    case 3:		/* a "struct" */
 		/* FALLTHROUGH */
@@ -308,26 +365,26 @@ lexi(struct parser_state *state)
 		    break;
 		}
 		if (p != NULL && p->rwcode == 3)
-		    return (structure);
+		    return lexi_end(structure);
 		if (state->p_l_follow)
 		    break;
-		return (decl);
+		return lexi_end(decl);
 
 	    case 5:		/* if, while, for */
-		return (sp_paren);
+		return lexi_end(sp_paren);
 
 	    case 6:		/* do, else */
-		return (sp_nparen);
+		return lexi_end(sp_nparen);
 
 	    case 10:		/* storage class specifier */
-		return (storage);
+		return lexi_end(storage);
 
 	    case 11:		/* typedef */
-		return (type_def);
+		return lexi_end(type_def);
 
 	    default:		/* all others are treated like any other
 				 * identifier */
-		return (ident);
+		return lexi_end(ident);
 	    }			/* end of switch */
 	}			/* end of if (found_it) */
 	if (*buf_ptr == '(' && state->tos <= 1 && state->ind_level == 0 &&
@@ -339,7 +396,7 @@ lexi(struct parser_state *state)
 	    strncpy(state->procname, token, sizeof state->procname - 1);
 	    if (state->in_decl)
 		state->in_parameter_declaration = 1;
-	    return (funcname);
+	    return lexi_end(funcname);
     not_proc:;
 	}
 	/*
@@ -355,12 +412,12 @@ lexi(struct parser_state *state)
 		state->last_token == rbrace)) {
 	    state->keyword = 4;	/* a type name */
 	    state->last_u_d = true;
-	    return decl;
+	    return lexi_end(decl);
 	}
 	if (state->last_token == decl)	/* if this is a declared variable,
 					 * then following sign is unary */
 	    state->last_u_d = true;	/* will make "int a -1" work */
-	return (ident);		/* the ident is not in the list */
+	return lexi_end(ident);		/* the ident is not in the list */
     }				/* end of procesing for alpanum character */
 
     /* Scan a non-alphanumeric token */
@@ -594,7 +651,7 @@ stop_lit:
     state->last_u_d = unary_delim;
     CHECK_SIZE_TOKEN(1);
     *e_token = '\0';		/* null terminate the token */
-    return (code);
+    return lexi_end(code);
 }
 
 /* Initialize constant transition table */

Index: src/usr.bin/indent/parse.c
diff -u src/usr.bin/indent/parse.c:1.13 src/usr.bin/indent/parse.c:1.14
--- src/usr.bin/indent/parse.c:1.13	Sun Mar  7 10:56:18 2021
+++ src/usr.bin/indent/parse.c	Sun Mar  7 11:32:05 2021
@@ -1,4 +1,4 @@
-/*	$NetBSD: parse.c,v 1.13 2021/03/07 10:56:18 rillig Exp $	*/
+/*	$NetBSD: parse.c,v 1.14 2021/03/07 11:32:05 rillig Exp $	*/
 
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
@@ -65,7 +65,7 @@ parse(token_type tk) /* tk: the code for
     int         i;
 
 #ifdef debug
-    printf("%2d - %s\n", tk, token);
+    printf("parse token: '%s' \"%s\"\n", token_type_name(tk), token);
 #endif
 
     while (ps.p_stack[ps.tos] == ifhead && tk != elselit) {
@@ -223,12 +223,13 @@ parse(token_type tk) /* tk: the code for
     reduce();			/* see if any reduction can be done */
 
 #ifdef debug
+    printf("parse stack:");
     for (i = 1; i <= ps.tos; ++i)
-	printf("(%d %d)", ps.p_stack[i], ps.il[i]);
+	printf(" ('%s' at %d)", token_type_name(ps.p_stack[i]), ps.il[i]);
+    if (ps.tos == 0)
+        printf(" empty");
     printf("\n");
 #endif
-
-    return;
 }
 
 /*

Reply via email to