https://github.com/python/cpython/commit/4bced29a74c4dbcf310a47e8202292aaa39b617b
commit: 4bced29a74c4dbcf310a47e8202292aaa39b617b
branch: main
author: Petr Viktorin <encu...@gmail.com>
committer: encukou <encu...@gmail.com>
date: 2025-03-19T16:42:11+01:00
summary:
gh-130587: Add hand-written docs for non-OP tokens (GH-130588)

Co-authored-by: Blaise Pabon <bla...@gmail.com>
Co-authored-by: Adam Turner <9087854+aa-tur...@users.noreply.github.com>
Co-authored-by: Lysandros Nikolaou <lisandros...@gmail.com>

files:
M Doc/library/token-list.inc
M Doc/library/token.rst
M Doc/reference/toplevel_components.rst
M Tools/build/generate_token.py

diff --git a/Doc/library/token-list.inc b/Doc/library/token-list.inc
index 39df2927a0b7f2..655758c4a400cc 100644
--- a/Doc/library/token-list.inc
+++ b/Doc/library/token-list.inc
@@ -1,230 +1,104 @@
 .. Auto-generated by Tools/build/generate_token.py
-.. data:: ENDMARKER
-.. data:: NAME
-
-.. data:: NUMBER
-
-.. data:: STRING
-
-.. data:: NEWLINE
-
-.. data:: INDENT
-
-.. data:: DEDENT
-
-.. data:: LPAR
-
-   Token value for ``"("``.
-
-.. data:: RPAR
-
-   Token value for ``")"``.
-
-.. data:: LSQB
-
-   Token value for ``"["``.
-
-.. data:: RSQB
-
-   Token value for ``"]"``.
-
-.. data:: COLON
-
-   Token value for ``":"``.
-
-.. data:: COMMA
-
-   Token value for ``","``.
-
-.. data:: SEMI
-
-   Token value for ``";"``.
-
-.. data:: PLUS
-
-   Token value for ``"+"``.
-
-.. data:: MINUS
-
-   Token value for ``"-"``.
-
-.. data:: STAR
-
-   Token value for ``"*"``.
-
-.. data:: SLASH
-
-   Token value for ``"/"``.
-
-.. data:: VBAR
-
-   Token value for ``"|"``.
-
-.. data:: AMPER
-
-   Token value for ``"&"``.
-
-.. data:: LESS
-
-   Token value for ``"<"``.
-
-.. data:: GREATER
-
-   Token value for ``">"``.
-
-.. data:: EQUAL
-
-   Token value for ``"="``.
-
-.. data:: DOT
-
-   Token value for ``"."``.
-
-.. data:: PERCENT
-
-   Token value for ``"%"``.
-
-.. data:: LBRACE
-
-   Token value for ``"{"``.
-
-.. data:: RBRACE
-
-   Token value for ``"}"``.
-
-.. data:: EQEQUAL
-
-   Token value for ``"=="``.
-
-.. data:: NOTEQUAL
-
-   Token value for ``"!="``.
-
-.. data:: LESSEQUAL
-
-   Token value for ``"<="``.
-
-.. data:: GREATEREQUAL
-
-   Token value for ``">="``.
-
-.. data:: TILDE
-
-   Token value for ``"~"``.
-
-.. data:: CIRCUMFLEX
-
-   Token value for ``"^"``.
-
-.. data:: LEFTSHIFT
-
-   Token value for ``"<<"``.
-
-.. data:: RIGHTSHIFT
-
-   Token value for ``">>"``.
-
-.. data:: DOUBLESTAR
-
-   Token value for ``"**"``.
-
-.. data:: PLUSEQUAL
-
-   Token value for ``"+="``.
-
-.. data:: MINEQUAL
-
-   Token value for ``"-="``.
-
-.. data:: STAREQUAL
-
-   Token value for ``"*="``.
-
-.. data:: SLASHEQUAL
-
-   Token value for ``"/="``.
-
-.. data:: PERCENTEQUAL
-
-   Token value for ``"%="``.
-
-.. data:: AMPEREQUAL
-
-   Token value for ``"&="``.
-
-.. data:: VBAREQUAL
-
-   Token value for ``"|="``.
-
-.. data:: CIRCUMFLEXEQUAL
-
-   Token value for ``"^="``.
-
-.. data:: LEFTSHIFTEQUAL
-
-   Token value for ``"<<="``.
-
-.. data:: RIGHTSHIFTEQUAL
-
-   Token value for ``">>="``.
-
-.. data:: DOUBLESTAREQUAL
-
-   Token value for ``"**="``.
-
-.. data:: DOUBLESLASH
-
-   Token value for ``"//"``.
-
-.. data:: DOUBLESLASHEQUAL
-
-   Token value for ``"//="``.
-
-.. data:: AT
-
-   Token value for ``"@"``.
-
-.. data:: ATEQUAL
-
-   Token value for ``"@="``.
-
-.. data:: RARROW
-
-   Token value for ``"->"``.
-
-.. data:: ELLIPSIS
-
-   Token value for ``"..."``.
-
-.. data:: COLONEQUAL
-
-   Token value for ``":="``.
-
-.. data:: EXCLAMATION
-
-   Token value for ``"!"``.
-
-.. data:: OP
-
-.. data:: TYPE_IGNORE
-
-.. data:: TYPE_COMMENT
-
-.. data:: SOFT_KEYWORD
-
-.. data:: FSTRING_START
-
-.. data:: FSTRING_MIDDLE
-
-.. data:: FSTRING_END
-
-.. data:: COMMENT
-
-.. data:: NL
-
-.. data:: ERRORTOKEN
-
-.. data:: N_TOKENS
-
-.. data:: NT_OFFSET
+
+.. list-table::
+   :align: left
+   :header-rows: 1
+
+   * - Token
+     - Value
+   * - .. data:: LPAR
+     - ``"("``
+   * - .. data:: RPAR
+     - ``")"``
+   * - .. data:: LSQB
+     - ``"["``
+   * - .. data:: RSQB
+     - ``"]"``
+   * - .. data:: COLON
+     - ``":"``
+   * - .. data:: COMMA
+     - ``","``
+   * - .. data:: SEMI
+     - ``";"``
+   * - .. data:: PLUS
+     - ``"+"``
+   * - .. data:: MINUS
+     - ``"-"``
+   * - .. data:: STAR
+     - ``"*"``
+   * - .. data:: SLASH
+     - ``"/"``
+   * - .. data:: VBAR
+     - ``"|"``
+   * - .. data:: AMPER
+     - ``"&"``
+   * - .. data:: LESS
+     - ``"<"``
+   * - .. data:: GREATER
+     - ``">"``
+   * - .. data:: EQUAL
+     - ``"="``
+   * - .. data:: DOT
+     - ``"."``
+   * - .. data:: PERCENT
+     - ``"%"``
+   * - .. data:: LBRACE
+     - ``"{"``
+   * - .. data:: RBRACE
+     - ``"}"``
+   * - .. data:: EQEQUAL
+     - ``"=="``
+   * - .. data:: NOTEQUAL
+     - ``"!="``
+   * - .. data:: LESSEQUAL
+     - ``"<="``
+   * - .. data:: GREATEREQUAL
+     - ``">="``
+   * - .. data:: TILDE
+     - ``"~"``
+   * - .. data:: CIRCUMFLEX
+     - ``"^"``
+   * - .. data:: LEFTSHIFT
+     - ``"<<"``
+   * - .. data:: RIGHTSHIFT
+     - ``">>"``
+   * - .. data:: DOUBLESTAR
+     - ``"**"``
+   * - .. data:: PLUSEQUAL
+     - ``"+="``
+   * - .. data:: MINEQUAL
+     - ``"-="``
+   * - .. data:: STAREQUAL
+     - ``"*="``
+   * - .. data:: SLASHEQUAL
+     - ``"/="``
+   * - .. data:: PERCENTEQUAL
+     - ``"%="``
+   * - .. data:: AMPEREQUAL
+     - ``"&="``
+   * - .. data:: VBAREQUAL
+     - ``"|="``
+   * - .. data:: CIRCUMFLEXEQUAL
+     - ``"^="``
+   * - .. data:: LEFTSHIFTEQUAL
+     - ``"<<="``
+   * - .. data:: RIGHTSHIFTEQUAL
+     - ``">>="``
+   * - .. data:: DOUBLESTAREQUAL
+     - ``"**="``
+   * - .. data:: DOUBLESLASH
+     - ``"//"``
+   * - .. data:: DOUBLESLASHEQUAL
+     - ``"//="``
+   * - .. data:: AT
+     - ``"@"``
+   * - .. data:: ATEQUAL
+     - ``"@="``
+   * - .. data:: RARROW
+     - ``"->"``
+   * - .. data:: ELLIPSIS
+     - ``"..."``
+   * - .. data:: COLONEQUAL
+     - ``":="``
+   * - .. data:: EXCLAMATION
+     - ``"!"``
diff --git a/Doc/library/token.rst b/Doc/library/token.rst
index 40982f32b4beee..24455b1ef77893 100644
--- a/Doc/library/token.rst
+++ b/Doc/library/token.rst
@@ -19,6 +19,10 @@ change between Python versions.
 The module also provides a mapping from numeric codes to names and some
 functions. The functions mirror definitions in the Python C header files.

+Note that a token's value may depend on tokenizer options. For example, a
+``"+"`` token may be reported as either :data:`PLUS` or :data:`OP`, or
+a ``"match"`` token may be either :data:`NAME` or :data:`SOFT_KEYWORD`.
+

 .. data:: tok_name

@@ -44,25 +48,93 @@ functions. The functions mirror definitions in the Python C header files.

 The token constants are:

-.. include:: token-list.inc
+.. data:: NAME
+
+   Token value that indicates an :ref:`identifier <identifiers>`.
+   Note that keywords are also initially tokenized as ``NAME`` tokens.
+
+.. data:: NUMBER
+
+   Token value that indicates a :ref:`numeric literal <numbers>`.
+
+.. data:: STRING
+
+   Token value that indicates a :ref:`string or byte literal <strings>`,
+   excluding :ref:`formatted string literals <f-strings>`.
+   The token string is not interpreted:
+   it includes the surrounding quotation marks and the prefix (if given);
+   backslashes are included literally, without processing escape sequences.

-The following token type values aren't used by the C tokenizer but are needed for
-the :mod:`tokenize` module.
+.. data:: OP
+
+   A generic token value that indicates an
+   :ref:`operator <operators>` or :ref:`delimiter <delimiters>`.
+
+   .. impl-detail::
+
+      This value is only reported by the :mod:`tokenize` module.
+      Internally, the tokenizer uses
+      :ref:`exact token types <token_operators_delimiters>` instead.

 .. data:: COMMENT
-   :noindex:

    Token value used to indicate a comment.
+   The parser ignores :data:`!COMMENT` tokens.
+
+.. data:: NEWLINE
+
+   Token value that indicates the end of a :ref:`logical line <logical-lines>`.

 .. data:: NL
-   :noindex:

-   Token value used to indicate a non-terminating newline. The
-   :data:`NEWLINE` token indicates the end of a logical line of Python code;
-   ``NL`` tokens are generated when a logical line of code is continued over
-   multiple physical lines.
+   Token value used to indicate a non-terminating newline.
+   :data:`!NL` tokens are generated when a logical line of code is continued
+   over multiple physical lines.
+   The parser ignores :data:`!NL` tokens.
+
+.. data:: INDENT
+
+   Token value used at the beginning of a :ref:`logical line <logical-lines>`
+   to indicate the start of an :ref:`indented block <indentation>`.
+
+.. data:: DEDENT
+
+   Token value used at the beginning of a :ref:`logical line <logical-lines>`
+   to indicate the end of an :ref:`indented block <indentation>`.
+
+.. data:: FSTRING_START
+
+   Token value used to indicate the beginning of an
+   :ref:`f-string literal <f-strings>`.
+
+   .. impl-detail::
+
+      The token string includes the prefix and the opening quote(s), but none
+      of the contents of the literal.
+
+.. data:: FSTRING_MIDDLE
+
+   Token value used for literal text inside an :ref:`f-string literal <f-strings>`,
+   including format specifications.
+
+   .. impl-detail::
+
+      Replacement fields (that is, the non-literal parts of f-strings) use
+      the same tokens as other expressions, and are delimited by
+      :data:`LBRACE`, :data:`RBRACE`, :data:`EXCLAMATION` and :data:`COLON`
+      tokens.
+
+.. data:: FSTRING_END
+
+   Token value used to indicate the end of an :ref:`f-string <f-strings>`.
+
+   .. impl-detail::
+
+      The token string contains the closing quote(s).
+
+.. data:: ENDMARKER
+
+   Token value that indicates the end of input.
+   Used in :ref:`top-level grammar rules <top-level>`.

 .. data:: ENCODING
@@ -70,14 +142,63 @@ the :mod:`tokenize` module.
    into text.
    The first token returned by :func:`tokenize.tokenize` will always be an
    ``ENCODING`` token.

+   .. impl-detail::
+
+      This token type isn't used by the C tokenizer but is needed for
+      the :mod:`tokenize` module.
+
+
+The following token types are not produced by the :mod:`tokenize` module,
+and are defined for special uses in the tokenizer or parser:
+
+.. data:: TYPE_IGNORE
+
+   Token value indicating that a ``type: ignore`` comment was recognized.
+   Such tokens are produced instead of regular :data:`COMMENT` tokens only
+   with the :data:`~ast.PyCF_TYPE_COMMENTS` flag.

 .. data:: TYPE_COMMENT
-   :noindex:

-   Token value indicating that a type comment was recognized. Such
-   tokens are only produced when :func:`ast.parse` is invoked with
-   ``type_comments=True``.
+   Token value indicating that a type comment was recognized.
+   Such tokens are produced instead of regular :data:`COMMENT` tokens only
+   with the :data:`~ast.PyCF_TYPE_COMMENTS` flag.
+
+.. data:: SOFT_KEYWORD
+
+   Token value indicating a :ref:`soft keyword <soft-keywords>`.
+
+   The tokenizer never produces this value.
+   To check for a soft keyword, pass a :data:`NAME` token's string to
+   :func:`keyword.issoftkeyword`.
+
+.. data:: ERRORTOKEN
+
+   Token value used to indicate wrong input.
+
+   The :mod:`tokenize` module generally indicates errors by
+   raising exceptions instead of emitting this token.
+   It can also emit tokens such as :data:`OP` or :data:`NAME` with strings that
+   are later rejected by the parser.
+
+
+.. _token_operators_delimiters:
+
+The remaining tokens represent specific :ref:`operators <operators>` and
+:ref:`delimiters <delimiters>`.
+(The :mod:`tokenize` module reports these as :data:`OP`; see ``exact_type``
+in the :mod:`tokenize` documentation for details.)
+
+.. include:: token-list.inc
+
+
+The following non-token constants are provided:
+
+.. data:: N_TOKENS
+
+   The number of token types defined in this module.
+
+.. NT_OFFSET is deliberately undocumented; if you need it you should be
+   reading the source

 .. data:: EXACT_TOKEN_TYPES
@@ -102,6 +223,9 @@ the :mod:`tokenize` module.
    to support parsing older Python versions for :func:`ast.parse` with
    ``feature_version`` set to 6 or lower).

+.. versionchanged:: 3.12
+   Added :data:`EXCLAMATION`.
+
 .. versionchanged:: 3.13
    Removed :data:`!AWAIT` and :data:`!ASYNC` tokens again.

diff --git a/Doc/reference/toplevel_components.rst b/Doc/reference/toplevel_components.rst
index f155fafbe4d738..bd64b1c08bd1ff 100644
--- a/Doc/reference/toplevel_components.rst
+++ b/Doc/reference/toplevel_components.rst
@@ -69,7 +69,7 @@ All input read from non-interactive files has the same form:
 .. grammar-snippet::
    :group: python-grammar

-   file_input: (NEWLINE | `statement`)*
+   file_input: (NEWLINE | `statement`)* ENDMARKER

 This syntax is used in the following situations:

@@ -90,7 +90,7 @@ Input in interactive mode is parsed using the following grammar:
 .. grammar-snippet::
    :group: python-grammar

-   interactive_input: [`stmt_list`] NEWLINE | `compound_stmt` NEWLINE
+   interactive_input: [`stmt_list`] NEWLINE | `compound_stmt` NEWLINE | ENDMARKER

 Note that a (top-level) compound statement must be followed by a blank line in
 interactive mode; this is needed to help the parser detect the end of the input.

@@ -107,5 +107,7 @@ Expression input
 :func:`eval` is used for expression input. It ignores leading whitespace. The
 string argument to :func:`eval` must have the following form:

-.. productionlist:: python-grammar
-   eval_input: `expression_list` NEWLINE*
+.. grammar-snippet::
+   :group: python-grammar
+
+   eval_input: `expression_list` NEWLINE* ENDMARKER
diff --git a/Tools/build/generate_token.py b/Tools/build/generate_token.py
index d32747f19945d8..a5f9828c466eda 100755
--- a/Tools/build/generate_token.py
+++ b/Tools/build/generate_token.py
@@ -1,10 +1,17 @@
 #! /usr/bin/env python3
 # This script generates token related files from Grammar/Tokens:
 #
-#     Doc/library/token-list.inc
-#     Include/token.h
-#     Parser/token.c
-#     Lib/token.py
+# make_rst:
+#     Doc/library/token-list.inc
+#     Doc/library/token.rst      (checked, not generated)
+# make_h:
+#     Include/token.h
+# make_c:
+#     Parser/token.c
+# make_py:
+#     Lib/token.py
+
+import re

 SCRIPT_NAME = 'Tools/build/generate_token.py'
@@ -199,23 +206,51 @@ def make_c(infile, outfile='Parser/token.c'):
 token_inc_template = f"""\
 .. {AUTO_GENERATED_BY_SCRIPT}
-%s
-.. data:: N_TOKENS
-.. data:: NT_OFFSET
+
+.. list-table::
+   :align: left
+   :header-rows: 1
+
+   * - Token
+     - Value
+%s
 """

-def make_rst(infile, outfile='Doc/library/token-list.inc'):
+def make_rst(infile, outfile='Doc/library/token-list.inc',
+             rstfile='Doc/library/token.rst'):
     tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
     tok_to_string = {value: s for s, value in string_to_tok.items()}

+    needs_handwritten_doc = set()
+
     names = []
-    for value, name in enumerate(tok_names[:ERRORTOKEN + 1]):
-        names.append('.. data:: %s' % (name,))
+    for value, name in enumerate(tok_names):
         if value in tok_to_string:
-            names.append('')
-            names.append('   Token value for ``"%s"``.' % tok_to_string[value])
-            names.append('')
+            assert name.isupper()
+            names.append(f'   * - .. data:: {name}')
+            names.append(f'     - ``"{tok_to_string[value]}"``')
+        else:
+            needs_handwritten_doc.add(name)
+
+    has_handwritten_doc = set()
+    with open(rstfile) as fileobj:
+        tokendef_re = re.compile(r'.. data:: ([0-9A-Z_]+)\s*')
+        for line in fileobj:
+            if match := tokendef_re.fullmatch(line):
+                has_handwritten_doc.add(match[1])
+
+    # Exclude non-token constants in token.py
+    has_handwritten_doc -= {'N_TOKENS', 'NT_OFFSET', 'EXACT_TOKEN_TYPES'}
+
+    if needs_handwritten_doc != has_handwritten_doc:
+        message_parts = [f'ERROR: {rstfile} does not document all tokens!']
+        undocumented = needs_handwritten_doc - has_handwritten_doc
+        extra = has_handwritten_doc - needs_handwritten_doc
+        if undocumented:
+            message_parts.append(f'Undocumented tokens: {undocumented}')
+        if extra:
+            message_parts.append(f'Documented nonexistent tokens: {extra}')
+        exit('\n'.join(message_parts))

     if update_file(outfile, token_inc_template % '\n'.join(names)):
         print("%s regenerated from %s" % (outfile, infile))
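
A quick illustration of the behavior the new docs describe, as a minimal
sketch using only the standard library (the sample line "match = 1 + 2" is
an arbitrary choice): the tokenize module reports every operator with the
generic type OP and exposes the specific operator as exact_type, and a soft
keyword arrives as a plain NAME token whose string can be tested with
keyword.issoftkeyword().

    import io
    import keyword
    import token
    import tokenize

    # tokenize.tokenize() expects a readline callable over a bytes stream.
    source = b"match = 1 + 2\n"
    for tok in tokenize.tokenize(io.BytesIO(source).readline):
        # .type is the generic token type (OP for all operators/delimiters);
        # .exact_type resolves an OP token to PLUS, EQUAL, and so on.
        print(token.tok_name[tok.type], token.tok_name[tok.exact_type],
              repr(tok.string))

    # SOFT_KEYWORD is never produced by the tokenizer: "match" comes back
    # as a plain NAME token, so check its string instead.
    print(keyword.issoftkeyword("match"))  # -> True

Here the "+" token prints as OP PLUS while identifiers such as "match"
print NAME NAME, matching the OP / exact-type split documented above.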