https://github.com/python/cpython/commit/374abded070b861cc389d509937344073193c36a
commit: 374abded070b861cc389d509937344073193c36a
branch: main
author: Tomas R. <[email protected]>
committer: serhiy-storchaka <[email protected]>
date: 2025-02-11T13:51:42+02:00
summary:
gh-104400: pygettext: use an AST parser instead of a tokenizer (GH-104402)
This greatly simplifies the code and fixes many corner cases.
files:
A Misc/NEWS.d/next/Tools-Demos/2023-05-11-23-32-25.gh-issue-104400.23vxm7.rst
M Lib/test/test_tools/i18n_data/docstrings.pot
M Lib/test/test_tools/i18n_data/docstrings.py
M Lib/test/test_tools/i18n_data/messages.pot
M Lib/test/test_tools/i18n_data/messages.py
M Lib/test/test_tools/test_i18n.py
M Tools/i18n/pygettext.py
diff --git a/Lib/test/test_tools/i18n_data/docstrings.pot
b/Lib/test/test_tools/i18n_data/docstrings.pot
index 5af1d41422ff62..387db2413a575f 100644
--- a/Lib/test/test_tools/i18n_data/docstrings.pot
+++ b/Lib/test/test_tools/i18n_data/docstrings.pot
@@ -15,26 +15,40 @@ msgstr ""
"Generated-By: pygettext.py 1.5\n"
-#: docstrings.py:7
+#: docstrings.py:1
+#, docstring
+msgid "Module docstring"
+msgstr ""
+
+#: docstrings.py:9
#, docstring
msgid ""
msgstr ""
-#: docstrings.py:18
+#: docstrings.py:15
+#, docstring
+msgid "docstring"
+msgstr ""
+
+#: docstrings.py:20
#, docstring
msgid ""
"multiline\n"
-" docstring\n"
-" "
+"docstring"
msgstr ""
-#: docstrings.py:25
+#: docstrings.py:27
#, docstring
msgid "docstring1"
msgstr ""
-#: docstrings.py:30
+#: docstrings.py:38
+#, docstring
+msgid "nested docstring"
+msgstr ""
+
+#: docstrings.py:43
#, docstring
-msgid "Hello, {}!"
+msgid "nested class docstring"
msgstr ""
diff --git a/Lib/test/test_tools/i18n_data/docstrings.py
b/Lib/test/test_tools/i18n_data/docstrings.py
index 85d7f159d37775..151a55a4b56ba6 100644
--- a/Lib/test/test_tools/i18n_data/docstrings.py
+++ b/Lib/test/test_tools/i18n_data/docstrings.py
@@ -1,3 +1,5 @@
+"""Module docstring"""
+
# Test docstring extraction
from gettext import gettext as _
@@ -10,10 +12,10 @@ def test(x):
# Leading empty line
def test2(x):
- """docstring""" # XXX This should be extracted but isn't.
+ """docstring"""
-# XXX Multiline docstrings should be cleaned with `inspect.cleandoc`.
+# Multiline docstrings are cleaned with `inspect.cleandoc`.
def test3(x):
"""multiline
docstring
@@ -27,15 +29,15 @@ def test4(x):
def test5(x):
- """Hello, {}!""".format("world!") # XXX This should not be extracted.
+ """Hello, {}!""".format("world!") # This should not be extracted.
# Nested docstrings
def test6(x):
def inner(y):
- """nested docstring""" # XXX This should be extracted but isn't.
+ """nested docstring"""
class Outer:
class Inner:
- "nested class docstring" # XXX This should be extracted but isn't.
+ "nested class docstring"
diff --git a/Lib/test/test_tools/i18n_data/messages.pot
b/Lib/test/test_tools/i18n_data/messages.pot
index 8d66fbc4f3a937..e8167acfc0742b 100644
--- a/Lib/test/test_tools/i18n_data/messages.pot
+++ b/Lib/test/test_tools/i18n_data/messages.pot
@@ -19,22 +19,22 @@ msgstr ""
msgid ""
msgstr ""
-#: messages.py:19 messages.py:20
+#: messages.py:19 messages.py:20 messages.py:21
msgid "parentheses"
msgstr ""
-#: messages.py:23
+#: messages.py:24
msgid "Hello, world!"
msgstr ""
-#: messages.py:26
+#: messages.py:27
msgid ""
"Hello,\n"
" multiline!\n"
msgstr ""
#: messages.py:46 messages.py:89 messages.py:90 messages.py:93 messages.py:94
-#: messages.py:99
+#: messages.py:99 messages.py:100 messages.py:101
msgid "foo"
msgid_plural "foos"
msgstr[0] ""
@@ -68,7 +68,7 @@ msgstr ""
msgid "set"
msgstr ""
-#: messages.py:63
+#: messages.py:62 messages.py:63
msgid "nested string"
msgstr ""
@@ -76,6 +76,10 @@ msgstr ""
msgid "baz"
msgstr ""
+#: messages.py:71 messages.py:75
+msgid "default value"
+msgstr ""
+
#: messages.py:91 messages.py:92 messages.py:95 messages.py:96
msgctxt "context"
msgid "foo"
@@ -83,7 +87,13 @@ msgid_plural "foos"
msgstr[0] ""
msgstr[1] ""
-#: messages.py:100
+#: messages.py:102
msgid "domain foo"
msgstr ""
+#: messages.py:118 messages.py:119
+msgid "world"
+msgid_plural "worlds"
+msgstr[0] ""
+msgstr[1] ""
+
diff --git a/Lib/test/test_tools/i18n_data/messages.py
b/Lib/test/test_tools/i18n_data/messages.py
index 1e03f4e556830d..9457bcb8611020 100644
--- a/Lib/test/test_tools/i18n_data/messages.py
+++ b/Lib/test/test_tools/i18n_data/messages.py
@@ -18,6 +18,7 @@
# Extra parentheses
(_("parentheses"))
((_("parentheses")))
+_(("parentheses"))
# Multiline strings
_("Hello, "
@@ -32,7 +33,6 @@
_(None)
_(1)
_(False)
-_(("invalid"))
_(["invalid"])
_({"invalid"})
_("string"[3])
@@ -40,7 +40,7 @@
_({"string": "foo"})
# pygettext does not allow keyword arguments, but both xgettext and pybabel do
-_(x="kwargs work!")
+_(x="kwargs are not allowed!")
# Unusual, but valid arguments
_("foo", "bar")
@@ -48,7 +48,7 @@
# .format()
_("Hello, {}!").format("world") # valid
-_("Hello, {}!".format("world")) # invalid, but xgettext and pybabel extract
the first string
+_("Hello, {}!".format("world")) # invalid, but xgettext extracts the first
string
# Nested structures
_("1"), _("2")
@@ -59,7 +59,7 @@
# Nested functions and classes
def test():
- _("nested string") # XXX This should be extracted but isn't.
+ _("nested string")
[_("nested string")]
@@ -68,11 +68,11 @@ def bar(self):
return _("baz")
-def bar(x=_('default value')): # XXX This should be extracted but isn't.
+def bar(x=_('default value')):
pass
-def baz(x=[_('default value')]): # XXX This should be extracted but isn't.
+def baz(x=[_('default value')]):
pass
@@ -97,6 +97,8 @@ def _(x="don't extract me"):
# Complex arguments
ngettext("foo", "foos", 42 + (10 - 20))
+ngettext("foo", "foos", *args)
+ngettext("foo", "foos", **kwargs)
dgettext(["some", {"complex"}, ("argument",)], "domain foo")
# Invalid calls which are not extracted
@@ -108,3 +110,10 @@ def _(x="don't extract me"):
dngettext('domain', 'foo')
dpgettext('domain', 'context')
dnpgettext('domain', 'context', 'foo')
+dgettext(*args, 'foo')
+dpgettext(*args, 'context', 'foo')
+dnpgettext(*args, 'context', 'foo', 'foos')
+
+# f-strings
+f"Hello, {_('world')}!"
+f"Hello, {ngettext('world', 'worlds', 3)}!"
diff --git a/Lib/test/test_tools/test_i18n.py b/Lib/test/test_tools/test_i18n.py
index 29c3423e234d20..d23479104d438d 100644
--- a/Lib/test/test_tools/test_i18n.py
+++ b/Lib/test/test_tools/test_i18n.py
@@ -87,7 +87,7 @@ def assert_POT_equal(self, expected, actual):
self.maxDiff = None
self.assertEqual(normalize_POT_file(expected),
normalize_POT_file(actual))
- def extract_from_str(self, module_content, *, args=(), strict=True):
+ def extract_from_str(self, module_content, *, args=(), strict=True,
with_stderr=False):
"""Return all msgids extracted from module_content."""
filename = 'test.py'
with temp_cwd(None):
@@ -98,12 +98,18 @@ def extract_from_str(self, module_content, *, args=(),
strict=True):
self.assertEqual(res.err, b'')
with open('messages.pot', encoding='utf-8') as fp:
data = fp.read()
- return self.get_msgids(data)
+ msgids = self.get_msgids(data)
+ if not with_stderr:
+ return msgids
+ return msgids, res.err
def extract_docstrings_from_str(self, module_content):
"""Return all docstrings extracted from module_content."""
return self.extract_from_str(module_content, args=('--docstrings',),
strict=False)
+ def get_stderr(self, module_content):
+ return self.extract_from_str(module_content, strict=False,
with_stderr=True)[1]
+
def test_header(self):
"""Make sure the required fields are in the header, according to:
http://www.gnu.org/software/gettext/manual/gettext.html#Header-Entry
@@ -407,6 +413,24 @@ def test_files_list(self):
self.assertIn(f'msgid "{text2}"', data)
self.assertNotIn(text3, data)
+ def test_error_messages(self):
+ """Test that pygettext outputs error messages to stderr."""
+ stderr = self.get_stderr(dedent('''\
+ _(1+2)
+ ngettext('foo')
+ dgettext(*args, 'foo')
+ '''))
+
+ # Normalize line endings on Windows
+ stderr = stderr.decode('utf-8').replace('\r', '')
+
+ self.assertEqual(
+ stderr,
+ "*** test.py:1: Expected a string constant for argument 1, got 1 +
2\n"
+ "*** test.py:2: Expected at least 2 positional argument(s) in
gettext call, got 1\n"
+ "*** test.py:3: Variable positional arguments are not allowed in
gettext calls\n"
+ )
+
def update_POT_snapshots():
for input_file in DATA_DIR.glob('*.py'):
diff --git
a/Misc/NEWS.d/next/Tools-Demos/2023-05-11-23-32-25.gh-issue-104400.23vxm7.rst
b/Misc/NEWS.d/next/Tools-Demos/2023-05-11-23-32-25.gh-issue-104400.23vxm7.rst
new file mode 100644
index 00000000000000..82954e170ec3e2
--- /dev/null
+++
b/Misc/NEWS.d/next/Tools-Demos/2023-05-11-23-32-25.gh-issue-104400.23vxm7.rst
@@ -0,0 +1 @@
+Fix several bugs in extraction by switching to an AST parser in
:program:`pygettext`.
diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py
index d8a0e379ab82cb..4720aecbdc515a 100755
--- a/Tools/i18n/pygettext.py
+++ b/Tools/i18n/pygettext.py
@@ -140,8 +140,6 @@
import os
import sys
import time
-import tokenize
-from collections import defaultdict
from dataclasses import dataclass, field
from operator import itemgetter
@@ -206,15 +204,6 @@ def escape_nonascii(s, encoding):
return ''.join(escapes[b] for b in s.encode(encoding))
-def is_literal_string(s):
- return s[0] in '\'"' or (s[0] in 'rRuU' and s[1] in '\'"')
-
-
-def safe_eval(s):
- # unwrap quotes, safely
- return eval(s, {'__builtins__':{}}, {})
-
-
def normalize(s, encoding):
# This converts the various Python string types into a format that is
# appropriate for .po files, namely much closer to C style.
@@ -296,11 +285,6 @@ def getFilesForName(name):
}
-def matches_spec(message, spec):
- """Check if a message has all the keys defined by the keyword spec."""
- return all(key in message for key in spec.values())
-
-
@dataclass(frozen=True)
class Location:
filename: str
@@ -325,203 +309,91 @@ def add_location(self, filename, lineno,
msgid_plural=None, *, is_docstring=Fals
self.is_docstring |= is_docstring
-class TokenEater:
+class GettextVisitor(ast.NodeVisitor):
def __init__(self, options):
- self.__options = options
- self.__messages = {}
- self.__state = self.__waiting
- self.__data = defaultdict(str)
- self.__curr_arg = 0
- self.__curr_keyword = None
- self.__lineno = -1
- self.__freshmodule = 1
- self.__curfile = None
- self.__enclosurecount = 0
-
- def __call__(self, ttype, tstring, stup, etup, line):
- # dispatch
-## import token
-## print('ttype:', token.tok_name[ttype], 'tstring:', tstring,
-## file=sys.stderr)
- self.__state(ttype, tstring, stup[0])
-
- @property
- def messages(self):
- return self.__messages
-
- def __waiting(self, ttype, tstring, lineno):
- opts = self.__options
- # Do docstring extractions, if enabled
- if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
- # module docstring?
- if self.__freshmodule:
- if ttype == tokenize.STRING and is_literal_string(tstring):
- self.__addentry({'msgid': safe_eval(tstring)}, lineno,
is_docstring=True)
- self.__freshmodule = 0
- return
- if ttype in (tokenize.COMMENT, tokenize.NL, tokenize.ENCODING):
- return
- self.__freshmodule = 0
- # class or func/method docstring?
- if ttype == tokenize.NAME and tstring in ('class', 'def'):
- self.__state = self.__suiteseen
- return
- if ttype == tokenize.NAME and tstring in ('class', 'def'):
- self.__state = self.__ignorenext
+ super().__init__()
+ self.options = options
+ self.filename = None
+ self.messages = {}
+
+ def visit_file(self, node, filename):
+ self.filename = filename
+ self.visit(node)
+
+ def visit_Module(self, node):
+ self._extract_docstring(node)
+ self.generic_visit(node)
+
+ visit_ClassDef = visit_FunctionDef = visit_AsyncFunctionDef = visit_Module
+
+ def visit_Call(self, node):
+ self._extract_message(node)
+ self.generic_visit(node)
+
+ def _extract_docstring(self, node):
+ if (not self.options.docstrings or
+ self.options.nodocstrings.get(self.filename)):
return
- if ttype == tokenize.NAME and tstring in opts.keywords:
- self.__state = self.__keywordseen
- self.__curr_keyword = tstring
+
+ docstring = ast.get_docstring(node)
+ if docstring is not None:
+ lineno = node.body[0].lineno # The first statement is the
docstring
+ self._add_message(lineno, docstring, is_docstring=True)
+
+ def _extract_message(self, node):
+ func_name = self._get_func_name(node)
+ spec = self.options.keywords.get(func_name)
+ if spec is None:
+ return
+
+ max_index = max(spec)
+ has_var_positional = any(isinstance(arg, ast.Starred) for
+ arg in node.args[:max_index+1])
+ if has_var_positional:
+ print(f'*** {self.filename}:{node.lineno}: Variable positional '
+ f'arguments are not allowed in gettext calls',
file=sys.stderr)
return
- if ttype == tokenize.STRING:
- maybe_fstring = ast.parse(tstring, mode='eval').body
- if not isinstance(maybe_fstring, ast.JoinedStr):
- return
- for value in filter(lambda node: isinstance(node,
ast.FormattedValue),
- maybe_fstring.values):
- for call in filter(lambda node: isinstance(node, ast.Call),
- ast.walk(value)):
- func = call.func
- if isinstance(func, ast.Name):
- func_name = func.id
- elif isinstance(func, ast.Attribute):
- func_name = func.attr
- else:
- continue
-
- if func_name not in opts.keywords:
- continue
- if len(call.args) != 1:
- print((
- '*** %(file)s:%(lineno)s: Seen unexpected amount
of'
- ' positional arguments in gettext call:
%(source_segment)s'
- ) % {
- 'source_segment': ast.get_source_segment(tstring,
call) or tstring,
- 'file': self.__curfile,
- 'lineno': lineno
- }, file=sys.stderr)
- continue
- if call.keywords:
- print((
- '*** %(file)s:%(lineno)s: Seen unexpected keyword
arguments'
- ' in gettext call: %(source_segment)s'
- ) % {
- 'source_segment': ast.get_source_segment(tstring,
call) or tstring,
- 'file': self.__curfile,
- 'lineno': lineno
- }, file=sys.stderr)
- continue
- arg = call.args[0]
- if not isinstance(arg, ast.Constant):
- print((
- '*** %(file)s:%(lineno)s: Seen unexpected argument
type'
- ' in gettext call: %(source_segment)s'
- ) % {
- 'source_segment': ast.get_source_segment(tstring,
call) or tstring,
- 'file': self.__curfile,
- 'lineno': lineno
- }, file=sys.stderr)
- continue
- if isinstance(arg.value, str):
- self.__curr_keyword = func_name
- self.__addentry({'msgid': arg.value}, lineno)
-
- def __suiteseen(self, ttype, tstring, lineno):
- # skip over any enclosure pairs until we see the colon
- if ttype == tokenize.OP:
- if tstring == ':' and self.__enclosurecount == 0:
- # we see a colon and we're not in an enclosure: end of def
- self.__state = self.__suitedocstring
- elif tstring in '([{':
- self.__enclosurecount += 1
- elif tstring in ')]}':
- self.__enclosurecount -= 1
-
- def __suitedocstring(self, ttype, tstring, lineno):
- # ignore any intervening noise
- if ttype == tokenize.STRING and is_literal_string(tstring):
- self.__addentry({'msgid': safe_eval(tstring)}, lineno,
is_docstring=True)
- self.__state = self.__waiting
- elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
- tokenize.COMMENT):
- # there was no class docstring
- self.__state = self.__waiting
-
- def __keywordseen(self, ttype, tstring, lineno):
- if ttype == tokenize.OP and tstring == '(':
- self.__data.clear()
- self.__curr_arg = 0
- self.__enclosurecount = 0
- self.__lineno = lineno
- self.__state = self.__openseen
- else:
- self.__state = self.__waiting
-
- def __openseen(self, ttype, tstring, lineno):
- spec = self.__options.keywords[self.__curr_keyword]
- arg_type = spec.get(self.__curr_arg)
- expect_string_literal = arg_type is not None
-
- if ttype == tokenize.OP and self.__enclosurecount == 0:
- if tstring == ')':
- # We've seen the last of the translatable strings. Record the
- # line number of the first line of the strings and update the
list
- # of messages seen. Reset state for the next batch. If there
- # were no strings inside _(), then just ignore this entry.
- if self.__data:
- self.__addentry(self.__data)
- self.__state = self.__waiting
- return
- elif tstring == ',':
- # Advance to the next argument
- self.__curr_arg += 1
- return
- if expect_string_literal:
- if ttype == tokenize.STRING and is_literal_string(tstring):
- self.__data[arg_type] += safe_eval(tstring)
- elif ttype not in (tokenize.COMMENT, tokenize.INDENT,
tokenize.DEDENT,
- tokenize.NEWLINE, tokenize.NL):
- # We are inside an argument which is a translatable string and
- # we encountered a token that is not a string. This is an
error.
- self.warn_unexpected_token(tstring)
- self.__enclosurecount = 0
- self.__state = self.__waiting
- elif ttype == tokenize.OP:
- if tstring in '([{':
- self.__enclosurecount += 1
- elif tstring in ')]}':
- self.__enclosurecount -= 1
-
- def __ignorenext(self, ttype, tstring, lineno):
- self.__state = self.__waiting
-
- def __addentry(self, msg, lineno=None, *, is_docstring=False):
- msgid = msg.get('msgid')
- if msgid in self.__options.toexclude:
+ if max_index >= len(node.args):
+ print(f'*** {self.filename}:{node.lineno}: Expected at least '
+ f'{max(spec) + 1} positional argument(s) in gettext call, '
+ f'got {len(node.args)}', file=sys.stderr)
return
- if not is_docstring:
- spec = self.__options.keywords[self.__curr_keyword]
- if not matches_spec(msg, spec):
+
+ msg_data = {}
+ for position, arg_type in spec.items():
+ arg = node.args[position]
+ if not self._is_string_const(arg):
+ print(f'*** {self.filename}:{arg.lineno}: Expected a string '
+ f'constant for argument {position + 1}, '
+ f'got {ast.unparse(arg)}', file=sys.stderr)
return
- if lineno is None:
- lineno = self.__lineno
- msgctxt = msg.get('msgctxt')
- msgid_plural = msg.get('msgid_plural')
+ msg_data[arg_type] = arg.value
+
+ lineno = node.lineno
+ self._add_message(lineno, **msg_data)
+
+ def _add_message(
+ self, lineno, msgid, msgid_plural=None, msgctxt=None, *,
+ is_docstring=False):
+ if msgid in self.options.toexclude:
+ return
+
key = self._key_for(msgid, msgctxt)
- if key in self.__messages:
- self.__messages[key].add_location(
- self.__curfile,
+ message = self.messages.get(key)
+ if message:
+ message.add_location(
+ self.filename,
lineno,
msgid_plural,
is_docstring=is_docstring,
)
else:
- self.__messages[key] = Message(
+ self.messages[key] = Message(
msgid=msgid,
msgid_plural=msgid_plural,
msgctxt=msgctxt,
- locations={Location(self.__curfile, lineno)},
+ locations={Location(self.filename, lineno)},
is_docstring=is_docstring,
)
@@ -531,19 +403,17 @@ def _key_for(msgid, msgctxt=None):
return (msgctxt, msgid)
return msgid
- def warn_unexpected_token(self, token):
- print((
- '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
- ) % {
- 'token': token,
- 'file': self.__curfile,
- 'lineno': self.__lineno
- }, file=sys.stderr)
-
- def set_filename(self, filename):
- self.__curfile = filename
- self.__freshmodule = 1
+ def _get_func_name(self, node):
+ match node.func:
+ case ast.Name(id=id):
+ return id
+ case ast.Attribute(attr=attr):
+ return attr
+ case _:
+ return None
+ def _is_string_const(self, node):
+ return isinstance(node, ast.Constant) and isinstance(node.value, str)
def write_pot_file(messages, options, fp):
timestamp = time.strftime('%Y-%m-%d %H:%M%z')
@@ -717,31 +587,24 @@ class Options:
args = expanded
# slurp through all the files
- eater = TokenEater(options)
+ visitor = GettextVisitor(options)
for filename in args:
if filename == '-':
if options.verbose:
print('Reading standard input')
- fp = sys.stdin.buffer
- closep = 0
+ source = sys.stdin.buffer.read()
else:
            if options.verbose:
                print(f'Working on {filename}')
- fp = open(filename, 'rb')
- closep = 1
+ with open(filename, 'rb') as fp:
+ source = fp.read()
+
try:
- eater.set_filename(filename)
- try:
- tokens = tokenize.tokenize(fp.readline)
- for _token in tokens:
- eater(*_token)
- except tokenize.TokenError as e:
- print('%s: %s, line %d, column %d' % (
- e.args[0], filename, e.args[1][0], e.args[1][1]),
- file=sys.stderr)
- finally:
- if closep:
- fp.close()
+ module_tree = ast.parse(source)
+ except SyntaxError:
+ continue
+
+ visitor.visit_file(module_tree, filename)
# write the output
if options.outfile == '-':
@@ -753,7 +616,7 @@ class Options:
fp = open(options.outfile, 'w')
closep = 1
try:
- write_pot_file(eater.messages, options, fp)
+ write_pot_file(visitor.messages, options, fp)
finally:
if closep:
fp.close()
_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: [email protected]