https://github.com/python/cpython/commit/c23bdc2eced311671d77c5ac2cb450fe4eb1b08e commit: c23bdc2eced311671d77c5ac2cb450fe4eb1b08e branch: 3.14 author: Miss Islington (bot) <31488909+miss-isling...@users.noreply.github.com> committer: serhiy-storchaka <storch...@gmail.com> date: 2025-05-20T10:13:40Z summary:
[3.14] gh-133890: Handle UnicodeEncodeError in tarfile (GH-134147) (GH-134195) UnicodeEncodeError is now handled the same way as OSError during TarFile member extraction. (cherry picked from commit 9983c7d4416cac8deb2fded1ec9c7daf786c3a02) Co-authored-by: Serhiy Storchaka <storch...@gmail.com> files: A Misc/NEWS.d/next/Library/2025-05-17-18-08-35.gh-issue-133890.onn9_X.rst M Lib/tarfile.py M Lib/test/test_tarfile.py diff --git a/Lib/tarfile.py b/Lib/tarfile.py index 13889d768021b1..212b71f6509740 100644 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -2439,7 +2439,7 @@ def _get_extract_tarinfo(self, member, filter_function, path): unfiltered = tarinfo try: tarinfo = filter_function(tarinfo, path) - except (OSError, FilterError) as e: + except (OSError, UnicodeEncodeError, FilterError) as e: self._handle_fatal_error(e) except ExtractError as e: self._handle_nonfatal_error(e) @@ -2460,7 +2460,7 @@ def _extract_one(self, tarinfo, path, set_attrs, numeric_owner): self._extract_member(tarinfo, os.path.join(path, tarinfo.name), set_attrs=set_attrs, numeric_owner=numeric_owner) - except OSError as e: + except (OSError, UnicodeEncodeError) as e: self._handle_fatal_error(e) except ExtractError as e: self._handle_nonfatal_error(e) diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py index 2d9649237a9382..2018a20afd1b18 100644 --- a/Lib/test/test_tarfile.py +++ b/Lib/test/test_tarfile.py @@ -3490,11 +3490,12 @@ class ArchiveMaker: with t.open() as tar: ... # `tar` is now a TarFile with 'filename' in it! """ - def __init__(self): + def __init__(self, **kwargs): self.bio = io.BytesIO() + self.tar_kwargs = dict(kwargs) def __enter__(self): - self.tar_w = tarfile.TarFile(mode='w', fileobj=self.bio) + self.tar_w = tarfile.TarFile(mode='w', fileobj=self.bio, **self.tar_kwargs) return self def __exit__(self, *exc): @@ -4073,7 +4074,10 @@ def test_tar_filter(self): # that in the test archive.) with tarfile.TarFile.open(tarname) as tar: for tarinfo in tar.getmembers(): - filtered = tarfile.tar_filter(tarinfo, '') + try: + filtered = tarfile.tar_filter(tarinfo, '') + except UnicodeEncodeError: + continue self.assertIs(filtered.name, tarinfo.name) self.assertIs(filtered.type, tarinfo.type) @@ -4084,11 +4088,48 @@ def test_data_filter(self): for tarinfo in tar.getmembers(): try: filtered = tarfile.data_filter(tarinfo, '') - except tarfile.FilterError: + except (tarfile.FilterError, UnicodeEncodeError): continue self.assertIs(filtered.name, tarinfo.name) self.assertIs(filtered.type, tarinfo.type) + @unittest.skipIf(sys.platform == 'win32', 'requires native bytes paths') + def test_filter_unencodable(self): + # Sanity check using a valid path. + tarinfo = tarfile.TarInfo(os_helper.TESTFN) + filtered = tarfile.tar_filter(tarinfo, '') + self.assertIs(filtered.name, tarinfo.name) + filtered = tarfile.data_filter(tarinfo, '') + self.assertIs(filtered.name, tarinfo.name) + + tarinfo = tarfile.TarInfo('test\x00') + self.assertRaises(ValueError, tarfile.tar_filter, tarinfo, '') + self.assertRaises(ValueError, tarfile.data_filter, tarinfo, '') + tarinfo = tarfile.TarInfo('\ud800') + self.assertRaises(UnicodeEncodeError, tarfile.tar_filter, tarinfo, '') + self.assertRaises(UnicodeEncodeError, tarfile.data_filter, tarinfo, '') + + @unittest.skipIf(sys.platform == 'win32', 'requires native bytes paths') + def test_extract_unencodable(self): + # Create a member with name \xed\xa0\x80 which is UTF-8 encoded + # lone surrogate \ud800. + with ArchiveMaker(encoding='ascii', errors='surrogateescape') as arc: + arc.add('\udced\udca0\udc80') + with os_helper.temp_cwd() as tmp: + tar = arc.open(encoding='utf-8', errors='surrogatepass', + errorlevel=1) + self.assertEqual(tar.getnames(), ['\ud800']) + with self.assertRaises(UnicodeEncodeError): + tar.extractall() + self.assertEqual(os.listdir(), []) + + tar = arc.open(encoding='utf-8', errors='surrogatepass', + errorlevel=0, debug=1) + with support.captured_stderr() as stderr: + tar.extractall() + self.assertEqual(os.listdir(), []) + self.assertIn('tarfile: UnicodeEncodeError ', stderr.getvalue()) + def test_change_default_filter_on_instance(self): tar = tarfile.TarFile(tarname, 'r') def strict_filter(tarinfo, path): diff --git a/Misc/NEWS.d/next/Library/2025-05-17-18-08-35.gh-issue-133890.onn9_X.rst b/Misc/NEWS.d/next/Library/2025-05-17-18-08-35.gh-issue-133890.onn9_X.rst new file mode 100644 index 00000000000000..44565a5424e65b --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-05-17-18-08-35.gh-issue-133890.onn9_X.rst @@ -0,0 +1,2 @@ +The :mod:`tarfile` module now handles :exc:`UnicodeEncodeError` in the same +way as :exc:`OSError` when cannot extract a member. _______________________________________________ Python-checkins mailing list -- python-checkins@python.org To unsubscribe send an email to python-checkins-le...@python.org https://mail.python.org/mailman3/lists/python-checkins.python.org/ Member address: arch...@mail-archive.com