Script 'mail_helper' called by obssrc
Hello community,

here is the log from the commit of package python-charset-normalizer for openSUSE:Factory checked in at 2021-05-20 19:25:29

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/python-charset-normalizer (Old)
 and      /work/SRC/openSUSE:Factory/.python-charset-normalizer.new.2988 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-charset-normalizer" Thu May 20 19:25:29 2021 rev:7 rq:894589 version:1.3.9 Changes: -------- --- /work/SRC/openSUSE:Factory/python-charset-normalizer/python-charset-normalizer.changes 2021-03-30 21:03:02.624892186 +0200 +++ /work/SRC/openSUSE:Factory/.python-charset-normalizer.new.2988/python-charset-normalizer.changes 2021-05-20 19:25:59.841711911 +0200 @@ -1,0 +2,8 @@ +Thu May 20 09:46:56 UTC 2021 - pgaj...@suse.com + +- version update to 1.3.9 + * Bugfix: bug In some very rare cases, you may end up getting encode/decode errors due to a bad bytes payload #40 + * Bugfix: bug Empty given payload for detection may cause an exception if trying to access the alphabets property. #39 + * Bugfix: bug The legacy detect function should return UTF-8-SIG if sig is present in the payload. #38 + +------------------------------------------------------------------- Old: ---- charset_normalizer-1.3.6.tar.gz New: ---- charset_normalizer-1.3.9.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ python-charset-normalizer.spec ++++++ --- /var/tmp/diff_new_pack.bpcUDn/_old 2021-05-20 19:26:00.337709876 +0200 +++ /var/tmp/diff_new_pack.bpcUDn/_new 2021-05-20 19:26:00.341709859 +0200 @@ -19,7 +19,7 @@ %{?!python_module:%define python_module() python-%{**} python3-%{**}} %define skip_python2 1 Name: python-charset-normalizer -Version: 1.3.6 +Version: 1.3.9 Release: 0 Summary: Python Universal Charset detector License: MIT @@ -35,7 +35,7 @@ Requires: python-loguru >= 0.5 Requires: python-zhon Requires(post): update-alternatives -Requires(postun): update-alternatives +Requires(postun):update-alternatives Suggests: python-requests Suggests: python-requests-html Suggests: python-unicodedata2 ++++++ charset_normalizer-1.3.6.tar.gz -> charset_normalizer-1.3.9.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.3.6/PKG-INFO new/charset_normalizer-1.3.9/PKG-INFO --- old/charset_normalizer-1.3.6/PKG-INFO 2021-02-09 01:05:13.511735400 +0100 +++ new/charset_normalizer-1.3.9/PKG-INFO 2021-05-13 22:39:08.152140000 +0200 @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: charset_normalizer -Version: 1.3.6 +Version: 1.3.9 Summary: The Real First Universal Charset Detector. No Cpp Bindings, Using Voodoo and Magical Artifacts. Home-page: https://github.com/ousret/charset_normalizer Author: Ahmed TAHRI @Ousret diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.3.6/charset_normalizer/legacy.py new/charset_normalizer-1.3.9/charset_normalizer/legacy.py --- old/charset_normalizer-1.3.6/charset_normalizer/legacy.py 2021-02-09 01:05:05.000000000 +0100 +++ new/charset_normalizer-1.3.9/charset_normalizer/legacy.py 2021-05-13 22:38:57.000000000 +0200 @@ -19,8 +19,17 @@ r = CnM.from_bytes(byte_str).best().first() + encoding = r.encoding if r is not None else None + language = r.language if r is not None and r.language != 'Unknown' else '' + confidence = 1. - r.chaos if r is not None else None + + # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process + # but chardet does return 'utf-8-sig' and it is a valid codec name. + if encoding == 'utf_8' and r.bom: + encoding += '_sig' + return { - 'encoding': r.encoding if r is not None else None, - 'language': r.language if r is not None and r.language != 'Unknown' else '', - 'confidence': 1. 

++++++ charset_normalizer-1.3.6.tar.gz -> charset_normalizer-1.3.9.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.3.6/PKG-INFO new/charset_normalizer-1.3.9/PKG-INFO
--- old/charset_normalizer-1.3.6/PKG-INFO  2021-02-09 01:05:13.511735400 +0100
+++ new/charset_normalizer-1.3.9/PKG-INFO  2021-05-13 22:39:08.152140000 +0200
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: charset_normalizer
-Version: 1.3.6
+Version: 1.3.9
 Summary: The Real First Universal Charset Detector. No Cpp Bindings, Using Voodoo and Magical Artifacts.
 Home-page: https://github.com/ousret/charset_normalizer
 Author: Ahmed TAHRI @Ousret
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.3.6/charset_normalizer/legacy.py new/charset_normalizer-1.3.9/charset_normalizer/legacy.py
--- old/charset_normalizer-1.3.6/charset_normalizer/legacy.py  2021-02-09 01:05:05.000000000 +0100
+++ new/charset_normalizer-1.3.9/charset_normalizer/legacy.py  2021-05-13 22:38:57.000000000 +0200
@@ -19,8 +19,17 @@
 
     r = CnM.from_bytes(byte_str).best().first()
 
+    encoding = r.encoding if r is not None else None
+    language = r.language if r is not None and r.language != 'Unknown' else ''
+    confidence = 1. - r.chaos if r is not None else None
+
+    # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
+    # but chardet does return 'utf-8-sig' and it is a valid codec name.
+    if encoding == 'utf_8' and r.bom:
+        encoding += '_sig'
+
     return {
-        'encoding': r.encoding if r is not None else None,
-        'language': r.language if r is not None and r.language != 'Unknown' else '',
-        'confidence': 1. - r.chaos if r is not None else None
+        'encoding': encoding,
+        'language': language,
+        'confidence': confidence
     }
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.3.6/charset_normalizer/normalizer.py new/charset_normalizer-1.3.9/charset_normalizer/normalizer.py
--- old/charset_normalizer-1.3.6/charset_normalizer/normalizer.py  2021-02-09 01:05:05.000000000 +0100
+++ new/charset_normalizer-1.3.9/charset_normalizer/normalizer.py  2021-05-13 22:38:57.000000000 +0200
@@ -264,7 +264,7 @@
         :return:
         :rtype: bytes
         """
-        return str(self).encode(encoding)
+        return str(self).encode(encoding, 'replace')
 
 
 class CharsetNormalizerMatches:
@@ -353,7 +353,7 @@
             sequences,
             'utf-8',
             0.,
-            []
+            {}
         )
 
         too_small_sequence = len(sequences) < 24
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.3.6/charset_normalizer/version.py new/charset_normalizer-1.3.9/charset_normalizer/version.py
--- old/charset_normalizer-1.3.6/charset_normalizer/version.py  2021-02-09 01:05:05.000000000 +0100
+++ new/charset_normalizer-1.3.9/charset_normalizer/version.py  2021-05-13 22:38:57.000000000 +0200
@@ -1,6 +1,6 @@
-"""
-Expose version
-"""
-
-__version__ = "1.3.6"
-VERSION = __version__.split('.')
+"""
+Expose version
+"""
+
+__version__ = "1.3.9"
+VERSION = __version__.split('.')
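
Note on the normalizer.py hunk above (issue #40): passing 'replace' as the errors argument to str.encode() substitutes characters the target codec cannot represent instead of raising UnicodeEncodeError, which is what keeps re-encoding a match from failing on a bad bytes payload. A standalone standard-library illustration (no charset_normalizer API involved):

    text = 'h\xe9llo w\xf6rld'              # contains characters ASCII cannot represent

    try:
        text.encode('ascii')                # default errors='strict' raises
    except UnicodeEncodeError as exc:
        print('strict mode failed:', exc.reason)

    print(text.encode('ascii', 'replace'))  # b'h?llo w?rld', lossy but never raises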
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.3.6/charset_normalizer.egg-info/PKG-INFO new/charset_normalizer-1.3.9/charset_normalizer.egg-info/PKG-INFO
--- old/charset_normalizer-1.3.6/charset_normalizer.egg-info/PKG-INFO  2021-02-09 01:05:13.000000000 +0100
+++ new/charset_normalizer-1.3.9/charset_normalizer.egg-info/PKG-INFO  2021-05-13 22:39:07.000000000 +0200
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: charset-normalizer
-Version: 1.3.6
+Version: 1.3.9
 Summary: The Real First Universal Charset Detector. No Cpp Bindings, Using Voodoo and Magical Artifacts.
 Home-page: https://github.com/ousret/charset_normalizer
 Author: Ahmed TAHRI @Ousret
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.3.6/test/test_detect_legacy.py new/charset_normalizer-1.3.9/test/test_detect_legacy.py
--- old/charset_normalizer-1.3.6/test/test_detect_legacy.py  2021-02-09 01:05:05.000000000 +0100
+++ new/charset_normalizer-1.3.9/test/test_detect_legacy.py  2021-05-13 22:38:57.000000000 +0200
@@ -62,3 +62,14 @@
             r['encoding'],
             'utf_7'
         )
+
+    def test_utf8_sig_not_striped(self):
+        r = detect(
+            "Hello World".encode('utf-8-sig')
+        )
+
+        with self.subTest("Verify that UTF-8-SIG is returned when using legacy detect"):
+            self.assertEqual(
+                r['encoding'],
+                "utf_8_sig"
+            )
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.3.6/test/test_on_byte.py new/charset_normalizer-1.3.9/test/test_on_byte.py
--- old/charset_normalizer-1.3.6/test/test_on_byte.py  2021-02-09 01:05:05.000000000 +0100
+++ new/charset_normalizer-1.3.9/test/test_on_byte.py  2021-05-13 22:38:57.000000000 +0200
@@ -1,136 +1,141 @@
-import unittest
-
-from charset_normalizer import CharsetNormalizerMatches as CnM
-
-
-class TestBytes(unittest.TestCase):
-
-    def test_too_short_none(self):
-        self.assertIsNotNone(
-            CnM.from_bytes(b'\xfe\xff').best().first()
-        )
-
-    def test_empty_bytes(self):
-        r = CnM.from_bytes(b'').best().first()
-
-        self.assertIsNotNone(
-            r
-        )
-
-        self.assertEqual(
-            'utf-8',
-            r.encoding
-        )
-
-    def test_bom_detection(self):
-        with self.subTest('GB18030 UNAVAILABLE SIG'):
-            self.assertFalse(
-                CnM.from_bytes(
-                    '????????????????????????????????????????????????'.encode('gb18030')
-                ).best().first().byte_order_mark
-            )
-
-        with self.subTest('GB18030 AVAILABLE SIG'):
-            self.assertTrue(
-                CnM.from_bytes(
-                    (u'\uFEFF' + '????????????????????????????????????????????????').encode('gb18030')
-                ).best().first().byte_order_mark
-            )
-
-        with self.subTest('UTF-7 AVAILABLE BOM'):
-            self.assertTrue(
-                CnM.from_bytes(
-                    b'\x2b\x2f\x76\x38' + '????????????????????????????????????????????????'.encode('utf_7')
-                ).best().first().byte_order_mark
-            )
-
-        with self.subTest('UTF-8 AVAILABLE BOM'):
-            self.assertTrue(
-                CnM.from_bytes(
-                    b'\xef\xbb\xbf' + '????????????????????????????????????????????????'.encode('utf_8')
-                ).best().first().byte_order_mark
-            )
-
-    def test_encode_decode(self):
-
-        with self.subTest('Encode & Detect UTF-8 WITHOUT SIG SMALL CONTENT'):
-            self.assertEqual(
-                CnM.from_bytes(
-                    'h\xe9llo world!\n'.encode('utf_8')
-                ).best().first().encoding,
-                'utf_8'
-            )
-
-        with self.subTest('Encode & Detect GB18030 WITHOUT SIG'):
-            self.assertEqual(
-                CnM.from_bytes(
-                    '????????????????????????????????????????????????'.encode('gb18030')
-                ).best().first().encoding,
-                'gb18030'
-            )
-
-        with self.subTest('Encode & Detect GB18030 WITH SIG (CJK)'):
-            self.assertEqual(
-                CnM.from_bytes(
-                    (u'\uFEFF' + '????????????????????????????????????????????????').encode('gb18030')
-                ).best().first().encoding,
-                'gb18030'
-            )
-
-        with self.subTest('Encode & Detect UTF-8 WITHOUT SIG (CJK)'):
-            self.assertEqual(
-                CnM.from_bytes(
-                    '????????????????????????????????????????????????'.encode('utf_8')
-                ).best().first().encoding,
-                'utf_8'
-            )
-
-        with self.subTest('Encode & Detect UTF-7 WITHOUT BOM (CJK)'):
-            self.assertEqual(
-                CnM.from_bytes(
-                    '???????????????,??????????????????????????????'.encode('utf_7')
-                ).best().first().encoding,
-                'utf_7'
-            )
-
-        with self.subTest('Encode & Detect UTF-7 WITH BOM (CJK)'):
-            self.assertEqual(
-                CnM.from_bytes(
-                    b'\x2b\x2f\x76\x38'+'????????????????????????????????????????????????'.encode('utf_7')
-                ).best().first().encoding,
-                'utf_7'
-            )
-
-        with self.subTest('Encode & Detect UTF-7 WITHOUT BOM (CYRILLIC)'):
-            self.assertEqual(
-                CnM.from_bytes(
-                    'B???????? ?????????? ?????? ?????????? ???? ??????????????????????. O???????????????????????? ???????????? ???? ???????? ??????????????????,'.encode('utf_7')
-                ).best().first().encoding,
-                'utf_7'
-            )
-
-        with self.subTest('Encode & Detect UTF-8 WITH SIG (CJK)'):
-            self.assertEqual(
-                CnM.from_bytes(
-                    b'\xef\xbb\xbf' + '????????????????????????????????????????????????'.encode('utf_8')
-                ).best().first().encoding,
-                'utf_8'
-            )
-
-        with self.subTest('Encode & Detect UTF-8 WITHOUT SIG (CYRILLIC)'):
-            self.assertEqual(
-                CnM.from_bytes(
-                    'B???????? ?????????? ?????? ?????????? ???? ??????????????????????. O???????????????????????? ???????????? ???? ???????? ??????????????????, '
-                    '???????? ???? ???? ???????????? ???? ?????????????????? ?? ?????????????????? ??????????????????????.'.encode('utf_8')
-                ).best().first().encoding,
-                'utf_8'
-            )
-
-        with self.subTest('Encode & Detect UTF-8 WITHOUT SIG (CYRILLIC)'):
-            self.assertEqual(
-                CnM.from_bytes(
-                    'B???????? ?????????? ?????? ?????????? ???? ??????????????????????.'.encode(
-                        'utf_8')
-                ).best().first().encoding,
-                'utf_8'
-            )
+import unittest
+
+from charset_normalizer import CharsetNormalizerMatches as CnM
+
+
+class TestBytes(unittest.TestCase):
+
+    def test_too_short_none(self):
+        self.assertIsNotNone(
+            CnM.from_bytes(b'\xfe\xff').best().first()
+        )
+
+    def test_empty_bytes(self):
+        r = CnM.from_bytes(b'').best().first()
+
+        self.assertIsNotNone(
+            r
+        )
+
+        self.assertEqual(
+            'utf-8',
+            r.encoding
+        )
+
+        self.assertEqual(
+            0,
+            len(r.alphabets)
+        )
+
+    def test_bom_detection(self):
+        with self.subTest('GB18030 UNAVAILABLE SIG'):
+            self.assertFalse(
+                CnM.from_bytes(
+                    '????????????????????????????????????????????????'.encode('gb18030')
+                ).best().first().byte_order_mark
+            )
+
+        with self.subTest('GB18030 AVAILABLE SIG'):
+            self.assertTrue(
+                CnM.from_bytes(
+                    (u'\uFEFF' + '????????????????????????????????????????????????').encode('gb18030')
+                ).best().first().byte_order_mark
+            )
+
+        with self.subTest('UTF-7 AVAILABLE BOM'):
+            self.assertTrue(
+                CnM.from_bytes(
+                    b'\x2b\x2f\x76\x38' + '????????????????????????????????????????????????'.encode('utf_7')
+                ).best().first().byte_order_mark
+            )
+
+        with self.subTest('UTF-8 AVAILABLE BOM'):
+            self.assertTrue(
+                CnM.from_bytes(
+                    b'\xef\xbb\xbf' + '????????????????????????????????????????????????'.encode('utf_8')
+                ).best().first().byte_order_mark
+            )
+
+    def test_encode_decode(self):
+
+        with self.subTest('Encode & Detect UTF-8 WITHOUT SIG SMALL CONTENT'):
+            self.assertEqual(
+                CnM.from_bytes(
+                    'h\xe9llo world!\n'.encode('utf_8')
+                ).best().first().encoding,
+                'utf_8'
+            )
+
+        with self.subTest('Encode & Detect GB18030 WITHOUT SIG'):
+            self.assertEqual(
+                CnM.from_bytes(
+                    '????????????????????????????????????????????????'.encode('gb18030')
+                ).best().first().encoding,
+                'gb18030'
+            )
+
+        with self.subTest('Encode & Detect GB18030 WITH SIG (CJK)'):
+            self.assertEqual(
+                CnM.from_bytes(
+                    (u'\uFEFF' + '????????????????????????????????????????????????').encode('gb18030')
+                ).best().first().encoding,
+                'gb18030'
+            )
+
+        with self.subTest('Encode & Detect UTF-8 WITHOUT SIG (CJK)'):
+            self.assertEqual(
+                CnM.from_bytes(
+                    '????????????????????????????????????????????????'.encode('utf_8')
+                ).best().first().encoding,
+                'utf_8'
+            )
+
+        with self.subTest('Encode & Detect UTF-7 WITHOUT BOM (CJK)'):
+            self.assertEqual(
+                CnM.from_bytes(
+                    '???????????????,??????????????????????????????'.encode('utf_7')
+                ).best().first().encoding,
+                'utf_7'
+            )
+
+        with self.subTest('Encode & Detect UTF-7 WITH BOM (CJK)'):
+            self.assertEqual(
+                CnM.from_bytes(
+                    b'\x2b\x2f\x76\x38'+'????????????????????????????????????????????????'.encode('utf_7')
+                ).best().first().encoding,
+                'utf_7'
+            )
+
+        with self.subTest('Encode & Detect UTF-7 WITHOUT BOM (CYRILLIC)'):
+            self.assertEqual(
+                CnM.from_bytes(
+                    'B???????? ?????????? ?????? ?????????? ???? ??????????????????????. O???????????????????????? ???????????? ???? ???????? ??????????????????,'.encode('utf_7')
+                ).best().first().encoding,
+                'utf_7'
+            )
+
+        with self.subTest('Encode & Detect UTF-8 WITH SIG (CJK)'):
+            self.assertEqual(
+                CnM.from_bytes(
+                    b'\xef\xbb\xbf' + '????????????????????????????????????????????????'.encode('utf_8')
+                ).best().first().encoding,
+                'utf_8'
+            )
+
+        with self.subTest('Encode & Detect UTF-8 WITHOUT SIG (CYRILLIC)'):
+            self.assertEqual(
+                CnM.from_bytes(
+                    'B???????? ?????????? ?????? ?????????? ???? ??????????????????????. O???????????????????????? ???????????? ???? ???????? ??????????????????, '
+                    '???????? ???? ???? ???????????? ???? ?????????????????? ?? ?????????????????? ??????????????????????.'.encode('utf_8')
+                ).best().first().encoding,
+                'utf_8'
+            )
+
+        with self.subTest('Encode & Detect UTF-8 WITHOUT SIG (CYRILLIC)'):
+            self.assertEqual(
+                CnM.from_bytes(
+                    'B???????? ?????????? ?????? ?????????? ???? ??????????????????????.'.encode(
+                        'utf_8')
+                ).best().first().encoding,
+                'utf_8'
+            )
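
Closing note on the remaining fix (issue #39), which the new assertion in test_on_byte.py above covers: an empty payload now yields a match whose alphabets property is simply empty rather than raising. A minimal sketch, assuming the same CharsetNormalizerMatches API the test suite uses:

    from charset_normalizer import CharsetNormalizerMatches as CnM

    match = CnM.from_bytes(b'').best().first()
    print(match.encoding)        # expected: utf-8
    print(len(match.alphabets))  # expected: 0, no exception for an empty payload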