Script 'mail_helper' called by obssrc
Hello community,

here is the log from the commit of package python-charset-normalizer for openSUSE:Factory checked in at 2021-05-20 19:25:29

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/python-charset-normalizer (Old)
 and      /work/SRC/openSUSE:Factory/.python-charset-normalizer.new.2988 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-charset-normalizer" Thu May 20 19:25:29 2021 rev:7 rq:894589 version:1.3.9 Changes: -------- --- /work/SRC/openSUSE:Factory/python-charset-normalizer/python-charset-normalizer.changes 2021-03-30 21:03:02.624892186 +0200 +++ /work/SRC/openSUSE:Factory/.python-charset-normalizer.new.2988/python-charset-normalizer.changes 2021-05-20 19:25:59.841711911 +0200 @@ -1,0 +2,8 @@ +Thu May 20 09:46:56 UTC 2021 - pgaj...@suse.com + +- version update to 1.3.9 + * Bugfix: bug In some very rare cases, you may end up getting encode/decode errors due to a bad bytes payload #40 + * Bugfix: bug Empty given payload for detection may cause an exception if trying to access the alphabets property. #39 + * Bugfix: bug The legacy detect function should return UTF-8-SIG if sig is present in the payload. #38 + +------------------------------------------------------------------- Old: ---- charset_normalizer-1.3.6.tar.gz New: ---- charset_normalizer-1.3.9.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ python-charset-normalizer.spec ++++++ --- /var/tmp/diff_new_pack.bpcUDn/_old 2021-05-20 19:26:00.337709876 +0200 +++ /var/tmp/diff_new_pack.bpcUDn/_new 2021-05-20 19:26:00.341709859 +0200 @@ -19,7 +19,7 @@ %{?!python_module:%define python_module() python-%{**} python3-%{**}} %define skip_python2 1 Name: python-charset-normalizer -Version: 1.3.6 +Version: 1.3.9 Release: 0 Summary: Python Universal Charset detector License: MIT @@ -35,7 +35,7 @@ Requires: python-loguru >= 0.5 Requires: python-zhon Requires(post): update-alternatives -Requires(postun): update-alternatives +Requires(postun):update-alternatives Suggests: python-requests Suggests: python-requests-html Suggests: python-unicodedata2 ++++++ charset_normalizer-1.3.6.tar.gz -> charset_normalizer-1.3.9.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.3.6/PKG-INFO new/charset_normalizer-1.3.9/PKG-INFO --- old/charset_normalizer-1.3.6/PKG-INFO 2021-02-09 01:05:13.511735400 +0100 +++ new/charset_normalizer-1.3.9/PKG-INFO 2021-05-13 22:39:08.152140000 +0200 @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: charset_normalizer -Version: 1.3.6 +Version: 1.3.9 Summary: The Real First Universal Charset Detector. No Cpp Bindings, Using Voodoo and Magical Artifacts. Home-page: https://github.com/ousret/charset_normalizer Author: Ahmed TAHRI @Ousret diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.3.6/charset_normalizer/legacy.py new/charset_normalizer-1.3.9/charset_normalizer/legacy.py --- old/charset_normalizer-1.3.6/charset_normalizer/legacy.py 2021-02-09 01:05:05.000000000 +0100 +++ new/charset_normalizer-1.3.9/charset_normalizer/legacy.py 2021-05-13 22:38:57.000000000 +0200 @@ -19,8 +19,17 @@ r = CnM.from_bytes(byte_str).best().first() + encoding = r.encoding if r is not None else None + language = r.language if r is not None and r.language != 'Unknown' else '' + confidence = 1. - r.chaos if r is not None else None + + # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process + # but chardet does return 'utf-8-sig' and it is a valid codec name. + if encoding == 'utf_8' and r.bom: + encoding += '_sig' + return { - 'encoding': r.encoding if r is not None else None, - 'language': r.language if r is not None and r.language != 'Unknown' else '', - 'confidence': 1. 

++++++ charset_normalizer-1.3.6.tar.gz -> charset_normalizer-1.3.9.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.3.6/PKG-INFO new/charset_normalizer-1.3.9/PKG-INFO
--- old/charset_normalizer-1.3.6/PKG-INFO  2021-02-09 01:05:13.511735400 +0100
+++ new/charset_normalizer-1.3.9/PKG-INFO  2021-05-13 22:39:08.152140000 +0200
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: charset_normalizer
-Version: 1.3.6
+Version: 1.3.9
 Summary: The Real First Universal Charset Detector. No Cpp Bindings, Using Voodoo and Magical Artifacts.
 Home-page: https://github.com/ousret/charset_normalizer
 Author: Ahmed TAHRI @Ousret
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.3.6/charset_normalizer/legacy.py new/charset_normalizer-1.3.9/charset_normalizer/legacy.py
--- old/charset_normalizer-1.3.6/charset_normalizer/legacy.py  2021-02-09 01:05:05.000000000 +0100
+++ new/charset_normalizer-1.3.9/charset_normalizer/legacy.py  2021-05-13 22:38:57.000000000 +0200
@@ -19,8 +19,17 @@
 
     r = CnM.from_bytes(byte_str).best().first()
 
+    encoding = r.encoding if r is not None else None
+    language = r.language if r is not None and r.language != 'Unknown' else ''
+    confidence = 1. - r.chaos if r is not None else None
+
+    # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
+    # but chardet does return 'utf-8-sig' and it is a valid codec name.
+    if encoding == 'utf_8' and r.bom:
+        encoding += '_sig'
+
     return {
-        'encoding': r.encoding if r is not None else None,
-        'language': r.language if r is not None and r.language != 'Unknown' else '',
-        'confidence': 1. - r.chaos if r is not None else None
+        'encoding': encoding,
+        'language': language,
+        'confidence': confidence
     }
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.3.6/charset_normalizer/normalizer.py new/charset_normalizer-1.3.9/charset_normalizer/normalizer.py
--- old/charset_normalizer-1.3.6/charset_normalizer/normalizer.py  2021-02-09 01:05:05.000000000 +0100
+++ new/charset_normalizer-1.3.9/charset_normalizer/normalizer.py  2021-05-13 22:38:57.000000000 +0200
@@ -264,7 +264,7 @@
         :return:
         :rtype: bytes
         """
-        return str(self).encode(encoding)
+        return str(self).encode(encoding, 'replace')
 
 
 class CharsetNormalizerMatches:
@@ -353,7 +353,7 @@
             sequences,
             'utf-8',
             0.,
-            []
+            {}
         )
 
         too_small_sequence = len(sequences) < 24
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.3.6/charset_normalizer/version.py new/charset_normalizer-1.3.9/charset_normalizer/version.py
--- old/charset_normalizer-1.3.6/charset_normalizer/version.py  2021-02-09 01:05:05.000000000 +0100
+++ new/charset_normalizer-1.3.9/charset_normalizer/version.py  2021-05-13 22:38:57.000000000 +0200
@@ -1,6 +1,6 @@
-"""
-Expose version
-"""
-
-__version__ = "1.3.6"
-VERSION = __version__.split('.')
+"""
+Expose version
+"""
+
+__version__ = "1.3.9"
+VERSION = __version__.split('.')
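
Note on the normalizer.py hunk above (issue #40): passing 'replace' as the errors argument to str.encode() substitutes characters the target codec cannot represent instead of raising UnicodeEncodeError, which is what keeps re-encoding a match from failing on a bad bytes payload. A standalone standard-library illustration (no charset_normalizer API involved):

    text = 'h\xe9llo w\xf6rld'              # contains characters ASCII cannot represent

    try:
        text.encode('ascii')                # default errors='strict' raises
    except UnicodeEncodeError as exc:
        print('strict mode failed:', exc.reason)

    print(text.encode('ascii', 'replace'))  # b'h?llo w?rld', lossy but never raises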
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.3.6/charset_normalizer.egg-info/PKG-INFO new/charset_normalizer-1.3.9/charset_normalizer.egg-info/PKG-INFO
--- old/charset_normalizer-1.3.6/charset_normalizer.egg-info/PKG-INFO  2021-02-09 01:05:13.000000000 +0100
+++ new/charset_normalizer-1.3.9/charset_normalizer.egg-info/PKG-INFO  2021-05-13 22:39:07.000000000 +0200
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: charset-normalizer
-Version: 1.3.6
+Version: 1.3.9
 Summary: The Real First Universal Charset Detector. No Cpp Bindings, Using Voodoo and Magical Artifacts.
 Home-page: https://github.com/ousret/charset_normalizer
 Author: Ahmed TAHRI @Ousret
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.3.6/test/test_detect_legacy.py new/charset_normalizer-1.3.9/test/test_detect_legacy.py
--- old/charset_normalizer-1.3.6/test/test_detect_legacy.py  2021-02-09 01:05:05.000000000 +0100
+++ new/charset_normalizer-1.3.9/test/test_detect_legacy.py  2021-05-13 22:38:57.000000000 +0200
@@ -62,3 +62,14 @@
             r['encoding'],
             'utf_7'
         )
+
+    def test_utf8_sig_not_striped(self):
+        r = detect(
+            "Hello World".encode('utf-8-sig')
+        )
+
+        with self.subTest("Verify that UTF-8-SIG is returned when using legacy detect"):
+            self.assertEqual(
+                r['encoding'],
+                "utf_8_sig"
+            )
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.3.6/test/test_on_byte.py new/charset_normalizer-1.3.9/test/test_on_byte.py
--- old/charset_normalizer-1.3.6/test/test_on_byte.py  2021-02-09 01:05:05.000000000 +0100
+++ new/charset_normalizer-1.3.9/test/test_on_byte.py  2021-05-13 22:38:57.000000000 +0200
@@ -1,136 +1,141 @@
-import unittest
-
-from charset_normalizer import CharsetNormalizerMatches as CnM
-
-
-class TestBytes(unittest.TestCase):
-
-    def test_too_short_none(self):
-        self.assertIsNotNone(
-            CnM.from_bytes(b'\xfe\xff').best().first()
-        )
-
-    def test_empty_bytes(self):
-        r = CnM.from_bytes(b'').best().first()
-
-        self.assertIsNotNone(
-            r
-        )
-
-        self.assertEqual(
-            'utf-8',
-            r.encoding
-        )
-
-    def test_bom_detection(self):
-        with self.subTest('GB18030 UNAVAILABLE SIG'):
-            self.assertFalse(
-                CnM.from_bytes(
-                    '????????????????????????????????????????????????'.encode('gb18030')
-                ).best().first().byte_order_mark
-            )
-
-        with self.subTest('GB18030 AVAILABLE SIG'):
-            self.assertTrue(
-                CnM.from_bytes(
-                    (u'\uFEFF' + '????????????????????????????????????????????????').encode('gb18030')
-                ).best().first().byte_order_mark
-            )
-
-        with self.subTest('UTF-7 AVAILABLE BOM'):
-            self.assertTrue(
-                CnM.from_bytes(
-                    b'\x2b\x2f\x76\x38' + '????????????????????????????????????????????????'.encode('utf_7')
-                ).best().first().byte_order_mark
-            )
-
-        with self.subTest('UTF-8 AVAILABLE BOM'):
-            self.assertTrue(
-                CnM.from_bytes(
-                    b'\xef\xbb\xbf' + '????????????????????????????????????????????????'.encode('utf_8')
-                ).best().first().byte_order_mark
-            )
-
-    def test_encode_decode(self):
-
-        with self.subTest('Encode & Detect UTF-8 WITHOUT SIG SMALL CONTENT'):
-            self.assertEqual(
-                CnM.from_bytes(
-                    'h\xe9llo world!\n'.encode('utf_8')
-                ).best().first().encoding,
-                'utf_8'
-            )
-
-        with self.subTest('Encode & Detect GB18030 WITHOUT SIG'):
-            self.assertEqual(
-                CnM.from_bytes(
-                    '????????????????????????????????????????????????'.encode('gb18030')
-                ).best().first().encoding,
-                'gb18030'
-            )
-
-        with self.subTest('Encode & Detect GB18030 WITH SIG (CJK)'):
-            self.assertEqual(
-                CnM.from_bytes(
-                    (u'\uFEFF' + '????????????????????????????????????????????????').encode('gb18030')
-                ).best().first().encoding,
-                'gb18030'
-            )
-
-        with self.subTest('Encode & Detect UTF-8 WITHOUT SIG (CJK)'):
-            self.assertEqual(
-                CnM.from_bytes(
-                    '????????????????????????????????????????????????'.encode('utf_8')
-                ).best().first().encoding,
-                'utf_8'
-            )
-
-        with self.subTest('Encode & Detect UTF-7 WITHOUT BOM (CJK)'):
-            self.assertEqual(
-                CnM.from_bytes(
-                    '???????????????,??????????????????????????????'.encode('utf_7')
-                ).best().first().encoding,
-                'utf_7'
-            )
-
-        with self.subTest('Encode & Detect UTF-7 WITH BOM (CJK)'):
-            self.assertEqual(
-                CnM.from_bytes(
-                    b'\x2b\x2f\x76\x38'+'????????????????????????????????????????????????'.encode('utf_7')
-                ).best().first().encoding,
-                'utf_7'
-            )
-
-        with self.subTest('Encode & Detect UTF-7 WITHOUT BOM (CYRILLIC)'):
-            self.assertEqual(
-                CnM.from_bytes(
-                    'B???????? ?????????? ?????? ?????????? ???? ??????????????????????. O???????????????????????? ???????????? ???? ???????? ??????????????????,'.encode('utf_7')
-                ).best().first().encoding,
-                'utf_7'
-            )
-
-        with self.subTest('Encode & Detect UTF-8 WITH SIG (CJK)'):
-            self.assertEqual(
-                CnM.from_bytes(
-                    b'\xef\xbb\xbf' + '????????????????????????????????????????????????'.encode('utf_8')
-                ).best().first().encoding,
-                'utf_8'
-            )
-
-        with self.subTest('Encode & Detect UTF-8 WITHOUT SIG (CYRILLIC)'):
-            self.assertEqual(
-                CnM.from_bytes(
-                    'B???????? ?????????? ?????? ?????????? ???? ??????????????????????. O???????????????????????? ???????????? ???? ???????? ??????????????????, '
-                    '???????? ???? ???? ???????????? ???? ?????????????????? ?? ?????????????????? ??????????????????????.'.encode('utf_8')
-                ).best().first().encoding,
-                'utf_8'
-            )
-
-        with self.subTest('Encode & Detect UTF-8 WITHOUT SIG (CYRILLIC)'):
-            self.assertEqual(
-                CnM.from_bytes(
-                    'B???????? ?????????? ?????? ?????????? ???? ??????????????????????.'.encode(
-                        'utf_8')
-                ).best().first().encoding,
-                'utf_8'
-            )
+import unittest
+
+from charset_normalizer import CharsetNormalizerMatches as CnM
+
+
+class TestBytes(unittest.TestCase):
+
+    def test_too_short_none(self):
+        self.assertIsNotNone(
+            CnM.from_bytes(b'\xfe\xff').best().first()
+        )
+
+    def test_empty_bytes(self):
+        r = CnM.from_bytes(b'').best().first()
+
+        self.assertIsNotNone(
+            r
+        )
+
+        self.assertEqual(
+            'utf-8',
+            r.encoding
+        )
+
+        self.assertEqual(
+            0,
+            len(r.alphabets)
+        )
+
+    def test_bom_detection(self):
+        with self.subTest('GB18030 UNAVAILABLE SIG'):
+            self.assertFalse(
+                CnM.from_bytes(
+                    '????????????????????????????????????????????????'.encode('gb18030')
+                ).best().first().byte_order_mark
+            )
+
+        with self.subTest('GB18030 AVAILABLE SIG'):
+            self.assertTrue(
+                CnM.from_bytes(
+                    (u'\uFEFF' + '????????????????????????????????????????????????').encode('gb18030')
+                ).best().first().byte_order_mark
+            )
+
+        with self.subTest('UTF-7 AVAILABLE BOM'):
+            self.assertTrue(
+                CnM.from_bytes(
+                    b'\x2b\x2f\x76\x38' + '????????????????????????????????????????????????'.encode('utf_7')
+                ).best().first().byte_order_mark
+            )
+
+        with self.subTest('UTF-8 AVAILABLE BOM'):
+            self.assertTrue(
+                CnM.from_bytes(
+                    b'\xef\xbb\xbf' + '????????????????????????????????????????????????'.encode('utf_8')
+                ).best().first().byte_order_mark
+            )
+
+    def test_encode_decode(self):
+
+        with self.subTest('Encode & Detect UTF-8 WITHOUT SIG SMALL CONTENT'):
+            self.assertEqual(
+                CnM.from_bytes(
+                    'h\xe9llo world!\n'.encode('utf_8')
+                ).best().first().encoding,
+                'utf_8'
+            )
+
+        with self.subTest('Encode & Detect GB18030 WITHOUT SIG'):
+            self.assertEqual(
+                CnM.from_bytes(
+                    '????????????????????????????????????????????????'.encode('gb18030')
+                ).best().first().encoding,
+                'gb18030'
+            )
+
+        with self.subTest('Encode & Detect GB18030 WITH SIG (CJK)'):
+            self.assertEqual(
+                CnM.from_bytes(
+                    (u'\uFEFF' + '????????????????????????????????????????????????').encode('gb18030')
+                ).best().first().encoding,
+                'gb18030'
+            )
+
+        with self.subTest('Encode & Detect UTF-8 WITHOUT SIG (CJK)'):
+            self.assertEqual(
+                CnM.from_bytes(
+                    '????????????????????????????????????????????????'.encode('utf_8')
+                ).best().first().encoding,
+                'utf_8'
+            )
+
+        with self.subTest('Encode & Detect UTF-7 WITHOUT BOM (CJK)'):
+            self.assertEqual(
+                CnM.from_bytes(
+                    '???????????????,??????????????????????????????'.encode('utf_7')
+                ).best().first().encoding,
+                'utf_7'
+            )
+
+        with self.subTest('Encode & Detect UTF-7 WITH BOM (CJK)'):
+            self.assertEqual(
+                CnM.from_bytes(
+                    b'\x2b\x2f\x76\x38'+'????????????????????????????????????????????????'.encode('utf_7')
+                ).best().first().encoding,
+                'utf_7'
+            )
+
+        with self.subTest('Encode & Detect UTF-7 WITHOUT BOM (CYRILLIC)'):
+            self.assertEqual(
+                CnM.from_bytes(
+                    'B???????? ?????????? ?????? ?????????? ???? ??????????????????????. O???????????????????????? ???????????? ???? ???????? ??????????????????,'.encode('utf_7')
+                ).best().first().encoding,
+                'utf_7'
+            )
+
+        with self.subTest('Encode & Detect UTF-8 WITH SIG (CJK)'):
+            self.assertEqual(
+                CnM.from_bytes(
+                    b'\xef\xbb\xbf' + '????????????????????????????????????????????????'.encode('utf_8')
+                ).best().first().encoding,
+                'utf_8'
+            )
+
+        with self.subTest('Encode & Detect UTF-8 WITHOUT SIG (CYRILLIC)'):
+            self.assertEqual(
+                CnM.from_bytes(
+                    'B???????? ?????????? ?????? ?????????? ???? ??????????????????????. O???????????????????????? ???????????? ???? ???????? ??????????????????, '
+                    '???????? ???? ???? ???????????? ???? ?????????????????? ?? ?????????????????? ??????????????????????.'.encode('utf_8')
+                ).best().first().encoding,
+                'utf_8'
+            )
+
+        with self.subTest('Encode & Detect UTF-8 WITHOUT SIG (CYRILLIC)'):
+            self.assertEqual(
+                CnM.from_bytes(
+                    'B???????? ?????????? ?????? ?????????? ???? ??????????????????????.'.encode(
+                        'utf_8')
+                ).best().first().encoding,
+                'utf_8'
+            )
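
Closing note on the remaining fix (issue #39), which the new assertion in test_on_byte.py above covers: an empty payload now yields a match whose alphabets property is simply empty rather than raising. A minimal sketch, assuming the same CharsetNormalizerMatches API the test suite uses:

    from charset_normalizer import CharsetNormalizerMatches as CnM

    match = CnM.from_bytes(b'').best().first()
    print(match.encoding)        # expected: utf-8
    print(len(match.alphabets))  # expected: 0, no exception for an empty payload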