Hello community,
here is the log from the commit of package python-charset-normalizer for
openSUSE:Factory checked in at 2019-10-16 09:12:25
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/python-charset-normalizer (Old)
and /work/SRC/openSUSE:Factory/.python-charset-normalizer.new.2352 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-charset-normalizer"
Wed Oct 16 09:12:25 2019 rev:3 rq:734952 version:1.3.0
Changes:
--------
--- /work/SRC/openSUSE:Factory/python-charset-normalizer/python-charset-normalizer.changes	2019-09-27 14:51:55.192223149 +0200
+++ /work/SRC/openSUSE:Factory/.python-charset-normalizer.new.2352/python-charset-normalizer.changes	2019-10-16 09:12:29.231900258 +0200
@@ -1,0 +2,14 @@
+Fri Oct 4 08:52:51 UTC 2019 - Marketa Calabkova <[email protected]>
+
+- Update to 1.3.0
+ * Backport unicodedata for v12 impl into python if available
+ * Add aliases to CharsetNormalizerMatches class
+ * Add feature preemptive behaviour, looking for encoding declaration
+ * Add method to determine if specific encoding is multi byte
+ * Add has_submatch property on a match
+ * Add percent_chaos and percent_coherence
+ * Coherence ratio based on mean instead of sum of best results
+ * Using loguru for trace/debug <3
+ * from_bytes method improved
+
+-------------------------------------------------------------------
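For orientation, here is a minimal sketch of how the 1.3.0 additions listed above fit together. It is illustrative only and not part of the check-in; the CharsetDetector alias, the preemptive_behaviour/explain flags and the percent_chaos/percent_coherence/has_submatch properties are taken from the upstream diff further down:

    from charset_normalizer import CharsetDetector  # alias of CharsetNormalizerMatches

    payload = '<?xml version="1.0" encoding="utf-8"?><root>café</root>'.encode('utf-8')

    matches = CharsetDetector.from_bytes(
        payload,
        preemptive_behaviour=True,  # honour an encoding declared inside the payload
        explain=False,              # True would emit the new loguru trace output
    )

    best_guess = matches.best().first()
    print(best_guess.encoding, best_guess.percent_chaos,
          best_guess.percent_coherence, best_guess.has_submatch)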
Old:
----
charset_normalizer-1.1.1.tar.gz
New:
----
charset_normalizer-1.3.0.tar.gz
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Other differences:
------------------
++++++ python-charset-normalizer.spec ++++++
--- /var/tmp/diff_new_pack.y4kTmJ/_old 2019-10-16 09:12:29.787898825 +0200
+++ /var/tmp/diff_new_pack.y4kTmJ/_new 2019-10-16 09:12:29.791898815 +0200
@@ -20,11 +20,10 @@
# https://github.com/Ousret/charset_normalizer/issues/1
%define skip_python2 1
Name: python-charset-normalizer
-Version: 1.1.1
+Version: 1.3.0
Release: 0
Summary: Python Universal Charset detector
License: MIT
-Group: Development/Languages/Python
URL: https://github.com/ousret/charset_normalizer
Source:         https://github.com/Ousret/charset_normalizer/archive/%{version}.tar.gz#/charset_normalizer-%{version}.tar.gz
BuildRequires: %{python_module setuptools}
@@ -34,6 +33,7 @@
Requires: python-PrettyTable
Requires: python-cached-property
Requires: python-dragonmapper
+Requires: python-loguru
Requires: python-zhon
Suggests: python-requests-html
BuildArch: noarch
@@ -41,6 +41,7 @@
BuildRequires: %{python_module PrettyTable}
BuildRequires: %{python_module cached-property}
BuildRequires: %{python_module dragonmapper}
+BuildRequires: %{python_module loguru}
BuildRequires: %{python_module pytest-runner}
BuildRequires: %{python_module zhon}
# /SECTION
++++++ charset_normalizer-1.1.1.tar.gz -> charset_normalizer-1.3.0.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/README.md new/charset_normalizer-1.3.0/README.md
--- old/charset_normalizer-1.1.1/README.md	2019-09-23 14:45:48.000000000 +0200
+++ new/charset_normalizer-1.3.0/README.md	2019-09-30 20:01:29.000000000 +0200
@@ -6,7 +6,9 @@
<img
src="https://travis-ci.org/Ousret/charset_normalizer.svg?branch=master"/>
</a>
<img
src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue"
/>
- <img src="https://img.shields.io/pypi/dm/charset_normalizer.svg"/>
+ <a href="https://pepy.tech/project/charset-normalizer/">
+ <img alt="Download Count /Month"
src="https://pepy.tech/badge/charset-normalizer/month"/>
+ </a>
<a href="https://github.com/ousret/charset_normalizer/blob/master/LICENSE">
<img alt="License: MIT"
src="https://img.shields.io/badge/license-MIT-purple.svg" target="_blank" />
</a>
@@ -16,6 +18,7 @@
<a href="https://codecov.io/gh/Ousret/charset_normalizer">
<img
src="https://codecov.io/gh/Ousret/charset_normalizer/branch/master/graph/badge.svg"
/>
</a>
+ <img alt="Download Count Total"
src="https://pepy.tech/badge/charset-normalizer" />
</p>
> Library that help you read text from unknown charset encoding.<br /> Project
> motivated by `chardet`,
@@ -103,9 +106,8 @@
## 😇 Why
-When I started using Chardet, I noticed that this library was wrong most of the time
-when it's not about Unicode, Gb or Big5. That because some charset are easily identifiable
-because of there standards and Chardet does a really good job at identifying them.
+When I started using Chardet, I noticed that this library was unreliable nowadays and also
+it's unmaintained, and most likely will never be.
I **don't care** about the **originating charset** encoding, that because
**two different table** can
produce **two identical file.**
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/charset_normalizer/__init__.py new/charset_normalizer-1.3.0/charset_normalizer/__init__.py
--- old/charset_normalizer-1.1.1/charset_normalizer/__init__.py	2019-09-23 14:45:48.000000000 +0200
+++ new/charset_normalizer-1.3.0/charset_normalizer/__init__.py	2019-09-30 20:01:29.000000000 +0200
@@ -1,7 +1,9 @@
# coding: utf-8
-from charset_normalizer.normalizer import CharsetNormalizerMatches,
CharsetNormalizerMatch
+from charset_normalizer.normalizer import CharsetNormalizerMatches,
CharsetNormalizerMatch, \
+ CharsetDetector, CharsetDoctor, EncodingDetector # Aliases
from charset_normalizer.unicode import UnicodeRangeIdentify
from charset_normalizer.probe_chaos import ProbeChaos
from charset_normalizer.probe_coherence import ProbeCoherence
from charset_normalizer.probe_words import ProbeWords
from charset_normalizer.legacy import detect
+from charset_normalizer.hook import charset_normalizer_hook
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/charset_normalizer/constant.py new/charset_normalizer-1.3.0/charset_normalizer/constant.py
--- old/charset_normalizer-1.1.1/charset_normalizer/constant.py	2019-09-23 14:45:48.000000000 +0200
+++ new/charset_normalizer-1.3.0/charset_normalizer/constant.py	2019-09-30 20:01:29.000000000 +0200
@@ -4,6 +4,9 @@
Scrapped from https://unicode-table.com/
"""
from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE,
BOM_UTF32_LE
+from _multibytecodec import MultibyteIncrementalDecoder
+
+MULTI_BYTE_DECODER = MultibyteIncrementalDecoder
UNICODE_RANGES = [
"0000−001F",
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/charset_normalizer/encoding.py new/charset_normalizer-1.3.0/charset_normalizer/encoding.py
--- old/charset_normalizer-1.1.1/charset_normalizer/encoding.py	1970-01-01 01:00:00.000000000 +0100
+++ new/charset_normalizer-1.3.0/charset_normalizer/encoding.py	2019-09-30 20:01:29.000000000 +0200
@@ -0,0 +1,15 @@
+from charset_normalizer.constant import MULTI_BYTE_DECODER
+import importlib
+
+
+def is_multi_byte_encoding(encoding_name):
+ """
+ Verify is a specific encoding is a multi byte one based on it IANA name
+ :param str encoding_name: IANA encoding name
+ :return: True if multi byte
+ :rtype: bool
+ """
+ return issubclass(
+
importlib.import_module('encodings.{encoding_name}'.format(encoding_name=encoding_name)).IncrementalDecoder,
+ MULTI_BYTE_DECODER
+ )
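The new helper simply asks whether the codec's IncrementalDecoder class derives from the C-level multi-byte decoder. A quick, hedged illustration (assuming the 1.3.0 package from this diff is installed):

    from charset_normalizer.encoding import is_multi_byte_encoding

    print(is_multi_byte_encoding('big5'))    # True  -- CJK codecs build on _multibytecodec
    print(is_multi_byte_encoding('cp1252'))  # False -- plain single-byte table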
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/charset_normalizer/hook.py new/charset_normalizer-1.3.0/charset_normalizer/hook.py
--- old/charset_normalizer-1.1.1/charset_normalizer/hook.py	1970-01-01 01:00:00.000000000 +0100
+++ new/charset_normalizer-1.3.0/charset_normalizer/hook.py	2019-09-30 20:01:29.000000000 +0200
@@ -0,0 +1,20 @@
+import sys
+from charset_normalizer.legacy import detect
+
+
+def charset_normalizer_hook(exctype, value, traceback):
+ if exctype == UnicodeDecodeError:
+ cp_detection = detect(value.object)
+ if cp_detection['encoding'] is not None:
+ value.reason = value.reason+'; you may want to consider {} codec
for this sequence.'.format(cp_detection['encoding'])
+
+ sys.__excepthook__(exctype, value, traceback)
+
+
+sys.excepthook = charset_normalizer_hook
+
+try:
+ import unicodedata2
+ sys.modules['unicodedata'] = unicodedata2
+except ImportError:
+ pass
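In other words, importing charset_normalizer now installs an excepthook that annotates uncaught UnicodeDecodeError tracebacks and, when available, swaps in the unicodedata2 backport. A hedged sketch of the effect (the suggested codec in the comment is illustrative):

    import charset_normalizer  # installs charset_normalizer_hook as sys.excepthook

    # If a decode failure like this one escapes to the top level...
    b'\xe9\xe0\xe7'.decode('utf-8')
    # ...the traceback's reason is extended with something like
    # '...; you may want to consider cp1252 codec for this sequence.'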
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/charset_normalizer/normalizer.py new/charset_normalizer-1.3.0/charset_normalizer/normalizer.py
--- old/charset_normalizer-1.1.1/charset_normalizer/normalizer.py	2019-09-23 14:45:48.000000000 +0200
+++ new/charset_normalizer-1.3.0/charset_normalizer/normalizer.py	2019-09-30 20:01:29.000000000 +0200
@@ -13,6 +13,11 @@
from charset_normalizer.probe_chaos import ProbeChaos
from charset_normalizer.probe_coherence import ProbeCoherence, HashableCounter
+from charset_normalizer.encoding import is_multi_byte_encoding
+
+from charset_normalizer.probe_inherent_sign import any_specified_encoding
+
+from loguru import logger
from hashlib import sha256
@@ -62,6 +67,15 @@
"""
return self._submatch
+ @property
+ def has_submatch(self):
+ """
+ Determine if current match has any other match linked to it.
+ :return: True if any sub match available
+ :rtype: bool
+ """
+ return len(self._submatch) > 0
+
@cached_property
def alphabets(self):
"""
@@ -85,6 +99,8 @@
:param CharsetNormalizerMatch other:
:return:
"""
+ if not isinstance(other, CharsetNormalizerMatch):
+ raise TypeError('__eq__ cannot be invoked on {} and
{}.'.format(str(other.__class__), str(self.__class__)))
return self.fingerprint == other.fingerprint and self.encoding ==
other.encoding
@cached_property
@@ -137,6 +153,25 @@
"""
return self._chaos_ratio
+ @property
+ def percent_chaos(self):
+ """
+ Convert chaos ratio to readable percentage with ndigits=3
+ from 0.000 % to 100.000 %
+ :return: float
+ """
+ return round(self._chaos_ratio * 100, ndigits=3)
+
+ @property
+ def percent_coherence(self):
+ """
+ Convert coherence ratio to readable percentage with ndigits=3
+ from 0.000 % to 100.000 %
+ :return: float
+ :rtype: float
+ """
+ return round((1 - self.coherence) * 100, ndigits=3)
+
@cached_property
def chaos_secondary_pass(self):
"""
@@ -286,7 +321,7 @@
return b_
@staticmethod
- def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20):
+ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20,
cp_isolation=None, cp_exclusion=None, preemptive_behaviour=True, explain=False):
"""
Take a sequence of bytes that could potentially be decoded to str and
discard all obvious non supported
charset encoding.
@@ -294,10 +329,16 @@
:param bytes sequences: Actual sequence of bytes to analyse
:param float threshold: Maximum amount of chaos allowed on first pass
:param int chunk_size: Size to extract and analyse in each step
- :param int steps: Number of steps
+ :param int steps: Number of steps/block to extract from sequence
+ :param bool preemptive_behaviour: Determine if we should look into
sequence (ASCII-Mode) for pre-defined encoding
+ :param bool explain: Print on screen what is happening when searching
for a match
+ :param list[str] cp_isolation: Finite list of encoding to use when
searching for a match
+ :param list[str] cp_exclusion: Finite list of encoding to avoid when
searching for a match
:return: List of potential matches
:rtype: CharsetNormalizerMatches
"""
+ if not explain:
+ logger.disable('charset_normalizer')
too_small_sequence = len(sequences) < 24
@@ -308,13 +349,32 @@
# Adjust steps and chunk_size when content is just too small for it
if maximum_length <= (chunk_size * steps):
+ logger.warning(
+ 'override steps and chunk_size as content does not fit
parameters.',
+ chunk_size=chunk_size, steps=steps, seq_len=maximum_length)
steps = 1
-
- if maximum_length <= chunk_size:
chunk_size = maximum_length
- elif steps > 1 and maximum_length / steps < chunk_size:
+
+ if steps > 1 and maximum_length / steps < chunk_size:
chunk_size = int(maximum_length / steps)
+ if cp_isolation is not None and isinstance(cp_isolation, list) is
False:
+ raise TypeError('cp_isolation must be None or list')
+
+ if cp_exclusion is not None and isinstance(cp_exclusion, list) is
False:
+ raise TypeError('cp_exclusion must be None or list')
+
+ if cp_isolation is not None:
+ logger.warning('cp_isolation is set. use this flag for debugging
purpose. '
+ 'limited list of encoding allowed :
{allowed_list}.',
+ allowed_list=', '.join(cp_isolation))
+
+ if cp_exclusion is not None:
+ logger.warning(
+ 'cp_exclusion is set. use this flag for debugging purpose. '
+ 'limited list of encoding excluded : {excluded_list}.',
+ excluded_list=', '.join(cp_exclusion))
+
# Bellow Python 3.6, Expect dict to not behave the same.
py_v = [int(el) for el in python_version_tuple()]
py_need_sort = py_v[0] < 3 or (py_v[0] == 3 and py_v[1] < 6)
@@ -324,10 +384,23 @@
tested = set()
matches = list()
+ specified_encoding = any_specified_encoding(sequences) if
preemptive_behaviour is True else None
+
+ if specified_encoding is not None:
+ warn(
+ 'Trying to detect encoding on a sequence that seems to declare
a encoding ({}).'.format(specified_encoding)
+ )
+
for support in supported:
k, p = support
+ if cp_isolation is not None and p not in cp_isolation:
+ continue
+
+ if cp_exclusion is not None and p in cp_exclusion:
+ continue
+
if p in tested:
continue
@@ -347,24 +420,36 @@
if any(bom_c_list) is True:
bom_available = True
bom_len =
len(BYTE_ORDER_MARK[p][bom_c_list.index(True)])
+ if bom_available is True:
+ logger.info('{encoding} has a SIG or BOM mark on first
{n_byte} byte(s). Adding chaos bonus.', encoding=p, n_byte=bom_len)
str(
sequences if bom_available is False else
sequences[bom_len:],
encoding=p
)
- except UnicodeDecodeError:
+ except UnicodeDecodeError as e:
+ logger.debug('{encoding} does not fit given bytes sequence at
ALL. {explanation}', encoding=p, explanation=str(e))
continue
except LookupError:
continue
+ is_multi_byte_enc = is_multi_byte_encoding(p)
+
+ if is_multi_byte_enc is True:
+ logger.info('{encoding} is a multi byte encoding table. '
+ 'Should not be a coincidence. Adding chaos bonus.',
+ encoding=p)
+ else:
+ logger.debug('{encoding} is a single byte encoding table.',
encoding=p)
+
r_ = range(
0 if bom_available is False else bom_len,
maximum_length,
int(maximum_length / steps)
)
- measures = [ProbeChaos(str(sequences[i:i + chunk_size],
encoding=p, errors='ignore'), giveup_threshold=threshold) for i in r_]
+ measures = [ProbeChaos(str(sequences[i:i + chunk_size],
encoding=p, errors='ignore'), giveup_threshold=threshold,
bonus_bom_sig=bom_available, bonus_multi_byte=is_multi_byte_enc) for i in r_]
ratios = [el.ratio for el in measures]
nb_gave_up = [el.gave_up is True or el.ratio >= threshold for el
in measures].count(True)
@@ -373,8 +458,13 @@
# chaos_min = min(ratios)
# chaos_max = max(ratios)
- if (len(r_) >= 4 and nb_gave_up > len(r_) / 4) or chaos_median >
threshold:
- # print(p, 'is too much chaos for decoded input !',
nb_gave_up, chaos_median)
+ if (len(r_) >= 4 and nb_gave_up > len(r_) / 4) or chaos_means >
threshold:
+ logger.warning('{encoding} was excluded because of initial
chaos probing. '
+ 'Gave up {nb_gave_up} time(s). '
+ 'Computed median chaos is {chaos_median}
%.',
+ encoding=p,
+ nb_gave_up=nb_gave_up,
+ chaos_median=round(chaos_means*100,
ndigits=3))
continue
encountered_unicode_range_occurrences = dict()
@@ -385,8 +475,6 @@
encountered_unicode_range_occurrences[u_name] = 0
encountered_unicode_range_occurrences[u_name] += u_occ
- # print(p, 'U RANGES', encountered_unicode_range_occurrences)
-
cnm = CharsetNormalizerMatch(
sequences if not bom_available else sequences[bom_len:],
p,
@@ -395,55 +483,84 @@
bom_available
)
+ logger.info(
+ '{encoding} passed initial chaos probing. '
+ 'Measured chaos is {chaos_means} % and coherence is
{coherence} %. '
+ 'It seems to be written in {language}.',
+ encoding=p,
+ chaos_means=round(chaos_means*100, ndigits=3),
+ coherence=cnm.percent_coherence,
+ language=cnm.languages
+ )
+
fingerprint_tests = [el.fingerprint == cnm.fingerprint for el in
matches]
if any(fingerprint_tests) is True:
matches[fingerprint_tests.index(True)].submatch.append(cnm)
+ logger.debug('{encoding} is marked as a submatch of
{primary_encoding}.', encoding=cnm.encoding,
primary_encoding=matches[fingerprint_tests.index(True)].encoding)
else:
matches.append(
- CharsetNormalizerMatch(
- sequences if not bom_available else
sequences[bom_len:],
- p,
- chaos_means,
- encountered_unicode_range_occurrences,
- bom_available
- )
+ cnm
)
- # print(p, nb_gave_up, chaos_means, chaos_median,
matches[-1].coherence, matches[-1].languages,)
+ if specified_encoding is not None and p == specified_encoding:
+ logger.info('{encoding} is most likely the one. '
+ 'Because it is specified in analysed byte sequence
and '
+ 'initial test passed successfully. '
+ 'Disable this behaviour by setting
preemptive_behaviour '
+ 'to False', encoding=specified_encoding)
+ return CharsetNormalizerMatches([cnm]) if
any(fingerprint_tests) is False else
CharsetNormalizerMatches([matches[fingerprint_tests.index(True)]])
if (p == 'ascii' and chaos_median == 0.) or bom_available is True:
+ logger.info('{encoding} is most likely the one.
{bom_available}',
+ encoding=p,
+ bom_available='BOM/SIG available' if
bom_available else '')
+
return CharsetNormalizerMatches([matches[-1]])
return CharsetNormalizerMatches(matches)
@staticmethod
- def from_fp(fp, steps=10, chunk_size=512, threshold=0.20):
+ def from_fp(fp, steps=10, chunk_size=512, threshold=0.20,
cp_isolation=None, cp_exclusion=None, preemptive_behaviour=True, explain=False):
"""
:param io.BinaryIO fp:
:param int steps:
:param int chunk_size:
:param float threshold:
- :return:
+ :param bool explain: Print on screen what is happening when searching
for a match
+ :param bool preemptive_behaviour: Determine if we should look into
sequence (ASCII-Mode) for pre-defined encoding
+ :param list[str] cp_isolation: Finite list of encoding to use when
searching for a match
+ :param list[str] cp_exclusion: Finite list of encoding to avoid when
searching for a match
+ :return: List of potential matches
+ :rtype: CharsetNormalizerMatches
"""
return CharsetNormalizerMatches.from_bytes(
bytearray(fp.read()),
steps,
chunk_size,
- threshold
+ threshold,
+ cp_isolation,
+ cp_exclusion,
+ preemptive_behaviour,
+ explain
)
@staticmethod
- def from_path(path, steps=10, chunk_size=512, threshold=0.20):
+ def from_path(path, steps=10, chunk_size=512, threshold=0.20,
cp_isolation=None, cp_exclusion=None, preemptive_behaviour=True, explain=False):
"""
:param str path:
:param int steps:
:param int chunk_size:
:param float threshold:
- :return:
+ :param bool preemptive_behaviour: Determine if we should look into
sequence (ASCII-Mode) for pre-defined encoding
+ :param bool explain: Print on screen what is happening when searching
for a match
+ :param list[str] cp_isolation: Finite list of encoding to use when
searching for a match
+ :param list[str] cp_exclusion: Finite list of encoding to avoid when
searching for a match
+ :return: List of potential matches
+ :rtype: CharsetNormalizerMatches
"""
with open(path, 'rb') as fp:
- return CharsetNormalizerMatches.from_fp(fp, steps, chunk_size,
threshold)
+ return CharsetNormalizerMatches.from_fp(fp, steps, chunk_size,
threshold, cp_isolation, cp_exclusion, preemptive_behaviour, explain)
@cached_property
def could_be_from_charset(self):
@@ -470,42 +587,39 @@
:rtype: CharsetNormalizerMatches | CharsetNormalizerMatch
"""
- lowest_ratio = None
- lowest_ratio_frequency = None
+ if len(self) == 0:
+ logger.error('Trying to call best() on empty
CharsetNormalizerMatches, that is sad.')
+ return CharsetNormalizerMatches(self._matches)
+ elif len(self) == 1:
+ logger.debug('best() is not required because there is only one
match in it.')
+ return CharsetNormalizerMatches(self._matches)
- match_per_ratio = dict()
- match_per_frequency_letter = dict()
+ logger.info('We need to choose between {nb_suitable_match} match.
Order By Chaos Then Coherence.', nb_suitable_match=len(self))
- for match in self._matches:
+ sorted_matches = sorted(self._matches, key=lambda x: x.chaos)
- if match.chaos not in match_per_ratio.keys():
- match_per_ratio[match.chaos] = list()
+ nb_lowest_ratio = [el.chaos <= sorted_matches[0].chaos * 1.2 for el in
sorted_matches[1:]].count(True)
- match_per_ratio[match.chaos].append(match)
+ logger.info('Lowest Chaos found is {lowest_chaos} %. Reduced list to
{nb_suitable_match} match.', lowest_chaos=sorted_matches[0].percent_chaos,
nb_suitable_match=nb_lowest_ratio+1)
- if lowest_ratio is None or lowest_ratio > match.chaos:
- lowest_ratio = match.chaos
+ if nb_lowest_ratio+1 > 1:
+ logger.info('Order By Chaos is not enough, {nb_suitable_match}
remaining. Next, ordering by Coherence.', nb_suitable_match=nb_lowest_ratio+1)
- if lowest_ratio is None:
- return CharsetNormalizerMatches([])
+ sorted_matches_second_pass =
sorted(sorted_matches[:nb_lowest_ratio+1], key=lambda x: x.coherence)
+ nb_lowest_ratio = [el.coherence ==
sorted_matches_second_pass[0].coherence for el in
sorted_matches_second_pass[1:]].count(True)
- all_latin_basic = True
+ logger.info('Highest Coherence found is {lowest_chaos} %. Reduced
list to {nb_suitable_match} match.',
lowest_chaos=sorted_matches_second_pass[0].percent_coherence,
nb_suitable_match=nb_lowest_ratio+1)
- for match in match_per_ratio[lowest_ratio]: # type:
CharsetNormalizerMatch
- secondary_ratio = match.coherence
-
- if lowest_ratio_frequency is None or lowest_ratio_frequency >
secondary_ratio:
- lowest_ratio_frequency = secondary_ratio
-
- if secondary_ratio not in match_per_frequency_letter.keys():
- match_per_frequency_letter[secondary_ratio] = list()
-
- match_per_frequency_letter[secondary_ratio].append(match)
+ return CharsetNormalizerMatches(
+ sorted_matches_second_pass[:nb_lowest_ratio+1]
+ )
- if len(match.alphabets) != 1 or match.alphabets[0] != 'Basic
Latin':
- all_latin_basic = False
+ return CharsetNormalizerMatches(
+ sorted_matches[:nb_lowest_ratio+1]
+ )
- if all_latin_basic is True:
- return
CharsetNormalizerMatches(match_per_frequency_letter[lowest_ratio_frequency]).first()
- return
CharsetNormalizerMatches(match_per_frequency_letter[lowest_ratio_frequency]) if
len(match_per_frequency_letter[lowest_ratio_frequency]) > 1 else
CharsetNormalizerMatches(match_per_frequency_letter[lowest_ratio_frequency]).first()
+# Some aliases to CharsetNormalizerMatches, because it is too long for a class
name.
+CharsetDetector = CharsetNormalizerMatches
+EncodingDetector = CharsetNormalizerMatches
+CharsetDoctor = CharsetNormalizerMatches
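The trailing aliases are plain name bindings, so the shorter chardet-style spelling used in the docs works unchanged. A small sketch (the sample path is the one used by the project's test data):

    from charset_normalizer import CharsetDetector, EncodingDetector, CharsetDoctor

    # All three names refer to the same CharsetNormalizerMatches class.
    assert CharsetDetector is EncodingDetector is CharsetDoctor

    print(CharsetDetector.from_path('./data/sample.1.ar.srt').best().first())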
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/charset_normalizer/probe_chaos.py new/charset_normalizer-1.3.0/charset_normalizer/probe_chaos.py
--- old/charset_normalizer-1.1.1/charset_normalizer/probe_chaos.py	2019-09-23 14:45:48.000000000 +0200
+++ new/charset_normalizer-1.3.0/charset_normalizer/probe_chaos.py	2019-09-30 20:01:29.000000000 +0200
@@ -14,10 +14,12 @@
@lru_cache(maxsize=8192)
class ProbeChaos:
- def __init__(self, string, giveup_threshold=0.09):
+ def __init__(self, string, giveup_threshold=0.09, bonus_bom_sig=False,
bonus_multi_byte=False):
"""
:param str string:
:param float giveup_threshold: When to give up even if _probe has not
finished yet
+ :param bool bonus_bom_sig: Decide if ratio should take in
consideration a bonus because of BOM/SIG
+ :param bool bonus_multi_byte: Decide if ratio should take in
consideration a bonus because of multi byte scheme decoder
"""
if not isinstance(string, str):
@@ -26,6 +28,9 @@
self._string = string
self._threshold = giveup_threshold
+ self._bonus_bom_sig = bonus_bom_sig
+ self._bonus_multi_byte = bonus_multi_byte
+
self.successive_upper_lower = 0
self.successive_accent = 0
self.successive_different_unicode_range = 0
@@ -46,14 +51,18 @@
self.total_letter_encountered = 0
self.total_lower_letter_encountered = 0
+ self.total_upper_letter_encountered = 0
+
self.total_upper_accent_encountered = 0
self.total_upper_accent_encountered_inner = 0
+
self.total_unaccented_letter_encountered = 0
self._probe_word = ProbeWords(HashableCounter(self._string.split()))
self.gave_up = False
+ # Artificially increase string size to get more significant result.
if 32 > len(self._string) > 0:
self._string *= int(32 / len(self._string)) + 1
@@ -165,6 +174,9 @@
if is_lower:
self.total_lower_letter_encountered += 1
+ if is_upper:
+ self.total_upper_letter_encountered += 1
+
if is_upper and is_accent:
self.total_upper_accent_encountered += 1
if self.previous_printable_letter.isalpha():
@@ -237,7 +249,14 @@
:return: Ratio as floating number
:rtype: float
"""
- r_ = self.total_upper_accent_encountered if
self.total_letter_encountered > 0 and self.total_unaccented_letter_encountered
/ self.total_letter_encountered < 0.5 else 0
+
+ r_ = self.total_upper_accent_encountered if
self.total_unaccented_letter_encountered / self.total_letter_encountered < 0.5
else 0
+ q_ = self.total_upper_letter_encountered / 3 if
self.total_upper_letter_encountered > self.total_lower_letter_encountered * 0.4
else 0
z_ = UnicodeRangeIdentify.unravel_suspicious_ranges(len(self._string),
self.encountered_unicode_range_occurrences)
p_ = self.encountered_punc_sign if self.encountered_punc_sign /
len(self._string) > 0.2 else 0
- return ((r_ + p_ + self.successive_upper_lower +
self.successive_accent + self.successive_different_unicode_range +
self.not_encountered_white_space + self.unprintable + z_ +
ProbeChaos._unravel_cjk_suspicious_chinese.__func__(self._string,
self.encountered_unicode_range_occurrences)) / len(self._string)) +
self._probe_word.ratio # + len(self.encountered_unicode_range)-1
+
+ bonus_sig_bom = -int(len(self._string)*0.5) if self._bonus_bom_sig is
True else 0
+
+ initial_ratio = ((r_ + p_ + q_ + self.successive_upper_lower +
self.successive_accent + self.successive_different_unicode_range +
self.not_encountered_white_space + self.unprintable + z_ + bonus_sig_bom +
ProbeChaos._unravel_cjk_suspicious_chinese.__func__(self._string,
self.encountered_unicode_range_occurrences)) / len(self._string)) +
self._probe_word.ratio # + len(self.encountered_unicode_range)-1
+
+ return initial_ratio / 1.3 if self._bonus_multi_byte is True and
initial_ratio > 0. else initial_ratio
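A back-of-the-envelope reading of the new bonuses (numbers are made up): on a 100-character chunk the BOM/SIG bonus subtracts int(100 * 0.5) = 50 from the penalty sum before it is divided by the chunk length, and a multi-byte codec additionally divides any positive ratio by 1.3:

    # Illustrative arithmetic only -- not code from the commit.
    penalties, length = 13, 100
    raw = penalties / length                          # 0.13 without any bonus
    print(round(raw / 1.3, 3))                        # 0.1   -- multi-byte decoder bonus
    print((penalties - int(length * 0.5)) / length)   # -0.37 -- BOM/SIG bonus can push it negative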
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/charset_normalizer/probe_coherence.py new/charset_normalizer-1.3.0/charset_normalizer/probe_coherence.py
--- old/charset_normalizer-1.1.1/charset_normalizer/probe_coherence.py	2019-09-23 14:45:48.000000000 +0200
+++ new/charset_normalizer-1.3.0/charset_normalizer/probe_coherence.py	2019-09-30 20:01:29.000000000 +0200
@@ -1,5 +1,6 @@
# coding: utf-8
import json
+import statistics
from collections import Counter
from functools import lru_cache
from os.path import dirname, realpath, exists
@@ -85,7 +86,7 @@
ratios = [self.rank_per_lang[lg] for lg in languages]
- return sum(ratios) / 2 if self.non_latin_covered_any is True else
sum(ratios)
+ return statistics.mean(ratios) / 2 if self.non_latin_covered_any is
True else statistics.mean(ratios)
@property
def coverage(self):
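Switching from sum() to statistics.mean() keeps the coherence ratio on the same scale no matter how many languages matched. A tiny illustration with hypothetical rank_per_lang values:

    import statistics

    ranks = [0.12, 0.18, 0.30]               # hypothetical per-language ranks
    print(round(sum(ranks), 2))              # 0.6 -- grows with the number of languages
    print(round(statistics.mean(ranks), 2))  # 0.2 -- stays comparable across candidates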
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/charset_normalizer/probe_inherent_sign.py new/charset_normalizer-1.3.0/charset_normalizer/probe_inherent_sign.py
--- old/charset_normalizer-1.1.1/charset_normalizer/probe_inherent_sign.py	1970-01-01 01:00:00.000000000 +0100
+++ new/charset_normalizer-1.3.0/charset_normalizer/probe_inherent_sign.py	2019-09-30 20:01:29.000000000 +0200
@@ -0,0 +1,39 @@
+from re import findall, compile, IGNORECASE
+from encodings.aliases import aliases
+
+RE_POSSIBLE_ENCODING_INDICATION = compile(
+ r'(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:=
]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)',
+ IGNORECASE
+)
+
+
+def any_specified_encoding(sequence):
+ """
+ Search in sequence (ASCII-mode) if there is any sign of declared encoding.
+ :param bytes sequence:
+ :return: Declared encoding if any else None
+ :rtype: str
+ """
+ if not isinstance(sequence, bytes) and not isinstance(sequence, bytearray):
+ raise TypeError
+
+ seq_len = len(sequence)
+
+ results = findall(
+ RE_POSSIBLE_ENCODING_INDICATION,
+ sequence[:seq_len if seq_len <= 2048 else
int(seq_len*0.3)].decode('ascii', errors='ignore')
+ ) # type: list[str]
+
+ if len(results) == 0:
+ return None
+
+ for specified_encoding in results:
+ specified_encoding = specified_encoding.lower().replace('-', '_')
+
+ for a, b in aliases.items():
+ if a == specified_encoding:
+ return b
+ if b == specified_encoding:
+ return b
+
+ return None
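A quick sketch of what the preemptive probe picks up (the XML snippet is made up for the example):

    from charset_normalizer.probe_inherent_sign import any_specified_encoding

    payload = b'<?xml version="1.0" encoding="ISO-8859-1"?><greeting>bonjour</greeting>'
    print(any_specified_encoding(payload))  # should print 'latin_1', resolved via encodings.aliases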
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/charset_normalizer/unicode.py new/charset_normalizer-1.3.0/charset_normalizer/unicode.py
--- old/charset_normalizer-1.1.1/charset_normalizer/unicode.py	2019-09-23 14:45:48.000000000 +0200
+++ new/charset_normalizer-1.3.0/charset_normalizer/unicode.py	2019-09-30 20:01:29.000000000 +0200
@@ -92,6 +92,8 @@
items = encountered_unicode_range_occurrences.items()
s_ = 0
+ # print(encountered_unicode_range_occurrences)
+
for k, v in items:
k_ = k.lower()
if (
@@ -101,7 +103,10 @@
continue
if 'halfwidth and fullwidth forms' in k_ and any(['CJK' in
el for el in encountered_unicode_range_occurrences.keys()]):
continue
- s_ += v if 'geometric shapes' not in k_ else v * 10
+ if 'hiragana' in k_ or 'katakana' in k_:
+ continue
+ # print('suspicious', k_, 'with', v)
+ s_ += v
return s_
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/docs/Makefile new/charset_normalizer-1.3.0/docs/Makefile
--- old/charset_normalizer-1.1.1/docs/Makefile	1970-01-01 01:00:00.000000000 +0100
+++ new/charset_normalizer-1.3.0/docs/Makefile	2019-09-30 20:01:29.000000000 +0200
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS =
+SPHINXBUILD = python -msphinx
+SPHINXPROJ = Charset Normalizer
+SOURCEDIR = .
+BUILDDIR = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/docs/advanced_search.rst new/charset_normalizer-1.3.0/docs/advanced_search.rst
--- old/charset_normalizer-1.1.1/docs/advanced_search.rst	1970-01-01 01:00:00.000000000 +0100
+++ new/charset_normalizer-1.3.0/docs/advanced_search.rst	2019-09-30 20:01:29.000000000 +0200
@@ -0,0 +1,20 @@
+Advanced Search
+===============
+
+Charset Normalizer method ``from_bytes``, ``from_fp`` and ``from_path``
provide some
+optional parameters that can be tweaked.
+
+As follow ::
+
+ CharsetDetector.from_bytes(
+ my_byte_str,
+ steps=10, # Number of steps/block to extract from my_byte_str
+ chunk_size=512, # Set block size of each extraction
+ threshold=0.2, # Maximum amount of chaos allowed on first pass
+ cp_isolation=None, # Finite list of encoding to use when searching
for a match
+ cp_exclusion=None, # Finite list of encoding to avoid when searching
for a match
+ preemptive_behaviour=True, # Determine if we should look into
my_byte_str (ASCII-Mode) for pre-defined encoding
+ explain=False # Print on screen what is happening when searching for
a match
+ )
+
+!! Warning !! Work in Progress Documentation !!
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/docs/conf.py new/charset_normalizer-1.3.0/docs/conf.py
--- old/charset_normalizer-1.1.1/docs/conf.py	1970-01-01 01:00:00.000000000 +0100
+++ new/charset_normalizer-1.3.0/docs/conf.py	2019-09-30 20:01:29.000000000 +0200
@@ -0,0 +1,163 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# charset-normalizer documentation build configuration file, created by
+# sphinx-quickstart on Fri Jun 16 04:30:35 2017.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+from recommonmark.parser import CommonMarkParser
+import sphinx_rtd_theme
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = []
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+# source_suffix = ['.rst', '.md']
+# source_suffix = '.rst'
+
+source_parsers = {
+ '.md': CommonMarkParser,
+}
+
+source_suffix = ['.rst', '.md']
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = 'charset_normalizer'
+copyright = '2019, Ahmed TAHRI'
+author = 'Ahmed TAHRI @Ousret'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '1.1'
+# The full version, including alpha/beta/rc tags.
+release = '1.1.1'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This patterns also effect to html_static_path and html_extra_path
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'sphinx_rtd_theme'
+
+html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+
+# -- Options for HTMLHelp output ------------------------------------------
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'charset-normalizer-doc'
+
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+ # The paper size ('letterpaper' or 'a4paper').
+ #
+ # 'papersize': 'letterpaper',
+
+ # The font size ('10pt', '11pt' or '12pt').
+ #
+ # 'pointsize': '10pt',
+
+ # Additional stuff for the LaTeX preamble.
+ #
+ # 'preamble': '',
+
+ # Latex figure (float) alignment
+ #
+ # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+# author, documentclass [howto, manual, or own class]).
+latex_documents = [
+ (master_doc, 'charset-normalizer.tex', 'Charset Normalizer Documentation',
+ 'Ahmed TAHRI', 'manual'),
+]
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+ (master_doc, 'charset-normalizer', 'Charset Normalizer Documentation',
+ [author], 1)
+]
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+# dir menu entry, description, category)
+texinfo_documents = [
+ (master_doc, 'Charset Normalizer', 'Charsert Normalizer Documentation',
+ author, 'charset-normalizer', '🎁 Maintained library on encoding &
language detection. 🚀No Cpp Bindings, Using Voodoo and Magical Artifacts. 🔎
Like Chardet',
+ 'Miscellaneous'),
+]
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/docs/getstarted.rst new/charset_normalizer-1.3.0/docs/getstarted.rst
--- old/charset_normalizer-1.1.1/docs/getstarted.rst	1970-01-01 01:00:00.000000000 +0100
+++ new/charset_normalizer-1.3.0/docs/getstarted.rst	2019-09-30 20:01:29.000000000 +0200
@@ -0,0 +1,70 @@
+Installation
+============
+
+This installs a package that can be used from Python (``import
charset_normalizer``).
+
+To install for all users on the system, administrator rights (root)
+may be required.
+
+From PyPI
+---------
+Charset Normalizer can be installed from PyPI::
+
+ pip install charset-normalizer
+
+You may enable extra feature Unicode Data v12 backport as follow::
+
+ pip install charset-normalizer[UnicodeDataBackport]
+
+From git via dev-master
+-----------------------
+You can install from dev-master branch using git::
+
+ git clone https://github.com/Ousret/charset_normalizer.git
+ cd charset_normalizer/
+ python setup.py install
+
+Basic Usage
+===========
+
+The new way
+-----------
+
+You may want to get right to it. ::
+
+ from charset_normalizer import CharsetDetector
+
+ # This is going to print out your sequence once encoding has been detected
+ print(
+ CharsetDetector.from_bytes(
+ my_byte_str
+ ).best().first()
+ )
+
+ # You could also want the same from a file
+ print(
+ CharsetDetector.from_path(
+ './data/sample.1.ar.srt'
+ ).best().first()
+ )
+
+
+Backward compatibility
+----------------------
+
+If you were used to python chardet, we are providing the very same
``detect()`` method as chardet.
+
+ ::
+
+ from charset_normalizer import detect
+
+ # This will behave exactly the same as python chardet
+ result = detect(my_byte_str)
+
+ if result['encoding'] is not None:
+ print('got', result['encoding'], 'as detected encoding')
+
+
+You may upgrade your code with ease.
+CTRL + R ``from chardet import detect`` to ``from charset_normalizer import
detect``.
+
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/docs/handling_result.rst new/charset_normalizer-1.3.0/docs/handling_result.rst
--- old/charset_normalizer-1.1.1/docs/handling_result.rst	1970-01-01 01:00:00.000000000 +0100
+++ new/charset_normalizer-1.3.0/docs/handling_result.rst	2019-09-30 20:01:29.000000000 +0200
@@ -0,0 +1,5 @@
+================
+ Handling Result
+================
+
+!! Warning !! Work in Progress Documentation !!
\ No newline at end of file
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/docs/index.rst new/charset_normalizer-1.3.0/docs/index.rst
--- old/charset_normalizer-1.1.1/docs/index.rst	1970-01-01 01:00:00.000000000 +0100
+++ new/charset_normalizer-1.3.0/docs/index.rst	2019-09-30 20:01:29.000000000 +0200
@@ -0,0 +1,48 @@
+===================
+ Charset Normalizer
+===================
+
+Overview
+========
+
+Library that help you read text from unknown charset encoding.
+Project motivated by chardet, I'm trying to resolve the issue by taking
another approach.
+All IANA character set names for which the Python core library provides codecs
are supported.
+
+.. image::
https://repository-images.githubusercontent.com/200259335/d3da9600-dedc-11e9-83e8-081f597505df
+ :width: 500px
+ :scale: 100 %
+ :alt: CLI Charset Normalizer
+ :align: right
+
+
+It is released under MIT license, see LICENSE for more
+details. Be aware that no warranty of any kind is provided with this package.
+
+Copyright (C) 2019 Ahmed TAHRI @Ousret <ahmed(dot)tahri(at)cloudnursery.dev>
+
+!! Warning !! Work in Progress Documentation !!
+
+Features
+========
+
+- Encoding detection on a buffer, bytes or file.
+- Transpose any encoded content to Unicode the best we can.
+- Detect spoken language in text.
+
+Contents:
+
+.. toctree::
+ :maxdepth: 2
+
+ support
+ getstarted
+ advanced_search
+ handling_result
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
\ No newline at end of file
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/docs/requirements.txt new/charset_normalizer-1.3.0/docs/requirements.txt
--- old/charset_normalizer-1.1.1/docs/requirements.txt	1970-01-01 01:00:00.000000000 +0100
+++ new/charset_normalizer-1.3.0/docs/requirements.txt	2019-09-30 20:01:29.000000000 +0200
@@ -0,0 +1 @@
+sphinx_rtd_theme
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/docs/support.rst new/charset_normalizer-1.3.0/docs/support.rst
--- old/charset_normalizer-1.1.1/docs/support.rst	1970-01-01 01:00:00.000000000 +0100
+++ new/charset_normalizer-1.3.0/docs/support.rst	2019-09-30 20:01:29.000000000 +0200
@@ -0,0 +1,167 @@
+=================
+ Support
+=================
+
+!! Warning !! Work in Progress Documentation !!
+
+-------
+Supported Encodings
+-------
+
+Charset Normalizer is able to detect any of those encoding.
+
++-----------------+----------------------------------------------------------------------------------------------------------------------------------+
+| IANA Code Page |
Aliases |
++=================+==================================================================================================================================+
+| ascii | 646, ansi_x3.4_1968, ansi_x3_4_1968, ansi_x3.4_1986,
cp367, csascii, ibm367, iso646_us, iso_646.irv_1991, iso_ir_6, us, us_ascii |
+| big5 | big5_tw,
csbig5, x_mac_trad_chinese |
+| big5hkscs |
big5_hkscs, hkscs |
+| cp037 | 037, csibm037, ebcdic_cp_ca,
ebcdic_cp_nl, ebcdic_cp_us, ebcdic_cp_wt, ibm037, ibm039 |
+| cp1026 | 1026,
csibm1026, ibm1026 |
+| cp1125 | 1125,
ibm1125, cp866u, ruscii |
+| cp1140 |
1140, ibm1140 |
+| cp1250 |
1250, windows_1250 |
+| cp1251 |
1251, windows_1251 |
+| cp1252 |
1252, windows_1252 |
+| cp1253 |
1253, windows_1253 |
+| cp1254 |
1254, windows_1254 |
+| cp1255 |
1255, windows_1255 |
+| cp1256 |
1256, windows_1256 |
+| cp1257 |
1257, windows_1257 |
+| cp1258 |
1258, windows_1258 |
+| cp273 | 273,
ibm273, csibm273 |
+| cp424 | 424,
csibm424, ebcdic_cp_he, ibm424 |
+| cp437 | 437,
cspc8codepage437, ibm437 |
+| cp500 | 500, csibm500,
ebcdic_cp_be, ebcdic_cp_ch, ibm500 |
+| cp775 | 775,
cspc775baltic, ibm775 |
+| cp850 | 850,
cspc850multilingual, ibm850 |
+| cp852 | 852,
cspcp852, ibm852 |
+| cp855 | 855,
csibm855, ibm855 |
+| cp857 | 857,
csibm857, ibm857 |
+| cp858 | 858,
csibm858, ibm858 |
+| cp860 | 860,
csibm860, ibm860 |
+| cp861 | 861,
cp_is, csibm861, ibm861 |
+| cp862 | 862,
cspc862latinhebrew, ibm862 |
+| cp863 | 863,
csibm863, ibm863 |
+| cp864 | 864,
csibm864, ibm864 |
+| cp865 | 865,
csibm865, ibm865 |
+| cp866 | 866,
csibm866, ibm866 |
+| cp869 | 869,
cp_gr, csibm869, ibm869 |
+| cp932 | 932,
ms932, mskanji, ms_kanji |
+| cp949 |
949, ms949, uhc |
+| cp950 |
950, ms950 |
+| euc_jis_2004 | jisx0213,
eucjis2004, euc_jis2004 |
+| euc_jisx0213 |
eucjisx0213 |
+| euc_jp |
eucjp, ujis, u_jis |
+| euc_kr | euckr, korean, ksc5601, ks_c_5601,
ks_c_5601_1987, ksx1001, ks_x_1001, x_mac_korean |
+| gb18030 |
gb18030_2000 |
+| gb2312 | chinese, csiso58gb231280, euc_cn, euccn,
eucgb2312_cn, gb2312_1980, gb2312_80, iso_ir_58, x_mac_simp_chinese |
+| gbk |
936, cp936, ms936 |
+| hp_roman8 |
roman8, r8, csHPRoman8 |
+| hz | hzgb,
hz_gb, hz_gb_2312 |
+| iso2022_jp | csiso2022jp,
iso2022jp, iso_2022_jp |
+| iso2022_jp_1 |
iso2022jp_1, iso_2022_jp_1 |
+| iso2022_jp_2 |
iso2022jp_2, iso_2022_jp_2 |
+| iso2022_jp_2004 |
iso_2022_jp_2004, iso2022jp_2004
|
+| iso2022_jp_3 |
iso2022jp_3, iso_2022_jp_3 |
+| iso2022_jp_ext |
iso2022jp_ext, iso_2022_jp_ext
|
+| iso2022_kr | csiso2022kr,
iso2022kr, iso_2022_kr |
+| iso8859_10 | csisolatin6, iso_8859_10,
iso_8859_10_1992, iso_ir_157, l6, latin6 |
+| iso8859_11 | thai,
iso_8859_11, iso_8859_11_2001 |
+| iso8859_13 |
iso_8859_13, l7, latin7 |
+| iso8859_14 | iso_8859_14,
iso_8859_14_1998, iso_celtic, iso_ir_199, l8, latin8
|
+| iso8859_15 |
iso_8859_15, l9, latin9 |
+| iso8859_16 | iso_8859_16,
iso_8859_16_2001, iso_ir_226, l10, latin10
|
+| iso8859_2 | csisolatin2, iso_8859_2,
iso_8859_2_1987, iso_ir_101, l2, latin2 |
+| iso8859_3 | csisolatin3, iso_8859_3,
iso_8859_3_1988, iso_ir_109, l3, latin3 |
+| iso8859_4 | csisolatin4, iso_8859_4,
iso_8859_4_1988, iso_ir_110, l4, latin4 |
+| iso8859_5 | csisolatincyrillic, cyrillic,
iso_8859_5, iso_8859_5_1988, iso_ir_144 |
+| iso8859_6 | arabic, asmo_708, csisolatinarabic,
ecma_114, iso_8859_6, iso_8859_6_1987, iso_ir_127 |
+| iso8859_7 | csisolatingreek, ecma_118, elot_928,
greek, greek8, iso_8859_7, iso_8859_7_1987, iso_ir_126 |
+| iso8859_8 | csisolatinhebrew, hebrew,
iso_8859_8, iso_8859_8_1988, iso_ir_138 |
+| iso8859_9 | csisolatin5, iso_8859_9,
iso_8859_9_1989, iso_ir_148, l5, latin5 |
+| johab |
cp1361, ms1361 |
+| koi8_r |
cskoi8r |
+| kz1048 | kz_1048,
rk1048, strk1048_2002 |
+| latin_1 | 8859, cp819, csisolatin1, ibm819, iso8859,
iso8859_1, iso_8859_1, iso_8859_1_1987, iso_ir_100, l1, latin, latin1 |
+| mac_cyrillic |
maccyrillic |
+| mac_greek |
macgreek |
+| mac_iceland |
maciceland |
+| mac_latin2 |
maccentraleurope, maclatin2 |
+| mac_roman |
macintosh, macroman |
+| mac_turkish |
macturkish |
+| mbcs |
ansi, dbcs |
+| ptcp154 | csptcp154,
pt154, cp154, cyrillic_asian |
+| quopri_codec | quopri,
quoted_printable, quotedprintable |
+| rot_13 |
rot13 |
+| shift_jis | csshiftjis,
shiftjis, sjis, s_jis, x_mac_japanese |
+| shift_jis_2004 |
shiftjis2004, sjis_2004, s_jis_2004
|
+| shift_jisx0213 |
shiftjisx0213, sjisx0213, s_jisx0213
|
+| tactis |
tis260 |
+| tis_620 | tis620, tis_620_0,
tis_620_2529_0, tis_620_2529_1, iso_ir_166 |
+| utf_16 |
u16, utf16 |
+| utf_16_be |
unicodebigunmarked, utf_16be |
+| utf_16_le |
unicodelittleunmarked, utf_16le
|
+| utf_32 |
u32, utf32 |
+| utf_32_be |
utf_32be |
+| utf_32_le |
utf_32le |
+| utf_7 | u7,
utf7, unicode_1_1_utf_7 |
+| utf_8 | u8, utf,
utf8, utf8_ucs2, utf8_ucs4 |
++-----------------+----------------------------------------------------------------------------------------------------------------------------------+
+
+-------
+Supported Languages
+-------
+
+Those language can be detected inside your content. All of these are specified
in ./charset_normalizer/assets/frequencies.json .
+
+['English',
+ 'German',
+ 'French',
+ 'Dutch',
+ 'Italian',
+ 'Polish',
+ 'Spanish',
+ 'Russian',
+ 'Japanese',
+ 'Portuguese',
+ 'Swedish',
+ 'Chinese',
+ 'Catalan',
+ 'Ukrainian',
+ 'Norwegian',
+ 'Finnish',
+ 'Vietnamese',
+ 'Czech',
+ 'Hungarian',
+ 'Korean',
+ 'Indonesian',
+ 'Turkish',
+ 'Romanian',
+ 'Farsi',
+ 'Arabic',
+ 'Danish',
+ 'Esperanto',
+ 'Serbian',
+ 'Lithuanian',
+ 'Slovene',
+ 'Slovak',
+ 'Malay',
+ 'Hebrew',
+ 'Bulgarian',
+ 'Kazakh',
+ 'Baque',
+ 'Volapük',
+ 'Croatian',
+ 'Hindi',
+ 'Estonian',
+ 'Azeri',
+ 'Galician',
+ 'Simple English',
+ 'Nynorsk',
+ 'Thai',
+ 'Greek',
+ 'Macedonian',
+ 'Serbocroatian',
+ 'Tamil',
+ 'Classical Chinese']
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/setup.py new/charset_normalizer-1.3.0/setup.py
--- old/charset_normalizer-1.1.1/setup.py	2019-09-23 14:45:48.000000000 +0200
+++ new/charset_normalizer-1.3.0/setup.py	2019-09-30 20:01:29.000000000 +0200
@@ -13,17 +13,19 @@
EMAIL = '[email protected]'
AUTHOR = 'Ahmed TAHRI @Ousret'
REQUIRES_PYTHON = '>=3.5.0'
-VERSION = '1.1.1'
+VERSION = '1.3.0'
REQUIRED = [
'cached_property',
'dragonmapper',
'zhon',
- 'prettytable'
+ 'prettytable',
+ 'loguru'
]
EXTRAS = {
- 'permit to generate frequencies.json': ['requests_html', 'requests'],
+ 'LetterFrequency': ['requests_html', 'requests'],
+ 'UnicodeDataBackport': ['unicodedata2']
}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/test/test_cli.py new/charset_normalizer-1.3.0/test/test_cli.py
--- old/charset_normalizer-1.1.1/test/test_cli.py	2019-09-23 14:45:48.000000000 +0200
+++ new/charset_normalizer-1.3.0/test/test_cli.py	2019-09-30 20:01:29.000000000 +0200
@@ -1,6 +1,8 @@
import unittest
from charset_normalizer.cli.normalizer import cli_detect, query_yes_no
from unittest.mock import patch
+from os.path import exists
+from os import remove
class TestCommandLineInterface(unittest.TestCase):
@@ -26,6 +28,23 @@
)
)
+ def test_single_file_normalize(self):
+ self.assertEqual(
+ 0,
+ cli_detect(
+ ['./data/sample.1.ar.srt', '--normalize']
+ )
+ )
+
+ self.assertTrue(
+ exists('./data/sample.1.ar.cp1256.srt')
+ )
+
+ try:
+ remove('./data/sample.1.ar.cp1256.srt')
+ except:
+ pass
+
def test_single_verbose_file(self):
self.assertEqual(
0,
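The new test above drives the CLI entry point directly. A hedged sketch of the same call outside the test suite (paths mirror the project's test data):

    from charset_normalizer.cli.normalizer import cli_detect

    # Mirrors the new test: detect './data/sample.1.ar.srt' and, because of --normalize,
    # write a re-encoded copy next to it (the test expects e.g. './data/sample.1.ar.cp1256.srt').
    exit_code = cli_detect(['./data/sample.1.ar.srt', '--normalize'])
    print(exit_code)  # 0 on success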