Hello community,
here is the log from the commit of package python-charset-normalizer for
openSUSE:Factory checked in at 2019-10-16 09:12:25
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/python-charset-normalizer (Old)
and /work/SRC/openSUSE:Factory/.python-charset-normalizer.new.2352 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-charset-normalizer"
Wed Oct 16 09:12:25 2019 rev:3 rq:734952 version:1.3.0
Changes:
--------
--- /work/SRC/openSUSE:Factory/python-charset-normalizer/python-charset-normalizer.changes	2019-09-27 14:51:55.192223149 +0200
+++ /work/SRC/openSUSE:Factory/.python-charset-normalizer.new.2352/python-charset-normalizer.changes	2019-10-16 09:12:29.231900258 +0200
@@ -1,0 +2,14 @@
+Fri Oct 4 08:52:51 UTC 2019 - Marketa Calabkova <[email protected]>
+
+- Update to 1.3.0
+ * Backport unicodedata for v12 impl into python if available
+ * Add aliases to CharsetNormalizerMatches class
+ * Add feature preemptive behaviour, looking for encoding declaration
+ * Add method to determine if specific encoding is multi byte
+ * Add has_submatch property on a match
+ * Add percent_chaos and percent_coherence
+ * Coherence ratio based on mean instead of sum of best results
+ * Using loguru for trace/debug <3
+ * from_bytes method improved
+
+-------------------------------------------------------------------
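For orientation, here is a minimal sketch of how the 1.3.0 additions listed above fit together. It is illustrative only and not part of the check-in; the CharsetDetector alias, the preemptive_behaviour/explain flags and the percent_chaos/percent_coherence/has_submatch properties are taken from the upstream diff further down:

    from charset_normalizer import CharsetDetector  # alias of CharsetNormalizerMatches

    payload = '<?xml version="1.0" encoding="utf-8"?><root>café</root>'.encode('utf-8')

    matches = CharsetDetector.from_bytes(
        payload,
        preemptive_behaviour=True,  # honour an encoding declared inside the payload
        explain=False,              # True would emit the new loguru trace output
    )

    best_guess = matches.best().first()
    print(best_guess.encoding, best_guess.percent_chaos,
          best_guess.percent_coherence, best_guess.has_submatch)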
Old:
----
charset_normalizer-1.1.1.tar.gz
New:
----
charset_normalizer-1.3.0.tar.gz
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Other differences:
------------------
++++++ python-charset-normalizer.spec ++++++
--- /var/tmp/diff_new_pack.y4kTmJ/_old 2019-10-16 09:12:29.787898825 +0200
+++ /var/tmp/diff_new_pack.y4kTmJ/_new 2019-10-16 09:12:29.791898815 +0200
@@ -20,11 +20,10 @@
# https://github.com/Ousret/charset_normalizer/issues/1
%define skip_python2 1
Name: python-charset-normalizer
-Version: 1.1.1
+Version: 1.3.0
Release: 0
Summary: Python Universal Charset detector
License: MIT
-Group: Development/Languages/Python
URL: https://github.com/ousret/charset_normalizer
Source:         https://github.com/Ousret/charset_normalizer/archive/%{version}.tar.gz#/charset_normalizer-%{version}.tar.gz
BuildRequires: %{python_module setuptools}
@@ -34,6 +33,7 @@
Requires: python-PrettyTable
Requires: python-cached-property
Requires: python-dragonmapper
+Requires: python-loguru
Requires: python-zhon
Suggests: python-requests-html
BuildArch: noarch
@@ -41,6 +41,7 @@
BuildRequires: %{python_module PrettyTable}
BuildRequires: %{python_module cached-property}
BuildRequires: %{python_module dragonmapper}
+BuildRequires: %{python_module loguru}
BuildRequires: %{python_module pytest-runner}
BuildRequires: %{python_module zhon}
# /SECTION
++++++ charset_normalizer-1.1.1.tar.gz -> charset_normalizer-1.3.0.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/README.md new/charset_normalizer-1.3.0/README.md
--- old/charset_normalizer-1.1.1/README.md	2019-09-23 14:45:48.000000000 +0200
+++ new/charset_normalizer-1.3.0/README.md	2019-09-30 20:01:29.000000000 +0200
@@ -6,7 +6,9 @@
<img
src="https://travis-ci.org/Ousret/charset_normalizer.svg?branch=master"/>
</a>
<img
src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue"
/>
- <img src="https://img.shields.io/pypi/dm/charset_normalizer.svg"/>
+ <a href="https://pepy.tech/project/charset-normalizer/">
+ <img alt="Download Count /Month"
src="https://pepy.tech/badge/charset-normalizer/month"/>
+ </a>
<a href="https://github.com/ousret/charset_normalizer/blob/master/LICENSE">
<img alt="License: MIT"
src="https://img.shields.io/badge/license-MIT-purple.svg" target="_blank" />
</a>
@@ -16,6 +18,7 @@
<a href="https://codecov.io/gh/Ousret/charset_normalizer">
<img
src="https://codecov.io/gh/Ousret/charset_normalizer/branch/master/graph/badge.svg"
/>
</a>
+ <img alt="Download Count Total"
src="https://pepy.tech/badge/charset-normalizer" />
</p>
> Library that help you read text from unknown charset encoding.<br /> Project
> motivated by `chardet`,
@@ -103,9 +106,8 @@
## 😇 Why
-When I started using Chardet, I noticed that this library was wrong most of the time
-when it's not about Unicode, Gb or Big5. That because some charset are easily identifiable
-because of there standards and Chardet does a really good job at identifying them.
+When I started using Chardet, I noticed that this library was unreliable nowadays and also
+it's unmaintained, and most likely will never be.
I **don't care** about the **originating charset** encoding, that because
**two different table** can
produce **two identical file.**
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/charset_normalizer/__init__.py new/charset_normalizer-1.3.0/charset_normalizer/__init__.py
--- old/charset_normalizer-1.1.1/charset_normalizer/__init__.py	2019-09-23 14:45:48.000000000 +0200
+++ new/charset_normalizer-1.3.0/charset_normalizer/__init__.py	2019-09-30 20:01:29.000000000 +0200
@@ -1,7 +1,9 @@
# coding: utf-8
-from charset_normalizer.normalizer import CharsetNormalizerMatches,
CharsetNormalizerMatch
+from charset_normalizer.normalizer import CharsetNormalizerMatches,
CharsetNormalizerMatch, \
+ CharsetDetector, CharsetDoctor, EncodingDetector # Aliases
from charset_normalizer.unicode import UnicodeRangeIdentify
from charset_normalizer.probe_chaos import ProbeChaos
from charset_normalizer.probe_coherence import ProbeCoherence
from charset_normalizer.probe_words import ProbeWords
from charset_normalizer.legacy import detect
+from charset_normalizer.hook import charset_normalizer_hook
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/charset_normalizer/constant.py new/charset_normalizer-1.3.0/charset_normalizer/constant.py
--- old/charset_normalizer-1.1.1/charset_normalizer/constant.py	2019-09-23 14:45:48.000000000 +0200
+++ new/charset_normalizer-1.3.0/charset_normalizer/constant.py	2019-09-30 20:01:29.000000000 +0200
@@ -4,6 +4,9 @@
Scrapped from https://unicode-table.com/
"""
from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE,
BOM_UTF32_LE
+from _multibytecodec import MultibyteIncrementalDecoder
+
+MULTI_BYTE_DECODER = MultibyteIncrementalDecoder
UNICODE_RANGES = [
"0000−001F",
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/charset_normalizer/encoding.py new/charset_normalizer-1.3.0/charset_normalizer/encoding.py
--- old/charset_normalizer-1.1.1/charset_normalizer/encoding.py	1970-01-01 01:00:00.000000000 +0100
+++ new/charset_normalizer-1.3.0/charset_normalizer/encoding.py	2019-09-30 20:01:29.000000000 +0200
@@ -0,0 +1,15 @@
+from charset_normalizer.constant import MULTI_BYTE_DECODER
+import importlib
+
+
+def is_multi_byte_encoding(encoding_name):
+ """
+ Verify is a specific encoding is a multi byte one based on it IANA name
+ :param str encoding_name: IANA encoding name
+ :return: True if multi byte
+ :rtype: bool
+ """
+ return issubclass(
+
importlib.import_module('encodings.{encoding_name}'.format(encoding_name=encoding_name)).IncrementalDecoder,
+ MULTI_BYTE_DECODER
+ )
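The new helper simply asks whether the codec's IncrementalDecoder class derives from the C-level multi-byte decoder. A quick, hedged illustration (assuming the 1.3.0 package from this diff is installed):

    from charset_normalizer.encoding import is_multi_byte_encoding

    print(is_multi_byte_encoding('big5'))    # True  -- CJK codecs build on _multibytecodec
    print(is_multi_byte_encoding('cp1252'))  # False -- plain single-byte table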
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/charset_normalizer/hook.py new/charset_normalizer-1.3.0/charset_normalizer/hook.py
--- old/charset_normalizer-1.1.1/charset_normalizer/hook.py	1970-01-01 01:00:00.000000000 +0100
+++ new/charset_normalizer-1.3.0/charset_normalizer/hook.py	2019-09-30 20:01:29.000000000 +0200
@@ -0,0 +1,20 @@
+import sys
+from charset_normalizer.legacy import detect
+
+
+def charset_normalizer_hook(exctype, value, traceback):
+ if exctype == UnicodeDecodeError:
+ cp_detection = detect(value.object)
+ if cp_detection['encoding'] is not None:
+ value.reason = value.reason+'; you may want to consider {} codec
for this sequence.'.format(cp_detection['encoding'])
+
+ sys.__excepthook__(exctype, value, traceback)
+
+
+sys.excepthook = charset_normalizer_hook
+
+try:
+ import unicodedata2
+ sys.modules['unicodedata'] = unicodedata2
+except ImportError:
+ pass
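In other words, importing charset_normalizer now installs an excepthook that annotates uncaught UnicodeDecodeError tracebacks and, when available, swaps in the unicodedata2 backport. A hedged sketch of the effect (the suggested codec in the comment is illustrative):

    import charset_normalizer  # installs charset_normalizer_hook as sys.excepthook

    # If a decode failure like this one escapes to the top level...
    b'\xe9\xe0\xe7'.decode('utf-8')
    # ...the traceback's reason is extended with something like
    # '...; you may want to consider cp1252 codec for this sequence.'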
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/charset_normalizer/normalizer.py new/charset_normalizer-1.3.0/charset_normalizer/normalizer.py
--- old/charset_normalizer-1.1.1/charset_normalizer/normalizer.py	2019-09-23 14:45:48.000000000 +0200
+++ new/charset_normalizer-1.3.0/charset_normalizer/normalizer.py	2019-09-30 20:01:29.000000000 +0200
@@ -13,6 +13,11 @@
from charset_normalizer.probe_chaos import ProbeChaos
from charset_normalizer.probe_coherence import ProbeCoherence, HashableCounter
+from charset_normalizer.encoding import is_multi_byte_encoding
+
+from charset_normalizer.probe_inherent_sign import any_specified_encoding
+
+from loguru import logger
from hashlib import sha256
@@ -62,6 +67,15 @@
"""
return self._submatch
+ @property
+ def has_submatch(self):
+ """
+ Determine if current match has any other match linked to it.
+ :return: True if any sub match available
+ :rtype: bool
+ """
+ return len(self._submatch) > 0
+
@cached_property
def alphabets(self):
"""
@@ -85,6 +99,8 @@
:param CharsetNormalizerMatch other:
:return:
"""
+ if not isinstance(other, CharsetNormalizerMatch):
+ raise TypeError('__eq__ cannot be invoked on {} and
{}.'.format(str(other.__class__), str(self.__class__)))
return self.fingerprint == other.fingerprint and self.encoding ==
other.encoding
@cached_property
@@ -137,6 +153,25 @@
"""
return self._chaos_ratio
+ @property
+ def percent_chaos(self):
+ """
+ Convert chaos ratio to readable percentage with ndigits=3
+ from 0.000 % to 100.000 %
+ :return: float
+ """
+ return round(self._chaos_ratio * 100, ndigits=3)
+
+ @property
+ def percent_coherence(self):
+ """
+ Convert coherence ratio to readable percentage with ndigits=3
+ from 0.000 % to 100.000 %
+ :return: float
+ :rtype: float
+ """
+ return round((1 - self.coherence) * 100, ndigits=3)
+
@cached_property
def chaos_secondary_pass(self):
"""
@@ -286,7 +321,7 @@
return b_
@staticmethod
- def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20):
+ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20,
cp_isolation=None, cp_exclusion=None, preemptive_behaviour=True, explain=False):
"""
Take a sequence of bytes that could potentially be decoded to str and
discard all obvious non supported
charset encoding.
@@ -294,10 +329,16 @@
:param bytes sequences: Actual sequence of bytes to analyse
:param float threshold: Maximum amount of chaos allowed on first pass
:param int chunk_size: Size to extract and analyse in each step
- :param int steps: Number of steps
+ :param int steps: Number of steps/block to extract from sequence
+ :param bool preemptive_behaviour: Determine if we should look into
sequence (ASCII-Mode) for pre-defined encoding
+ :param bool explain: Print on screen what is happening when searching
for a match
+ :param list[str] cp_isolation: Finite list of encoding to use when
searching for a match
+ :param list[str] cp_exclusion: Finite list of encoding to avoid when
searching for a match
:return: List of potential matches
:rtype: CharsetNormalizerMatches
"""
+ if not explain:
+ logger.disable('charset_normalizer')
too_small_sequence = len(sequences) < 24
@@ -308,13 +349,32 @@
# Adjust steps and chunk_size when content is just too small for it
if maximum_length <= (chunk_size * steps):
+ logger.warning(
+ 'override steps and chunk_size as content does not fit
parameters.',
+ chunk_size=chunk_size, steps=steps, seq_len=maximum_length)
steps = 1
-
- if maximum_length <= chunk_size:
chunk_size = maximum_length
- elif steps > 1 and maximum_length / steps < chunk_size:
+
+ if steps > 1 and maximum_length / steps < chunk_size:
chunk_size = int(maximum_length / steps)
+ if cp_isolation is not None and isinstance(cp_isolation, list) is
False:
+ raise TypeError('cp_isolation must be None or list')
+
+ if cp_exclusion is not None and isinstance(cp_exclusion, list) is
False:
+ raise TypeError('cp_exclusion must be None or list')
+
+ if cp_isolation is not None:
+ logger.warning('cp_isolation is set. use this flag for debugging
purpose. '
+ 'limited list of encoding allowed :
{allowed_list}.',
+ allowed_list=', '.join(cp_isolation))
+
+ if cp_exclusion is not None:
+ logger.warning(
+ 'cp_exclusion is set. use this flag for debugging purpose. '
+ 'limited list of encoding excluded : {excluded_list}.',
+ excluded_list=', '.join(cp_exclusion))
+
# Bellow Python 3.6, Expect dict to not behave the same.
py_v = [int(el) for el in python_version_tuple()]
py_need_sort = py_v[0] < 3 or (py_v[0] == 3 and py_v[1] < 6)
@@ -324,10 +384,23 @@
tested = set()
matches = list()
+ specified_encoding = any_specified_encoding(sequences) if
preemptive_behaviour is True else None
+
+ if specified_encoding is not None:
+ warn(
+ 'Trying to detect encoding on a sequence that seems to declare
a encoding ({}).'.format(specified_encoding)
+ )
+
for support in supported:
k, p = support
+ if cp_isolation is not None and p not in cp_isolation:
+ continue
+
+ if cp_exclusion is not None and p in cp_exclusion:
+ continue
+
if p in tested:
continue
@@ -347,24 +420,36 @@
if any(bom_c_list) is True:
bom_available = True
bom_len =
len(BYTE_ORDER_MARK[p][bom_c_list.index(True)])
+ if bom_available is True:
+ logger.info('{encoding} has a SIG or BOM mark on first
{n_byte} byte(s). Adding chaos bonus.', encoding=p, n_byte=bom_len)
str(
sequences if bom_available is False else
sequences[bom_len:],
encoding=p
)
- except UnicodeDecodeError:
+ except UnicodeDecodeError as e:
+ logger.debug('{encoding} does not fit given bytes sequence at
ALL. {explanation}', encoding=p, explanation=str(e))
continue
except LookupError:
continue
+ is_multi_byte_enc = is_multi_byte_encoding(p)
+
+ if is_multi_byte_enc is True:
+ logger.info('{encoding} is a multi byte encoding table. '
+ 'Should not be a coincidence. Adding chaos bonus.',
+ encoding=p)
+ else:
+ logger.debug('{encoding} is a single byte encoding table.',
encoding=p)
+
r_ = range(
0 if bom_available is False else bom_len,
maximum_length,
int(maximum_length / steps)
)
- measures = [ProbeChaos(str(sequences[i:i + chunk_size],
encoding=p, errors='ignore'), giveup_threshold=threshold) for i in r_]
+ measures = [ProbeChaos(str(sequences[i:i + chunk_size],
encoding=p, errors='ignore'), giveup_threshold=threshold,
bonus_bom_sig=bom_available, bonus_multi_byte=is_multi_byte_enc) for i in r_]
ratios = [el.ratio for el in measures]
nb_gave_up = [el.gave_up is True or el.ratio >= threshold for el
in measures].count(True)
@@ -373,8 +458,13 @@
# chaos_min = min(ratios)
# chaos_max = max(ratios)
- if (len(r_) >= 4 and nb_gave_up > len(r_) / 4) or chaos_median >
threshold:
- # print(p, 'is too much chaos for decoded input !',
nb_gave_up, chaos_median)
+ if (len(r_) >= 4 and nb_gave_up > len(r_) / 4) or chaos_means >
threshold:
+ logger.warning('{encoding} was excluded because of initial
chaos probing. '
+ 'Gave up {nb_gave_up} time(s). '
+ 'Computed median chaos is {chaos_median}
%.',
+ encoding=p,
+ nb_gave_up=nb_gave_up,
+ chaos_median=round(chaos_means*100,
ndigits=3))
continue
encountered_unicode_range_occurrences = dict()
@@ -385,8 +475,6 @@
encountered_unicode_range_occurrences[u_name] = 0
encountered_unicode_range_occurrences[u_name] += u_occ
- # print(p, 'U RANGES', encountered_unicode_range_occurrences)
-
cnm = CharsetNormalizerMatch(
sequences if not bom_available else sequences[bom_len:],
p,
@@ -395,55 +483,84 @@
bom_available
)
+ logger.info(
+ '{encoding} passed initial chaos probing. '
+ 'Measured chaos is {chaos_means} % and coherence is
{coherence} %. '
+ 'It seems to be written in {language}.',
+ encoding=p,
+ chaos_means=round(chaos_means*100, ndigits=3),
+ coherence=cnm.percent_coherence,
+ language=cnm.languages
+ )
+
fingerprint_tests = [el.fingerprint == cnm.fingerprint for el in
matches]
if any(fingerprint_tests) is True:
matches[fingerprint_tests.index(True)].submatch.append(cnm)
+ logger.debug('{encoding} is marked as a submatch of
{primary_encoding}.', encoding=cnm.encoding,
primary_encoding=matches[fingerprint_tests.index(True)].encoding)
else:
matches.append(
- CharsetNormalizerMatch(
- sequences if not bom_available else
sequences[bom_len:],
- p,
- chaos_means,
- encountered_unicode_range_occurrences,
- bom_available
- )
+ cnm
)
- # print(p, nb_gave_up, chaos_means, chaos_median,
matches[-1].coherence, matches[-1].languages,)
+ if specified_encoding is not None and p == specified_encoding:
+ logger.info('{encoding} is most likely the one. '
+ 'Because it is specified in analysed byte sequence
and '
+ 'initial test passed successfully. '
+ 'Disable this behaviour by setting
preemptive_behaviour '
+ 'to False', encoding=specified_encoding)
+ return CharsetNormalizerMatches([cnm]) if
any(fingerprint_tests) is False else
CharsetNormalizerMatches([matches[fingerprint_tests.index(True)]])
if (p == 'ascii' and chaos_median == 0.) or bom_available is True:
+ logger.info('{encoding} is most likely the one.
{bom_available}',
+ encoding=p,
+ bom_available='BOM/SIG available' if
bom_available else '')
+
return CharsetNormalizerMatches([matches[-1]])
return CharsetNormalizerMatches(matches)
@staticmethod
- def from_fp(fp, steps=10, chunk_size=512, threshold=0.20):
+ def from_fp(fp, steps=10, chunk_size=512, threshold=0.20,
cp_isolation=None, cp_exclusion=None, preemptive_behaviour=True, explain=False):
"""
:param io.BinaryIO fp:
:param int steps:
:param int chunk_size:
:param float threshold:
- :return:
+ :param bool explain: Print on screen what is happening when searching
for a match
+ :param bool preemptive_behaviour: Determine if we should look into
sequence (ASCII-Mode) for pre-defined encoding
+ :param list[str] cp_isolation: Finite list of encoding to use when
searching for a match
+ :param list[str] cp_exclusion: Finite list of encoding to avoid when
searching for a match
+ :return: List of potential matches
+ :rtype: CharsetNormalizerMatches
"""
return CharsetNormalizerMatches.from_bytes(
bytearray(fp.read()),
steps,
chunk_size,
- threshold
+ threshold,
+ cp_isolation,
+ cp_exclusion,
+ preemptive_behaviour,
+ explain
)
@staticmethod
- def from_path(path, steps=10, chunk_size=512, threshold=0.20):
+ def from_path(path, steps=10, chunk_size=512, threshold=0.20,
cp_isolation=None, cp_exclusion=None, preemptive_behaviour=True, explain=False):
"""
:param str path:
:param int steps:
:param int chunk_size:
:param float threshold:
- :return:
+ :param bool preemptive_behaviour: Determine if we should look into
sequence (ASCII-Mode) for pre-defined encoding
+ :param bool explain: Print on screen what is happening when searching
for a match
+ :param list[str] cp_isolation: Finite list of encoding to use when
searching for a match
+ :param list[str] cp_exclusion: Finite list of encoding to avoid when
searching for a match
+ :return: List of potential matches
+ :rtype: CharsetNormalizerMatches
"""
with open(path, 'rb') as fp:
- return CharsetNormalizerMatches.from_fp(fp, steps, chunk_size,
threshold)
+ return CharsetNormalizerMatches.from_fp(fp, steps, chunk_size,
threshold, cp_isolation, cp_exclusion, preemptive_behaviour, explain)
@cached_property
def could_be_from_charset(self):
@@ -470,42 +587,39 @@
:rtype: CharsetNormalizerMatches | CharsetNormalizerMatch
"""
- lowest_ratio = None
- lowest_ratio_frequency = None
+ if len(self) == 0:
+ logger.error('Trying to call best() on empty
CharsetNormalizerMatches, that is sad.')
+ return CharsetNormalizerMatches(self._matches)
+ elif len(self) == 1:
+ logger.debug('best() is not required because there is only one
match in it.')
+ return CharsetNormalizerMatches(self._matches)
- match_per_ratio = dict()
- match_per_frequency_letter = dict()
+ logger.info('We need to choose between {nb_suitable_match} match.
Order By Chaos Then Coherence.', nb_suitable_match=len(self))
- for match in self._matches:
+ sorted_matches = sorted(self._matches, key=lambda x: x.chaos)
- if match.chaos not in match_per_ratio.keys():
- match_per_ratio[match.chaos] = list()
+ nb_lowest_ratio = [el.chaos <= sorted_matches[0].chaos * 1.2 for el in
sorted_matches[1:]].count(True)
- match_per_ratio[match.chaos].append(match)
+ logger.info('Lowest Chaos found is {lowest_chaos} %. Reduced list to
{nb_suitable_match} match.', lowest_chaos=sorted_matches[0].percent_chaos,
nb_suitable_match=nb_lowest_ratio+1)
- if lowest_ratio is None or lowest_ratio > match.chaos:
- lowest_ratio = match.chaos
+ if nb_lowest_ratio+1 > 1:
+ logger.info('Order By Chaos is not enough, {nb_suitable_match}
remaining. Next, ordering by Coherence.', nb_suitable_match=nb_lowest_ratio+1)
- if lowest_ratio is None:
- return CharsetNormalizerMatches([])
+ sorted_matches_second_pass =
sorted(sorted_matches[:nb_lowest_ratio+1], key=lambda x: x.coherence)
+ nb_lowest_ratio = [el.coherence ==
sorted_matches_second_pass[0].coherence for el in
sorted_matches_second_pass[1:]].count(True)
- all_latin_basic = True
+ logger.info('Highest Coherence found is {lowest_chaos} %. Reduced
list to {nb_suitable_match} match.',
lowest_chaos=sorted_matches_second_pass[0].percent_coherence,
nb_suitable_match=nb_lowest_ratio+1)
- for match in match_per_ratio[lowest_ratio]: # type:
CharsetNormalizerMatch
- secondary_ratio = match.coherence
-
- if lowest_ratio_frequency is None or lowest_ratio_frequency >
secondary_ratio:
- lowest_ratio_frequency = secondary_ratio
-
- if secondary_ratio not in match_per_frequency_letter.keys():
- match_per_frequency_letter[secondary_ratio] = list()
-
- match_per_frequency_letter[secondary_ratio].append(match)
+ return CharsetNormalizerMatches(
+ sorted_matches_second_pass[:nb_lowest_ratio+1]
+ )
- if len(match.alphabets) != 1 or match.alphabets[0] != 'Basic
Latin':
- all_latin_basic = False
+ return CharsetNormalizerMatches(
+ sorted_matches[:nb_lowest_ratio+1]
+ )
- if all_latin_basic is True:
- return
CharsetNormalizerMatches(match_per_frequency_letter[lowest_ratio_frequency]).first()
- return
CharsetNormalizerMatches(match_per_frequency_letter[lowest_ratio_frequency]) if
len(match_per_frequency_letter[lowest_ratio_frequency]) > 1 else
CharsetNormalizerMatches(match_per_frequency_letter[lowest_ratio_frequency]).first()
+# Some aliases to CharsetNormalizerMatches, because it is too long for a class
name.
+CharsetDetector = CharsetNormalizerMatches
+EncodingDetector = CharsetNormalizerMatches
+CharsetDoctor = CharsetNormalizerMatches
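The trailing aliases are plain name bindings, so the shorter chardet-style spelling used in the docs works unchanged. A small sketch (the sample path is the one used by the project's test data):

    from charset_normalizer import CharsetDetector, EncodingDetector, CharsetDoctor

    # All three names refer to the same CharsetNormalizerMatches class.
    assert CharsetDetector is EncodingDetector is CharsetDoctor

    print(CharsetDetector.from_path('./data/sample.1.ar.srt').best().first())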
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/charset_normalizer/probe_chaos.py new/charset_normalizer-1.3.0/charset_normalizer/probe_chaos.py
--- old/charset_normalizer-1.1.1/charset_normalizer/probe_chaos.py	2019-09-23 14:45:48.000000000 +0200
+++ new/charset_normalizer-1.3.0/charset_normalizer/probe_chaos.py	2019-09-30 20:01:29.000000000 +0200
@@ -14,10 +14,12 @@
@lru_cache(maxsize=8192)
class ProbeChaos:
- def __init__(self, string, giveup_threshold=0.09):
+ def __init__(self, string, giveup_threshold=0.09, bonus_bom_sig=False,
bonus_multi_byte=False):
"""
:param str string:
:param float giveup_threshold: When to give up even if _probe has not
finished yet
+ :param bool bonus_bom_sig: Decide if ratio should take in
consideration a bonus because of BOM/SIG
+ :param bool bonus_multi_byte: Decide if ratio should take in
consideration a bonus because of multi byte scheme decoder
"""
if not isinstance(string, str):
@@ -26,6 +28,9 @@
self._string = string
self._threshold = giveup_threshold
+ self._bonus_bom_sig = bonus_bom_sig
+ self._bonus_multi_byte = bonus_multi_byte
+
self.successive_upper_lower = 0
self.successive_accent = 0
self.successive_different_unicode_range = 0
@@ -46,14 +51,18 @@
self.total_letter_encountered = 0
self.total_lower_letter_encountered = 0
+ self.total_upper_letter_encountered = 0
+
self.total_upper_accent_encountered = 0
self.total_upper_accent_encountered_inner = 0
+
self.total_unaccented_letter_encountered = 0
self._probe_word = ProbeWords(HashableCounter(self._string.split()))
self.gave_up = False
+ # Artificially increase string size to get more significant result.
if 32 > len(self._string) > 0:
self._string *= int(32 / len(self._string)) + 1
@@ -165,6 +174,9 @@
if is_lower:
self.total_lower_letter_encountered += 1
+ if is_upper:
+ self.total_upper_letter_encountered += 1
+
if is_upper and is_accent:
self.total_upper_accent_encountered += 1
if self.previous_printable_letter.isalpha():
@@ -237,7 +249,14 @@
:return: Ratio as floating number
:rtype: float
"""
- r_ = self.total_upper_accent_encountered if
self.total_letter_encountered > 0 and self.total_unaccented_letter_encountered
/ self.total_letter_encountered < 0.5 else 0
+
+ r_ = self.total_upper_accent_encountered if
self.total_unaccented_letter_encountered / self.total_letter_encountered < 0.5
else 0
+ q_ = self.total_upper_letter_encountered / 3 if
self.total_upper_letter_encountered > self.total_lower_letter_encountered * 0.4
else 0
z_ = UnicodeRangeIdentify.unravel_suspicious_ranges(len(self._string),
self.encountered_unicode_range_occurrences)
p_ = self.encountered_punc_sign if self.encountered_punc_sign /
len(self._string) > 0.2 else 0
- return ((r_ + p_ + self.successive_upper_lower +
self.successive_accent + self.successive_different_unicode_range +
self.not_encountered_white_space + self.unprintable + z_ +
ProbeChaos._unravel_cjk_suspicious_chinese.__func__(self._string,
self.encountered_unicode_range_occurrences)) / len(self._string)) +
self._probe_word.ratio # + len(self.encountered_unicode_range)-1
+
+ bonus_sig_bom = -int(len(self._string)*0.5) if self._bonus_bom_sig is
True else 0
+
+ initial_ratio = ((r_ + p_ + q_ + self.successive_upper_lower +
self.successive_accent + self.successive_different_unicode_range +
self.not_encountered_white_space + self.unprintable + z_ + bonus_sig_bom +
ProbeChaos._unravel_cjk_suspicious_chinese.__func__(self._string,
self.encountered_unicode_range_occurrences)) / len(self._string)) +
self._probe_word.ratio # + len(self.encountered_unicode_range)-1
+
+ return initial_ratio / 1.3 if self._bonus_multi_byte is True and
initial_ratio > 0. else initial_ratio
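A back-of-the-envelope reading of the new bonuses (numbers are made up): on a 100-character chunk the BOM/SIG bonus subtracts int(100 * 0.5) = 50 from the penalty sum before it is divided by the chunk length, and a multi-byte codec additionally divides any positive ratio by 1.3:

    # Illustrative arithmetic only -- not code from the commit.
    penalties, length = 13, 100
    raw = penalties / length                          # 0.13 without any bonus
    print(round(raw / 1.3, 3))                        # 0.1   -- multi-byte decoder bonus
    print((penalties - int(length * 0.5)) / length)   # -0.37 -- BOM/SIG bonus can push it negative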
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/charset_normalizer/probe_coherence.py new/charset_normalizer-1.3.0/charset_normalizer/probe_coherence.py
--- old/charset_normalizer-1.1.1/charset_normalizer/probe_coherence.py	2019-09-23 14:45:48.000000000 +0200
+++ new/charset_normalizer-1.3.0/charset_normalizer/probe_coherence.py	2019-09-30 20:01:29.000000000 +0200
@@ -1,5 +1,6 @@
# coding: utf-8
import json
+import statistics
from collections import Counter
from functools import lru_cache
from os.path import dirname, realpath, exists
@@ -85,7 +86,7 @@
ratios = [self.rank_per_lang[lg] for lg in languages]
- return sum(ratios) / 2 if self.non_latin_covered_any is True else
sum(ratios)
+ return statistics.mean(ratios) / 2 if self.non_latin_covered_any is
True else statistics.mean(ratios)
@property
def coverage(self):
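Switching from sum() to statistics.mean() keeps the coherence ratio on the same scale no matter how many languages matched. A tiny illustration with hypothetical rank_per_lang values:

    import statistics

    ranks = [0.12, 0.18, 0.30]               # hypothetical per-language ranks
    print(round(sum(ranks), 2))              # 0.6 -- grows with the number of languages
    print(round(statistics.mean(ranks), 2))  # 0.2 -- stays comparable across candidates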
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/charset_normalizer/probe_inherent_sign.py new/charset_normalizer-1.3.0/charset_normalizer/probe_inherent_sign.py
--- old/charset_normalizer-1.1.1/charset_normalizer/probe_inherent_sign.py	1970-01-01 01:00:00.000000000 +0100
+++ new/charset_normalizer-1.3.0/charset_normalizer/probe_inherent_sign.py	2019-09-30 20:01:29.000000000 +0200
@@ -0,0 +1,39 @@
+from re import findall, compile, IGNORECASE
+from encodings.aliases import aliases
+
+RE_POSSIBLE_ENCODING_INDICATION = compile(
+ r'(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:=
]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)',
+ IGNORECASE
+)
+
+
+def any_specified_encoding(sequence):
+ """
+ Search in sequence (ASCII-mode) if there is any sign of declared encoding.
+ :param bytes sequence:
+ :return: Declared encoding if any else None
+ :rtype: str
+ """
+ if not isinstance(sequence, bytes) and not isinstance(sequence, bytearray):
+ raise TypeError
+
+ seq_len = len(sequence)
+
+ results = findall(
+ RE_POSSIBLE_ENCODING_INDICATION,
+ sequence[:seq_len if seq_len <= 2048 else
int(seq_len*0.3)].decode('ascii', errors='ignore')
+ ) # type: list[str]
+
+ if len(results) == 0:
+ return None
+
+ for specified_encoding in results:
+ specified_encoding = specified_encoding.lower().replace('-', '_')
+
+ for a, b in aliases.items():
+ if a == specified_encoding:
+ return b
+ if b == specified_encoding:
+ return b
+
+ return None
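A quick sketch of what the preemptive probe picks up (the XML snippet is made up for the example):

    from charset_normalizer.probe_inherent_sign import any_specified_encoding

    payload = b'<?xml version="1.0" encoding="ISO-8859-1"?><greeting>bonjour</greeting>'
    print(any_specified_encoding(payload))  # should print 'latin_1', resolved via encodings.aliases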
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/charset_normalizer/unicode.py new/charset_normalizer-1.3.0/charset_normalizer/unicode.py
--- old/charset_normalizer-1.1.1/charset_normalizer/unicode.py	2019-09-23 14:45:48.000000000 +0200
+++ new/charset_normalizer-1.3.0/charset_normalizer/unicode.py	2019-09-30 20:01:29.000000000 +0200
@@ -92,6 +92,8 @@
items = encountered_unicode_range_occurrences.items()
s_ = 0
+ # print(encountered_unicode_range_occurrences)
+
for k, v in items:
k_ = k.lower()
if (
@@ -101,7 +103,10 @@
continue
if 'halfwidth and fullwidth forms' in k_ and any(['CJK' in
el for el in encountered_unicode_range_occurrences.keys()]):
continue
- s_ += v if 'geometric shapes' not in k_ else v * 10
+ if 'hiragana' in k_ or 'katakana' in k_:
+ continue
+ # print('suspicious', k_, 'with', v)
+ s_ += v
return s_
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/docs/Makefile new/charset_normalizer-1.3.0/docs/Makefile
--- old/charset_normalizer-1.1.1/docs/Makefile	1970-01-01 01:00:00.000000000 +0100
+++ new/charset_normalizer-1.3.0/docs/Makefile	2019-09-30 20:01:29.000000000 +0200
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS =
+SPHINXBUILD = python -msphinx
+SPHINXPROJ = Charset Normalizer
+SOURCEDIR = .
+BUILDDIR = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/docs/advanced_search.rst new/charset_normalizer-1.3.0/docs/advanced_search.rst
--- old/charset_normalizer-1.1.1/docs/advanced_search.rst	1970-01-01 01:00:00.000000000 +0100
+++ new/charset_normalizer-1.3.0/docs/advanced_search.rst	2019-09-30 20:01:29.000000000 +0200
@@ -0,0 +1,20 @@
+Advanced Search
+===============
+
+Charset Normalizer method ``from_bytes``, ``from_fp`` and ``from_path``
provide some
+optional parameters that can be tweaked.
+
+As follow ::
+
+ CharsetDetector.from_bytes(
+ my_byte_str,
+ steps=10, # Number of steps/block to extract from my_byte_str
+ chunk_size=512, # Set block size of each extraction
+ threshold=0.2, # Maximum amount of chaos allowed on first pass
+ cp_isolation=None, # Finite list of encoding to use when searching
for a match
+ cp_exclusion=None, # Finite list of encoding to avoid when searching
for a match
+ preemptive_behaviour=True, # Determine if we should look into
my_byte_str (ASCII-Mode) for pre-defined encoding
+ explain=False # Print on screen what is happening when searching for
a match
+ )
+
+!! Warning !! Work in Progress Documentation !!
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/docs/conf.py new/charset_normalizer-1.3.0/docs/conf.py
--- old/charset_normalizer-1.1.1/docs/conf.py	1970-01-01 01:00:00.000000000 +0100
+++ new/charset_normalizer-1.3.0/docs/conf.py	2019-09-30 20:01:29.000000000 +0200
@@ -0,0 +1,163 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# charset-normalizer documentation build configuration file, created by
+# sphinx-quickstart on Fri Jun 16 04:30:35 2017.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+from recommonmark.parser import CommonMarkParser
+import sphinx_rtd_theme
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = []
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+# source_suffix = ['.rst', '.md']
+# source_suffix = '.rst'
+
+source_parsers = {
+ '.md': CommonMarkParser,
+}
+
+source_suffix = ['.rst', '.md']
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = 'charset_normalizer'
+copyright = '2019, Ahmed TAHRI'
+author = 'Ahmed TAHRI @Ousret'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '1.1'
+# The full version, including alpha/beta/rc tags.
+release = '1.1.1'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This patterns also effect to html_static_path and html_extra_path
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'sphinx_rtd_theme'
+
+html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+
+# -- Options for HTMLHelp output ------------------------------------------
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'charset-normalizer-doc'
+
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+ # The paper size ('letterpaper' or 'a4paper').
+ #
+ # 'papersize': 'letterpaper',
+
+ # The font size ('10pt', '11pt' or '12pt').
+ #
+ # 'pointsize': '10pt',
+
+ # Additional stuff for the LaTeX preamble.
+ #
+ # 'preamble': '',
+
+ # Latex figure (float) alignment
+ #
+ # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+# author, documentclass [howto, manual, or own class]).
+latex_documents = [
+ (master_doc, 'charset-normalizer.tex', 'Charset Normalizer Documentation',
+ 'Ahmed TAHRI', 'manual'),
+]
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+ (master_doc, 'charset-normalizer', 'Charset Normalizer Documentation',
+ [author], 1)
+]
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+# dir menu entry, description, category)
+texinfo_documents = [
+ (master_doc, 'Charset Normalizer', 'Charsert Normalizer Documentation',
+ author, 'charset-normalizer', '🎁 Maintained library on encoding &
language detection. 🚀No Cpp Bindings, Using Voodoo and Magical Artifacts. 🔎
Like Chardet',
+ 'Miscellaneous'),
+]
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/docs/getstarted.rst new/charset_normalizer-1.3.0/docs/getstarted.rst
--- old/charset_normalizer-1.1.1/docs/getstarted.rst	1970-01-01 01:00:00.000000000 +0100
+++ new/charset_normalizer-1.3.0/docs/getstarted.rst	2019-09-30 20:01:29.000000000 +0200
@@ -0,0 +1,70 @@
+Installation
+============
+
+This installs a package that can be used from Python (``import
charset_normalizer``).
+
+To install for all users on the system, administrator rights (root)
+may be required.
+
+From PyPI
+---------
+Charset Normalizer can be installed from PyPI::
+
+ pip install charset-normalizer
+
+You may enable extra feature Unicode Data v12 backport as follow::
+
+ pip install charset-normalizer[UnicodeDataBackport]
+
+From git via dev-master
+-----------------------
+You can install from dev-master branch using git::
+
+ git clone https://github.com/Ousret/charset_normalizer.git
+ cd charset_normalizer/
+ python setup.py install
+
+Basic Usage
+===========
+
+The new way
+-----------
+
+You may want to get right to it. ::
+
+ from charset_normalizer import CharsetDetector
+
+ # This is going to print out your sequence once encoding has been detected
+ print(
+ CharsetDetector.from_bytes(
+ my_byte_str
+ ).best().first()
+ )
+
+ # You could also want the same from a file
+ print(
+ CharsetDetector.from_path(
+ './data/sample.1.ar.srt'
+ ).best().first()
+ )
+
+
+Backward compatibility
+----------------------
+
+If you were used to python chardet, we are providing the very same
``detect()`` method as chardet.
+
+ ::
+
+ from charset_normalizer import detect
+
+ # This will behave exactly the same as python chardet
+ result = detect(my_byte_str)
+
+ if result['encoding'] is not None:
+ print('got', result['encoding'], 'as detected encoding')
+
+
+You may upgrade your code with ease.
+CTRL + R ``from chardet import detect`` to ``from charset_normalizer import
detect``.
+
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/docs/handling_result.rst new/charset_normalizer-1.3.0/docs/handling_result.rst
--- old/charset_normalizer-1.1.1/docs/handling_result.rst	1970-01-01 01:00:00.000000000 +0100
+++ new/charset_normalizer-1.3.0/docs/handling_result.rst	2019-09-30 20:01:29.000000000 +0200
@@ -0,0 +1,5 @@
+================
+ Handling Result
+================
+
+!! Warning !! Work in Progress Documentation !!
\ No newline at end of file
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/docs/index.rst new/charset_normalizer-1.3.0/docs/index.rst
--- old/charset_normalizer-1.1.1/docs/index.rst	1970-01-01 01:00:00.000000000 +0100
+++ new/charset_normalizer-1.3.0/docs/index.rst	2019-09-30 20:01:29.000000000 +0200
@@ -0,0 +1,48 @@
+===================
+ Charset Normalizer
+===================
+
+Overview
+========
+
+Library that help you read text from unknown charset encoding.
+Project motivated by chardet, I'm trying to resolve the issue by taking
another approach.
+All IANA character set names for which the Python core library provides codecs
are supported.
+
+.. image::
https://repository-images.githubusercontent.com/200259335/d3da9600-dedc-11e9-83e8-081f597505df
+ :width: 500px
+ :scale: 100 %
+ :alt: CLI Charset Normalizer
+ :align: right
+
+
+It is released under MIT license, see LICENSE for more
+details. Be aware that no warranty of any kind is provided with this package.
+
+Copyright (C) 2019 Ahmed TAHRI @Ousret <ahmed(dot)tahri(at)cloudnursery.dev>
+
+!! Warning !! Work in Progress Documentation !!
+
+Features
+========
+
+- Encoding detection on a buffer, bytes or file.
+- Transpose any encoded content to Unicode the best we can.
+- Detect spoken language in text.
+
+Contents:
+
+.. toctree::
+ :maxdepth: 2
+
+ support
+ getstarted
+ advanced_search
+ handling_result
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
\ No newline at end of file
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/docs/requirements.txt new/charset_normalizer-1.3.0/docs/requirements.txt
--- old/charset_normalizer-1.1.1/docs/requirements.txt	1970-01-01 01:00:00.000000000 +0100
+++ new/charset_normalizer-1.3.0/docs/requirements.txt	2019-09-30 20:01:29.000000000 +0200
@@ -0,0 +1 @@
+sphinx_rtd_theme
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/docs/support.rst new/charset_normalizer-1.3.0/docs/support.rst
--- old/charset_normalizer-1.1.1/docs/support.rst	1970-01-01 01:00:00.000000000 +0100
+++ new/charset_normalizer-1.3.0/docs/support.rst	2019-09-30 20:01:29.000000000 +0200
@@ -0,0 +1,167 @@
+=================
+ Support
+=================
+
+!! Warning !! Work in Progress Documentation !!
+
+-------
+Supported Encodings
+-------
+
+Charset Normalizer is able to detect any of those encoding.
+
++-----------------+----------------------------------------------------------------------------------------------------------------------------------+
+| IANA Code Page |
Aliases |
++=================+==================================================================================================================================+
+| ascii | 646, ansi_x3.4_1968, ansi_x3_4_1968, ansi_x3.4_1986,
cp367, csascii, ibm367, iso646_us, iso_646.irv_1991, iso_ir_6, us, us_ascii |
+| big5 | big5_tw,
csbig5, x_mac_trad_chinese |
+| big5hkscs |
big5_hkscs, hkscs |
+| cp037 | 037, csibm037, ebcdic_cp_ca,
ebcdic_cp_nl, ebcdic_cp_us, ebcdic_cp_wt, ibm037, ibm039 |
+| cp1026 | 1026,
csibm1026, ibm1026 |
+| cp1125 | 1125,
ibm1125, cp866u, ruscii |
+| cp1140 |
1140, ibm1140 |
+| cp1250 |
1250, windows_1250 |
+| cp1251 |
1251, windows_1251 |
+| cp1252 |
1252, windows_1252 |
+| cp1253 |
1253, windows_1253 |
+| cp1254 |
1254, windows_1254 |
+| cp1255 |
1255, windows_1255 |
+| cp1256 |
1256, windows_1256 |
+| cp1257 |
1257, windows_1257 |
+| cp1258 |
1258, windows_1258 |
+| cp273 | 273,
ibm273, csibm273 |
+| cp424 | 424,
csibm424, ebcdic_cp_he, ibm424 |
+| cp437 | 437,
cspc8codepage437, ibm437 |
+| cp500 | 500, csibm500,
ebcdic_cp_be, ebcdic_cp_ch, ibm500 |
+| cp775 | 775,
cspc775baltic, ibm775 |
+| cp850 | 850,
cspc850multilingual, ibm850 |
+| cp852 | 852,
cspcp852, ibm852 |
+| cp855 | 855,
csibm855, ibm855 |
+| cp857 | 857,
csibm857, ibm857 |
+| cp858 | 858,
csibm858, ibm858 |
+| cp860 | 860,
csibm860, ibm860 |
+| cp861 | 861,
cp_is, csibm861, ibm861 |
+| cp862 | 862,
cspc862latinhebrew, ibm862 |
+| cp863 | 863,
csibm863, ibm863 |
+| cp864 | 864,
csibm864, ibm864 |
+| cp865 | 865,
csibm865, ibm865 |
+| cp866 | 866,
csibm866, ibm866 |
+| cp869 | 869,
cp_gr, csibm869, ibm869 |
+| cp932 | 932,
ms932, mskanji, ms_kanji |
+| cp949 |
949, ms949, uhc |
+| cp950 |
950, ms950 |
+| euc_jis_2004 | jisx0213,
eucjis2004, euc_jis2004 |
+| euc_jisx0213 |
eucjisx0213 |
+| euc_jp |
eucjp, ujis, u_jis |
+| euc_kr | euckr, korean, ksc5601, ks_c_5601,
ks_c_5601_1987, ksx1001, ks_x_1001, x_mac_korean |
+| gb18030 |
gb18030_2000 |
+| gb2312 | chinese, csiso58gb231280, euc_cn, euccn,
eucgb2312_cn, gb2312_1980, gb2312_80, iso_ir_58, x_mac_simp_chinese |
+| gbk |
936, cp936, ms936 |
+| hp_roman8 |
roman8, r8, csHPRoman8 |
+| hz | hzgb,
hz_gb, hz_gb_2312 |
+| iso2022_jp | csiso2022jp,
iso2022jp, iso_2022_jp |
+| iso2022_jp_1 |
iso2022jp_1, iso_2022_jp_1 |
+| iso2022_jp_2 |
iso2022jp_2, iso_2022_jp_2 |
+| iso2022_jp_2004 |
iso_2022_jp_2004, iso2022jp_2004
|
+| iso2022_jp_3 |
iso2022jp_3, iso_2022_jp_3 |
+| iso2022_jp_ext |
iso2022jp_ext, iso_2022_jp_ext
|
+| iso2022_kr | csiso2022kr,
iso2022kr, iso_2022_kr |
+| iso8859_10 | csisolatin6, iso_8859_10,
iso_8859_10_1992, iso_ir_157, l6, latin6 |
+| iso8859_11 | thai,
iso_8859_11, iso_8859_11_2001 |
+| iso8859_13 |
iso_8859_13, l7, latin7 |
+| iso8859_14 | iso_8859_14,
iso_8859_14_1998, iso_celtic, iso_ir_199, l8, latin8
|
+| iso8859_15 |
iso_8859_15, l9, latin9 |
+| iso8859_16 | iso_8859_16,
iso_8859_16_2001, iso_ir_226, l10, latin10
|
+| iso8859_2 | csisolatin2, iso_8859_2,
iso_8859_2_1987, iso_ir_101, l2, latin2 |
+| iso8859_3 | csisolatin3, iso_8859_3,
iso_8859_3_1988, iso_ir_109, l3, latin3 |
+| iso8859_4 | csisolatin4, iso_8859_4,
iso_8859_4_1988, iso_ir_110, l4, latin4 |
+| iso8859_5 | csisolatincyrillic, cyrillic,
iso_8859_5, iso_8859_5_1988, iso_ir_144 |
+| iso8859_6 | arabic, asmo_708, csisolatinarabic,
ecma_114, iso_8859_6, iso_8859_6_1987, iso_ir_127 |
+| iso8859_7 | csisolatingreek, ecma_118, elot_928,
greek, greek8, iso_8859_7, iso_8859_7_1987, iso_ir_126 |
+| iso8859_8 | csisolatinhebrew, hebrew,
iso_8859_8, iso_8859_8_1988, iso_ir_138 |
+| iso8859_9 | csisolatin5, iso_8859_9,
iso_8859_9_1989, iso_ir_148, l5, latin5 |
+| johab |
cp1361, ms1361 |
+| koi8_r |
cskoi8r |
+| kz1048 | kz_1048,
rk1048, strk1048_2002 |
+| latin_1 | 8859, cp819, csisolatin1, ibm819, iso8859,
iso8859_1, iso_8859_1, iso_8859_1_1987, iso_ir_100, l1, latin, latin1 |
+| mac_cyrillic |
maccyrillic |
+| mac_greek |
macgreek |
+| mac_iceland |
maciceland |
+| mac_latin2 |
maccentraleurope, maclatin2 |
+| mac_roman |
macintosh, macroman |
+| mac_turkish |
macturkish |
+| mbcs |
ansi, dbcs |
+| ptcp154 | csptcp154,
pt154, cp154, cyrillic_asian |
+| quopri_codec | quopri,
quoted_printable, quotedprintable |
+| rot_13 |
rot13 |
+| shift_jis | csshiftjis,
shiftjis, sjis, s_jis, x_mac_japanese |
+| shift_jis_2004 |
shiftjis2004, sjis_2004, s_jis_2004
|
+| shift_jisx0213 |
shiftjisx0213, sjisx0213, s_jisx0213
|
+| tactis |
tis260 |
+| tis_620 | tis620, tis_620_0,
tis_620_2529_0, tis_620_2529_1, iso_ir_166 |
+| utf_16 |
u16, utf16 |
+| utf_16_be |
unicodebigunmarked, utf_16be |
+| utf_16_le |
unicodelittleunmarked, utf_16le
|
+| utf_32 |
u32, utf32 |
+| utf_32_be |
utf_32be |
+| utf_32_le |
utf_32le |
+| utf_7 | u7,
utf7, unicode_1_1_utf_7 |
+| utf_8 | u8, utf,
utf8, utf8_ucs2, utf8_ucs4 |
++-----------------+----------------------------------------------------------------------------------------------------------------------------------+
+
+-------
+Supported Languages
+-------
+
+Those language can be detected inside your content. All of these are specified
in ./charset_normalizer/assets/frequencies.json .
+
+['English',
+ 'German',
+ 'French',
+ 'Dutch',
+ 'Italian',
+ 'Polish',
+ 'Spanish',
+ 'Russian',
+ 'Japanese',
+ 'Portuguese',
+ 'Swedish',
+ 'Chinese',
+ 'Catalan',
+ 'Ukrainian',
+ 'Norwegian',
+ 'Finnish',
+ 'Vietnamese',
+ 'Czech',
+ 'Hungarian',
+ 'Korean',
+ 'Indonesian',
+ 'Turkish',
+ 'Romanian',
+ 'Farsi',
+ 'Arabic',
+ 'Danish',
+ 'Esperanto',
+ 'Serbian',
+ 'Lithuanian',
+ 'Slovene',
+ 'Slovak',
+ 'Malay',
+ 'Hebrew',
+ 'Bulgarian',
+ 'Kazakh',
+ 'Baque',
+ 'Volapük',
+ 'Croatian',
+ 'Hindi',
+ 'Estonian',
+ 'Azeri',
+ 'Galician',
+ 'Simple English',
+ 'Nynorsk',
+ 'Thai',
+ 'Greek',
+ 'Macedonian',
+ 'Serbocroatian',
+ 'Tamil',
+ 'Classical Chinese']
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/setup.py new/charset_normalizer-1.3.0/setup.py
--- old/charset_normalizer-1.1.1/setup.py	2019-09-23 14:45:48.000000000 +0200
+++ new/charset_normalizer-1.3.0/setup.py	2019-09-30 20:01:29.000000000 +0200
@@ -13,17 +13,19 @@
EMAIL = '[email protected]'
AUTHOR = 'Ahmed TAHRI @Ousret'
REQUIRES_PYTHON = '>=3.5.0'
-VERSION = '1.1.1'
+VERSION = '1.3.0'
REQUIRED = [
'cached_property',
'dragonmapper',
'zhon',
- 'prettytable'
+ 'prettytable',
+ 'loguru'
]
EXTRAS = {
- 'permit to generate frequencies.json': ['requests_html', 'requests'],
+ 'LetterFrequency': ['requests_html', 'requests'],
+ 'UnicodeDataBackport': ['unicodedata2']
}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/charset_normalizer-1.1.1/test/test_cli.py new/charset_normalizer-1.3.0/test/test_cli.py
--- old/charset_normalizer-1.1.1/test/test_cli.py	2019-09-23 14:45:48.000000000 +0200
+++ new/charset_normalizer-1.3.0/test/test_cli.py	2019-09-30 20:01:29.000000000 +0200
@@ -1,6 +1,8 @@
import unittest
from charset_normalizer.cli.normalizer import cli_detect, query_yes_no
from unittest.mock import patch
+from os.path import exists
+from os import remove
class TestCommandLineInterface(unittest.TestCase):
@@ -26,6 +28,23 @@
)
)
+ def test_single_file_normalize(self):
+ self.assertEqual(
+ 0,
+ cli_detect(
+ ['./data/sample.1.ar.srt', '--normalize']
+ )
+ )
+
+ self.assertTrue(
+ exists('./data/sample.1.ar.cp1256.srt')
+ )
+
+ try:
+ remove('./data/sample.1.ar.cp1256.srt')
+ except:
+ pass
+
def test_single_verbose_file(self):
self.assertEqual(
0,
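The new test above drives the CLI entry point directly. A hedged sketch of the same call outside the test suite (paths mirror the project's test data):

    from charset_normalizer.cli.normalizer import cli_detect

    # Mirrors the new test: detect './data/sample.1.ar.srt' and, because of --normalize,
    # write a re-encoded copy next to it (the test expects e.g. './data/sample.1.ar.cp1256.srt').
    exit_code = cli_detect(['./data/sample.1.ar.srt', '--normalize'])
    print(exit_code)  # 0 on success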