Hello community,

Here is the log from the commit of package python-ftfy for openSUSE:Factory, checked in at 2019-08-13 13:24:47.
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/python-ftfy (Old)
 and      /work/SRC/openSUSE:Factory/.python-ftfy.new.9556 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Package is "python-ftfy"

Tue Aug 13 13:24:47 2019 rev:4 rq:722792 version:5.6

Changes:
--------
--- /work/SRC/openSUSE:Factory/python-ftfy/python-ftfy.changes  2018-10-25 08:14:23.880120912 +0200
+++ /work/SRC/openSUSE:Factory/.python-ftfy.new.9556/python-ftfy.changes        2019-08-13 13:24:48.385362459 +0200
@@ -1,0 +2,13 @@
+Mon Aug 12 12:31:18 UTC 2019 - Marketa Calabkova <[email protected]>
+
+- Update to version 5.6
+  * The unescape_html function now supports all the HTML5 entities 
+    that appear in html.entities.html5, including those with long 
+    names such as &DiacriticalDoubleAcute;.
+  * Unescaping of numeric HTML entities now uses the standard library's 
+    html.unescape, making edge cases consistent.
+  * On top of Python's support for HTML5 entities, ftfy will also 
+    convert HTML escapes of common Latin capital letters that are 
+    (nonstandardly) written in all caps, such as &NTILDE; for Ñ.
+
+-------------------------------------------------------------------
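
For a quick look at what this update changes in practice, the following input/output pairs are taken from the new `unescape_html` docstring further down in this diff:

```python
>>> from ftfy.fixes import unescape_html

>>> unescape_html('&Jscr;ohn &HilbertSpace;ancock')  # long HTML5 entity names
'𝒥ohn ℋancock'

>>> unescape_html('P&EACUTE;REZ')  # nonstandard all-caps escapes of Latin letters
'PÉREZ'

>>> unescape_html('&ntilde; &Ntilde; &NTILDE; &nTILDE;')  # mixed-case names are left alone
'ñ Ñ Ñ &nTILDE;'
```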

Old:
----
  v5.5.1.tar.gz

New:
----
  v5.6.tar.gz

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ python-ftfy.spec ++++++
--- /var/tmp/diff_new_pack.fVskUk/_old  2019-08-13 13:24:49.489362169 +0200
+++ /var/tmp/diff_new_pack.fVskUk/_new  2019-08-13 13:24:49.489362169 +0200
@@ -1,7 +1,7 @@
 #
 # spec file for package python-ftfy
 #
-# Copyright (c) 2018 SUSE LINUX GmbH, Nuernberg, Germany.
+# Copyright (c) 2019 SUSE LINUX GmbH, Nuernberg, Germany.
 #
 # All modifications and additions to the file contributed by third parties
 # remain the property of their copyright owners, unless otherwise agreed
@@ -19,24 +19,23 @@
 %{?!python_module:%define python_module() python-%{**} python3-%{**}}
 %define         skip_python2 1
 Name:           python-ftfy
-Version:        5.5.1
+Version:        5.6
 Release:        0
 Summary:        Python module for repairing mis-decoded Unicode text
 License:        MIT
 Group:          Development/Languages/Python
-Url:            http://github.com/LuminosoInsight/python-ftfy
+URL:            https://github.com/LuminosoInsight/python-ftfy
 Source:         https://github.com/LuminosoInsight/python-ftfy/archive/v%{version}.tar.gz
 BuildRequires:  %{python_module setuptools}
 BuildRequires:  fdupes
 BuildRequires:  python-rpm-macros
+Requires:       python-wcwidth
+BuildArch:      noarch
 # SECTION test requirements
 BuildRequires:  %{python_module pytest-runner}
 BuildRequires:  %{python_module pytest}
 BuildRequires:  %{python_module wcwidth}
 # /SECTION
-Requires:       python-wcwidth
-BuildArch:      noarch
-
 %python_subpackages
 
 %description
@@ -54,12 +53,8 @@
 %python_expand %fdupes %{buildroot}%{$python_sitelib}
 
 %check
-%{python_expand export PYTHONDONTWRITEBYTECODE=1
-export LANG=en_US.UTF-8
-export PYTHONPATH=%{buildroot}%{$python_sitelib}
 export PATH="$PATH:%{buildroot}%{_bindir}"
-py.test-%{$python_bin_suffix}
-}
+%pytest
 
 %files %{python_files}
 %doc CHANGELOG.md README.md

++++++ v5.5.1.tar.gz -> v5.6.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/python-ftfy-5.5.1/CHANGELOG.md new/python-ftfy-5.6/CHANGELOG.md
--- old/python-ftfy-5.5.1/CHANGELOG.md  2018-09-24 17:39:09.000000000 +0200
+++ new/python-ftfy-5.6/CHANGELOG.md    2019-08-07 22:32:36.000000000 +0200
@@ -1,3 +1,22 @@
+## Version 5.6 (August 7, 2019)
+
+- The `unescape_html` function now supports all the HTML5 entities that appear
+  in `html.entities.html5`, including those with long names such as
+  `&DiacriticalDoubleAcute;`.
+
+- Unescaping of numeric HTML entities now uses the standard library's
+  `html.unescape`, making edge cases consistent.
+
+  (The reason we don't run `html.unescape` on all text is that it's not always
+  appropriate to apply, and can lead to false positive fixes. The text
+  "This&NotThat" should not have "&Not" replaced by a symbol, as
+  `html.unescape` would do.)
+
+- On top of Python's support for HTML5 entities, ftfy will also convert HTML
+  escapes of common Latin capital letters that are (nonstandardly) written
+  in all caps, such as `&NTILDE;` for `Ñ`.
+
+
 ## Version 5.5.1 (September 14, 2018)
 
 - Added Python 3.7 support.
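
A minimal illustration of the trade-off described above, using the legacy `&not` entity (the same case this release's test suite asserts): the standard library decodes it even without a trailing semicolon, while ftfy's `unescape_html` only consumes entities that end in one.

```python
>>> import html
>>> from ftfy.fixes import unescape_html

>>> html.unescape('this is just informal english &not html')  # false positive
'this is just informal english ¬ html'

>>> unescape_html('this is just informal english &not html')  # left alone: no semicolon
'this is just informal english &not html'
```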
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/python-ftfy-5.5.1/Jenkinsfile new/python-ftfy-5.6/Jenkinsfile
--- old/python-ftfy-5.5.1/Jenkinsfile   1970-01-01 01:00:00.000000000 +0100
+++ new/python-ftfy-5.6/Jenkinsfile     2019-08-07 22:32:36.000000000 +0200
@@ -0,0 +1,4 @@
+wheelJob(
+    test: 'pytest',
+    upstream: [ 'wheelhouse-init' ]
+)
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/python-ftfy-5.5.1/LICENSE.txt new/python-ftfy-5.6/LICENSE.txt
--- old/python-ftfy-5.5.1/LICENSE.txt   2018-09-24 17:39:09.000000000 +0200
+++ new/python-ftfy-5.6/LICENSE.txt     2019-08-07 22:32:36.000000000 +0200
@@ -1,4 +1,4 @@
-Copyright (C) 2013 Rob Speer ([email protected])
+Copyright (C) 2013-2018 Robyn Speer ([email protected])
 MIT License
 
 Permission is hereby granted, free of charge, to any person obtaining a copy of
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/python-ftfy-5.5.1/README.md new/python-ftfy-5.6/README.md
--- old/python-ftfy-5.5.1/README.md     2018-09-24 17:39:09.000000000 +0200
+++ new/python-ftfy-5.6/README.md       2019-08-07 22:32:36.000000000 +0200
@@ -1,36 +1,36 @@
 # ftfy: fixes text for you
 
 [![Travis](https://img.shields.io/travis/LuminosoInsight/python-ftfy/master.svg?label=Travis%20CI)](https://travis-ci.org/LuminosoInsight/python-ftfy)
-[![PyPI package](https://badge.fury.io/py/ftfy.svg)](http://badge.fury.io/py/ftfy)
-[![Docs](https://readthedocs.org/projects/ftfy/badge/?version=latest)](http://ftfy.readthedocs.org/en/latest/)
+[![PyPI package](https://badge.fury.io/py/ftfy.svg)](https://badge.fury.io/py/ftfy)
+[![Docs](https://readthedocs.org/projects/ftfy/badge/?version=latest)](https://ftfy.readthedocs.org/en/latest/)
 
 ```python
 >>> print(fix_encoding("(ง'⌣')ง"))
 (ง'⌣')ง
 ```
 
-Full documentation: **http://ftfy.readthedocs.org**
+Full documentation: **https://ftfy.readthedocs.org**
 
 ## Testimonials
 
 - “My life is livable again!”
-  — [@planarrowspace](http://twitter.com/planarrowspace)
+  — [@planarrowspace](https://twitter.com/planarrowspace)
 - “A handy piece of magic”
-  — [@simonw](http://twitter.com/simonw)
+  — [@simonw](https://twitter.com/simonw)
 - “Saved me a large amount of frustrating dev work”
-  — [@iancal](http://twitter.com/iancal)
+  — [@iancal](https://twitter.com/iancal)
 - “ftfy did the right thing right away, with no faffing about. Excellent work, solving a very tricky real-world (whole-world!) problem.”
   — Brennan Young
 - “Hat mir die Tage geholfen. Im Übrigen bin ich der Meinung, dass wir keine komplexen Maschinen mit Computern bauen sollten solange wir nicht einmal Umlaute sicher verarbeiten können. :D”
-  — [Bruno Ranieri](http://yrrsinn.de/2012/09/17/gelesen-kw37/)
+  — [Bruno Ranieri](https://yrrsinn.de/2012/09/17/gelesen-kw37/)
 - “I have no idea when I’m gonna need this, but I’m definitely bookmarking it.”
-  — [/u/ocrow](http://reddit.com/u/ocrow)
+  — [/u/ocrow](https://reddit.com/u/ocrow)
 - “9.2/10”
   — [pylint](https://bitbucket.org/logilab/pylint/)
 
 ## Developed at Luminoso
 
-[Luminoso](http://www.luminoso.com) makes groundbreaking software for text
+[Luminoso](https://www.luminoso.com) makes groundbreaking software for text
 analytics that really understands what words mean, in many languages. Our
 software is used by enterprise customers such as Sony, Intel, Mars, and Scotts,
 and it's built on Python and open-source technologies.
@@ -39,7 +39,7 @@
 is making sure it has the correct characters in it!
 
 Luminoso is growing fast and hiring. If you're interested in joining us, take a
-look at [our careers page](http://www.luminoso.com/career.html).
+look at [our careers page](https://luminoso.com/about/work-here).
 
 ## What it does
 
@@ -83,7 +83,7 @@
 taste of the things it can do. `fix_encoding` is the more specific function
 that only fixes mojibake.
 
-Please read [the documentation](http://ftfy.readthedocs.org) for more
+Please read [the documentation](https://ftfy.readthedocs.org) for more
 information on what ftfy does, and how to configure it for your needs.
 
 
@@ -136,11 +136,40 @@
 
 ## Who maintains ftfy?
 
-I'm Rob Speer ([email protected]).  I develop this tool as part of my
-text-understanding company, [Luminoso](http://luminoso.com), where it has
+I'm Robyn Speer ([email protected]). I develop this tool as part of my
+text-understanding company, [Luminoso](https://luminoso.com), where it has
 proven essential.
 
 Luminoso provides ftfy as free, open source software under the extremely
 permissive MIT license.
 
 You can report bugs regarding ftfy on GitHub and we'll handle them.
+
+
+## Citing ftfy
+
+ftfy has been used as a crucial data processing step in major NLP research.
+
+It's important to give credit appropriately to everyone whose work you build on
+in research. This includes software, not just high-status contributions such as
+mathematical models. All I ask when you use ftfy for research is that you cite
+it.
+
+ftfy has a citable record [on Zenodo](https://zenodo.org/record/2591652).
+A citation of ftfy may look like this:
+
+    Robyn Speer. (2019). ftfy (Version 5.5). Zenodo.
+    http://doi.org/10.5281/zenodo.2591652
+
+In BibTeX format, the citation is::
+
+    @misc{speer-2019-ftfy,
+      author       = {Robyn Speer},
+      title        = {ftfy},
+      note         = {Version 5.5},
+      year         = 2019,
+      howpublished = {Zenodo},
+      doi          = {10.5281/zenodo.2591652},
+      url          = {https://doi.org/10.5281/zenodo.2591652}
+    }
+
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/python-ftfy-5.5.1/docs/conf.py new/python-ftfy-5.6/docs/conf.py
--- old/python-ftfy-5.5.1/docs/conf.py  2018-09-24 17:39:09.000000000 +0200
+++ new/python-ftfy-5.6/docs/conf.py    2019-08-07 22:32:36.000000000 +0200
@@ -41,16 +41,16 @@
 
 # General information about the project.
 project = u'ftfy'
-copyright = u'2017, Rob Speer'
+copyright = u'2018, Robyn Speer'
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
 # built documents.
 #
 # The short X.Y version.
-version = '5.3'
+version = '5.6'
 # The full version, including alpha/beta/rc tags.
-release = '5.3.0'
+release = '5.6.0'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -185,10 +185,7 @@
 
 # Grouping the document tree into LaTeX files. List of tuples
 # (source start file, target name, title, author, documentclass [howto/manual]).
-latex_documents = [
-  ('index', 'ftfy.tex', u'ftfy Documentation',
-   u'Rob Speer', 'manual'),
-]
+latex_documents = []
 
 # The name of an image file (relative to this directory) to place at the top of
 # the title page.
@@ -216,8 +213,7 @@
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
 man_pages = [
-    ('index', 'ftfy', u'ftfy Documentation',
-     [u'Rob Speer'], 1)
+    ('index', 'ftfy', 'ftfy Documentation', ['Robyn Speer'], 1)
 ]
 
 # If true, show URL addresses after external links.
@@ -229,11 +225,7 @@
 # Grouping the document tree into Texinfo files. List of tuples
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
-texinfo_documents = [
-  ('index', 'ftfy', u'ftfy Documentation',
-   u'Rob Speer', 'ftfy', 'One line description of project.',
-   'Miscellaneous'),
-]
+texinfo_documents = []
 
 # Documents to append as an appendix to all manuals.
 #texinfo_appendices = []
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/python-ftfy-5.5.1/docs/index.rst new/python-ftfy-5.6/docs/index.rst
--- old/python-ftfy-5.5.1/docs/index.rst        2018-09-24 17:39:09.000000000 +0200
+++ new/python-ftfy-5.6/docs/index.rst  2019-08-07 22:32:36.000000000 +0200
@@ -147,6 +147,36 @@
 mojibake into the text.
 
 
+Citing ftfy
+-----------
+ftfy has been used as a crucial data processing step in major NLP research.
+
+It's important to give credit appropriately to everyone whose work you build on
+in research. This includes software, not just high-status contributions such as
+mathematical models. All I ask when you use ftfy for research is that you cite
+it. 
+
+ftfy has a citable record [on Zenodo](https://zenodo.org/record/2591652).
+A citation of ftfy may look like this:
+
+    Robyn Speer. (2019). ftfy (Version 5.5). Zenodo.
+    http://doi.org/10.5281/zenodo.2591652
+
+In BibTeX format, the citation is::
+
+    @misc{speer-2019-ftfy,
+      author       = {Robyn Speer},
+      title        = {ftfy},
+      note         = {Version 5.5},
+      year         = 2019,
+      howpublished = {Zenodo},
+      doi          = {10.5281/zenodo.2591652},
+      url          = {https://doi.org/10.5281/zenodo.2591652}
+    }
+
+
+
+
 Using ftfy
 ----------
 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/python-ftfy-5.5.1/ftfy/__init__.py new/python-ftfy-5.6/ftfy/__init__.py
--- old/python-ftfy-5.5.1/ftfy/__init__.py      2018-09-24 17:39:09.000000000 +0200
+++ new/python-ftfy-5.6/ftfy/__init__.py        2019-08-07 22:32:36.000000000 +0200
@@ -6,31 +6,34 @@
 """
 
 import unicodedata
+
 import ftfy.bad_codecs
 from ftfy import fixes
 from ftfy.formatting import display_ljust
 
-__version__ = '5.5.1'
+__version__ = '5.6'
 
 
 # See the docstring for ftfy.bad_codecs to see what we're doing here.
 ftfy.bad_codecs.ok()
 
 
-def fix_text(text,
-             *,
-             fix_entities='auto',
-             remove_terminal_escapes=True,
-             fix_encoding=True,
-             fix_latin_ligatures=True,
-             fix_character_width=True,
-             uncurl_quotes=True,
-             fix_line_breaks=True,
-             fix_surrogates=True,
-             remove_control_chars=True,
-             remove_bom=True,
-             normalization='NFC',
-             max_decode_length=10**6):
+def fix_text(
+    text,
+    *,
+    fix_entities='auto',
+    remove_terminal_escapes=True,
+    fix_encoding=True,
+    fix_latin_ligatures=True,
+    fix_character_width=True,
+    uncurl_quotes=True,
+    fix_line_breaks=True,
+    fix_surrogates=True,
+    remove_control_chars=True,
+    remove_bom=True,
+    normalization='NFC',
+    max_decode_length=10 ** 6
+):
     r"""
     Given Unicode text as input, fix inconsistencies and glitches in it,
     such as mojibake.
@@ -178,33 +181,36 @@
                 fix_surrogates=fix_surrogates,
                 remove_control_chars=remove_control_chars,
                 remove_bom=remove_bom,
-                normalization=normalization
+                normalization=normalization,
             )
         )
         pos = textbreak
 
     return ''.join(out)
 
+
 # Some alternate names for the main functions
 ftfy = fix_text
 fix_encoding = fixes.fix_encoding
 fix_text_encoding = fixes.fix_text_encoding  # deprecated
 
 
-def fix_file(input_file,
-             encoding=None,
-             *,
-             fix_entities='auto',
-             remove_terminal_escapes=True,
-             fix_encoding=True,
-             fix_latin_ligatures=True,
-             fix_character_width=True,
-             uncurl_quotes=True,
-             fix_line_breaks=True,
-             fix_surrogates=True,
-             remove_control_chars=True,
-             remove_bom=True,
-             normalization='NFC'):
+def fix_file(
+    input_file,
+    encoding=None,
+    *,
+    fix_entities='auto',
+    remove_terminal_escapes=True,
+    fix_encoding=True,
+    fix_latin_ligatures=True,
+    fix_character_width=True,
+    uncurl_quotes=True,
+    fix_line_breaks=True,
+    fix_surrogates=True,
+    remove_control_chars=True,
+    remove_bom=True,
+    normalization='NFC'
+):
     """
     Fix text that is found in a file.
 
@@ -236,23 +242,25 @@
             fix_surrogates=fix_surrogates,
             remove_control_chars=remove_control_chars,
             remove_bom=remove_bom,
-            normalization=normalization
+            normalization=normalization,
         )
 
 
-def fix_text_segment(text,
-                     *,
-                     fix_entities='auto',
-                     remove_terminal_escapes=True,
-                     fix_encoding=True,
-                     fix_latin_ligatures=True,
-                     fix_character_width=True,
-                     uncurl_quotes=True,
-                     fix_line_breaks=True,
-                     fix_surrogates=True,
-                     remove_control_chars=True,
-                     remove_bom=True,
-                     normalization='NFC'):
+def fix_text_segment(
+    text,
+    *,
+    fix_entities='auto',
+    remove_terminal_escapes=True,
+    fix_encoding=True,
+    fix_latin_ligatures=True,
+    fix_character_width=True,
+    uncurl_quotes=True,
+    fix_line_breaks=True,
+    fix_surrogates=True,
+    remove_control_chars=True,
+    remove_bom=True,
+    normalization='NFC'
+):
     """
     Apply fixes to text in a single chunk. This could be a line of text
     within a larger run of `fix_text`, or it could be a larger amount
@@ -402,9 +410,11 @@
             display = char
         else:
             display = char.encode('unicode-escape').decode('ascii')
-        print('U+{code:04X}  {display} [{category}] {name}'.format(
-            display=display_ljust(display, 7),
-            code=ord(char),
-            category=unicodedata.category(char),
-            name=unicodedata.name(char, '<unknown>')
-        ))
+        print(
+            'U+{code:04X}  {display} [{category}] {name}'.format(
+                display=display_ljust(display, 7),
+                code=ord(char),
+                category=unicodedata.category(char),
+                name=unicodedata.name(char, '<unknown>'),
+            )
+        )
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/python-ftfy-5.5.1/ftfy/bad_codecs/sloppy.py new/python-ftfy-5.6/ftfy/bad_codecs/sloppy.py
--- old/python-ftfy-5.5.1/ftfy/bad_codecs/sloppy.py     2018-09-24 17:39:09.000000000 +0200
+++ new/python-ftfy-5.6/ftfy/bad_codecs/sloppy.py       2019-08-07 22:32:36.000000000 +0200
@@ -149,6 +149,7 @@
         streamwriter=StreamWriter,
     )
 
+
 # Define a codec for each incomplete encoding. The resulting CODECS dictionary
 # can be used by the main module of ftfy.bad_codecs.
 CODECS = {}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/python-ftfy-5.5.1/ftfy/badness.py new/python-ftfy-5.6/ftfy/badness.py
--- old/python-ftfy-5.5.1/ftfy/badness.py       2018-09-24 17:39:09.000000000 +0200
+++ new/python-ftfy-5.6/ftfy/badness.py 2019-08-07 22:32:36.000000000 +0200
@@ -5,6 +5,7 @@
 
 import re
 import unicodedata
+
 from ftfy.chardata import chars_to_classes
 
 # The following regex uses the mapping of character classes to ASCII
@@ -85,12 +86,13 @@
     exclusive_categories = 'MmN13'
     for cat1 in exclusive_categories:
         others_range = ''.join(c for c in exclusive_categories if c != cat1)
-        groups.append('{cat1}[{others_range}]'.format(
-            cat1=cat1, others_range=others_range
-        ))
+        groups.append(
+            '{cat1}[{others_range}]'.format(cat1=cat1, others_range=others_range)
+        )
     regex = '|'.join(groups)
     return re.compile(regex)
 
+
 WEIRDNESS_RE = _make_weirdness_regex()
 
 # These characters appear in mojibake but also appear commonly on their own.
@@ -152,6 +154,7 @@
     'вЂ[љћ¦°№™ќ“”]'
 )
 
+
 def sequence_weirdness(text):
     """
     Determine how often a text has unexpected characters or sequences of
@@ -181,9 +184,8 @@
     """
     text2 = unicodedata.normalize('NFC', text)
     weirdness = len(WEIRDNESS_RE.findall(chars_to_classes(text2)))
-    adjustment = (
-        len(MOJIBAKE_SYMBOL_RE.findall(text2)) * 2 -
-        len(COMMON_SYMBOL_RE.findall(text2))
+    adjustment = len(MOJIBAKE_SYMBOL_RE.findall(text2)) * 2 - len(
+        COMMON_SYMBOL_RE.findall(text2)
     )
     return weirdness * 2 + adjustment
 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/python-ftfy-5.5.1/ftfy/build_data.py new/python-ftfy-5.6/ftfy/build_data.py
--- old/python-ftfy-5.5.1/ftfy/build_data.py    2018-09-24 17:39:09.000000000 +0200
+++ new/python-ftfy-5.6/ftfy/build_data.py      2019-08-07 22:32:36.000000000 +0200
@@ -11,8 +11,8 @@
 
 The file will be written to the current directory.
 """
-import unicodedata
 import sys
+import unicodedata
 import zlib
 
 # L = Latin capital letter
@@ -47,9 +47,7 @@
     raise an error unless you pass `do_it_anyway=True`.
     """
     if sys.hexversion < 0x030700f0 and not do_it_anyway:
-        raise RuntimeError(
-            "This function should be run in Python 3.7.0 or later."
-        )
+        raise RuntimeError("This function should be run in Python 3.7.0 or later.")
 
     cclasses = [None] * 0x110000
     for codepoint in range(0x0, 0x110000):
@@ -125,5 +123,6 @@
     out.write(zlib.compress(''.join(cclasses).encode('ascii')))
     out.close()
 
+
 if __name__ == '__main__':
     make_char_data_file()
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/python-ftfy-5.5.1/ftfy/chardata.py new/python-ftfy-5.6/ftfy/chardata.py
--- old/python-ftfy-5.5.1/ftfy/chardata.py      2018-09-24 17:39:09.000000000 +0200
+++ new/python-ftfy-5.6/ftfy/chardata.py        2019-08-07 22:32:36.000000000 +0200
@@ -3,10 +3,12 @@
 encodings that use them.
 """
 
+import html
+import itertools
 import re
-import zlib
 import unicodedata
-import itertools
+import zlib
+
 from pkg_resources import resource_string
 
 # These are the encodings we will try to fix in ftfy, in the
@@ -47,9 +49,36 @@
         regex = '^[\x00-\x19\x1b-\x7f{0}]*$'.format(charlist)
         encoding_regexes[encoding] = re.compile(regex)
     return encoding_regexes
+
+
 ENCODING_REGEXES = _build_regexes()
 
 
+def _build_html_entities():
+    entities = dict(html.entities.html5)
+    entities = {}
+    # Create a dictionary based on the built-in HTML5 entity dictionary.
+    # Add a limited set of HTML entities that we'll also decode if they've
+    # been case-folded to uppercase, such as decoding &NTILDE; as "Ñ".
+    for name, char in html.entities.html5.items():
+        if name.endswith(';'):
+            entities['&' + name] = char
+
+            # Restrict the set of characters we can attempt to decode if their
+            # name has been uppercased. If we tried to handle all entity names,
+            # the results would be ambiguous.
+            if name == name.lower():
+                name_upper = name.upper()
+                entity_upper = '&' + name_upper
+                if html.unescape(entity_upper) == entity_upper:
+                    entities[entity_upper] = char.upper()
+    return entities
+
+
+HTML_ENTITY_RE = re.compile(r"&#?[0-9A-Za-z]{1,24};")
+HTML_ENTITIES = _build_html_entities()
+
+
 def _build_utf8_punct_regex():
     """
     Recognize UTF-8 mojibake that's so blatant that we can fix it even when the
@@ -64,10 +93,14 @@
     # are a contiguous range, as well as the different Windows-1252 decodings
     # of 0x80 to 0x9f, which are not contiguous at all. (Latin-1 and
     # Windows-1252 agree on bytes 0xa0 and up.)
-    obvious_utf8 = ('â[€\x80][\x80-\xbf'
-                    + bytes(range(0x80, 0xa0)).decode('sloppy-windows-1252')
-                    + ']')
+    obvious_utf8 = (
+        'â[€\x80][\x80-\xbf'
+        + bytes(range(0x80, 0xa0)).decode('sloppy-windows-1252')
+        + ']'
+    )
     return re.compile(obvious_utf8)
+
+
 PARTIAL_UTF8_PUNCT_RE = _build_utf8_punct_regex()
 
 
@@ -92,12 +125,14 @@
 # We should consider checking for b'\x85' being converted to ... in the future.
 # I've seen it once, but the text still wasn't recoverable.
 
-ALTERED_UTF8_RE = re.compile(b'[\xc2\xc3\xc5\xce\xd0][ ]'
-                             b'|[\xe0-\xef][ ][\x80-\xbf]'
-                             b'|[\xe0-\xef][\x80-\xbf][ ]'
-                             b'|[\xf0-\xf4][ ][\x80-\xbf][\x80-\xbf]'
-                             b'|[\xf0-\xf4][\x80-\xbf][ ][\x80-\xbf]'
-                             b'|[\xf0-\xf4][\x80-\xbf][\x80-\xbf][ ]')
+ALTERED_UTF8_RE = re.compile(
+    b'[\xc2\xc3\xc5\xce\xd0][ ]'
+    b'|[\xe0-\xef][ ][\x80-\xbf]'
+    b'|[\xe0-\xef][\x80-\xbf][ ]'
+    b'|[\xf0-\xf4][ ][\x80-\xbf][\x80-\xbf]'
+    b'|[\xf0-\xf4][\x80-\xbf][ ][\x80-\xbf]'
+    b'|[\xf0-\xf4][\x80-\xbf][\x80-\xbf][ ]'
+)
 
 # This expression matches UTF-8 and CESU-8 sequences where some of the
 # continuation bytes have been lost. The byte 0x1a (sometimes written as ^Z) is
@@ -160,19 +195,21 @@
     control_chars = {}
 
     for i in itertools.chain(
-            range(0x00, 0x09),
-            [0x0b],
-            range(0x0e, 0x20),
-            [0x7f],
-            range(0x206a, 0x2070),
-            [0xfeff],
-            range(0xfff9, 0xfffd),
-            range(0x1d173, 0x1d17b),
-            range(0xe0000, 0xe0080)
+        range(0x00, 0x09),
+        [0x0b],
+        range(0x0e, 0x20),
+        [0x7f],
+        range(0x206a, 0x2070),
+        [0xfeff],
+        range(0xfff9, 0xfffd),
+        range(0x1d173, 0x1d17b),
+        range(0xe0000, 0xe0080),
     ):
         control_chars[i] = None
 
     return control_chars
+
+
 CONTROL_CHARS = _build_control_char_mapping()
 
 
@@ -188,10 +225,10 @@
 # is sometimes more normalization than you want.
 
 LIGATURES = {
-    ord('IJ'): 'IJ',   # Dutch ligatures
+    ord('IJ'): 'IJ',  # Dutch ligatures
     ord('ij'): 'ij',
-    ord('ʼn'): "ʼn",   # Afrikaans digraph meant to avoid auto-curled quote
-    ord('DZ'): 'DZ',   # Serbian/Croatian digraphs for Cyrillic conversion
+    ord('ʼn'): "ʼn",  # Afrikaans digraph meant to avoid auto-curled quote
+    ord('DZ'): 'DZ',  # Serbian/Croatian digraphs for Cyrillic conversion
     ord('Dz'): 'Dz',
     ord('dz'): 'dz',
     ord('DŽ'): 'DŽ',
@@ -203,7 +240,7 @@
     ord('NJ'): 'NJ',
     ord('Nj'): 'Nj',
     ord('nj'): "nj",
-    ord('ff'): 'ff',   # Latin typographical ligatures
+    ord('ff'): 'ff',  # Latin typographical ligatures
     ord('fi'): 'fi',
     ord('fl'): 'fl',
     ord('ffi'): 'ffi',
@@ -228,4 +265,6 @@
         if alternate != char:
             width_map[i] = alternate
     return width_map
+
+
 WIDTH_MAP = _build_width_map()
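
The `html.unescape(entity_upper) == entity_upper` guard in `_build_html_entities` above relies on a property of the standard library that is easy to verify: `html.unescape` leaves unknown entities untouched, so an uppercased name is only added to ftfy's table when Python assigns it no meaning of its own. A quick check, using only the standard library:

```python
>>> import html

>>> html.unescape('&NTILDE;')  # not a standard entity, so ftfy may safely claim it
'&NTILDE;'

>>> html.unescape('&AMP;')  # already standard in all caps, so ftfy skips it
'&'
```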
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/python-ftfy-5.5.1/ftfy/cli.py new/python-ftfy-5.6/ftfy/cli.py
--- old/python-ftfy-5.5.1/ftfy/cli.py   2018-09-24 17:39:09.000000000 +0200
+++ new/python-ftfy-5.6/ftfy/cli.py     2019-08-07 22:32:36.000000000 +0200
@@ -1,10 +1,10 @@
 """
 A command-line utility for fixing text found in a file.
 """
-import sys
 import os
-from ftfy import fix_file, __version__
+import sys
 
+from ftfy import __version__, fix_file
 
 ENCODE_ERROR_TEXT_UNIX = """ftfy error:
 Unfortunately, this output stream does not support Unicode.
@@ -38,6 +38,7 @@
 Can't read and write the same file. Please output to a new file instead.
 """
 
+
 def main():
     """
     Run ftfy as a command-line utility.
@@ -47,24 +48,49 @@
     parser = argparse.ArgumentParser(
         description="ftfy (fixes text for you), version %s" % __version__
     )
-    parser.add_argument('filename', default='-', nargs='?',
-                        help='The file whose Unicode is to be fixed. Defaults '
-                             'to -, meaning standard input.')
-    parser.add_argument('-o', '--output', type=str, default='-',
-                        help='The file to output to. Defaults to -, meaning '
-                             'standard output.')
-    parser.add_argument('-g', '--guess', action='store_true',
-                        help="Ask ftfy to guess the encoding of your input. "
-                             "This is risky. Overrides -e.")
-    parser.add_argument('-e', '--encoding', type=str, default='utf-8',
-                        help='The encoding of the input. Defaults to UTF-8.')
-    parser.add_argument('-n', '--normalization', type=str, default='NFC',
-                        help='The normalization of Unicode to apply. '
-                             'Defaults to NFC. Can be "none".')
-    parser.add_argument('--preserve-entities', action='store_true',
-                        help="Leave HTML entities as they are. The default "
-                             "is to decode them, as long as no HTML tags "
-                             "have appeared in the file.")
+    parser.add_argument(
+        'filename',
+        default='-',
+        nargs='?',
+        help='The file whose Unicode is to be fixed. Defaults '
+        'to -, meaning standard input.',
+    )
+    parser.add_argument(
+        '-o',
+        '--output',
+        type=str,
+        default='-',
+        help='The file to output to. Defaults to -, meaning ' 'standard output.',
+    )
+    parser.add_argument(
+        '-g',
+        '--guess',
+        action='store_true',
+        help="Ask ftfy to guess the encoding of your input. "
+        "This is risky. Overrides -e.",
+    )
+    parser.add_argument(
+        '-e',
+        '--encoding',
+        type=str,
+        default='utf-8',
+        help='The encoding of the input. Defaults to UTF-8.',
+    )
+    parser.add_argument(
+        '-n',
+        '--normalization',
+        type=str,
+        default='NFC',
+        help='The normalization of Unicode to apply. '
+        'Defaults to NFC. Can be "none".',
+    )
+    parser.add_argument(
+        '--preserve-entities',
+        action='store_true',
+        help="Leave HTML entities as they are. The default "
+        "is to decode them, as long as no HTML tags "
+        "have appeared in the file.",
+    )
 
     args = parser.parse_args()
 
@@ -97,9 +123,12 @@
         fix_entities = 'auto'
 
     try:
-        for line in fix_file(file, encoding=encoding,
-                             fix_entities=fix_entities,
-                             normalization=normalization):
+        for line in fix_file(
+            file,
+            encoding=encoding,
+            fix_entities=fix_entities,
+            normalization=normalization,
+        ):
             try:
                 outfile.write(line)
             except UnicodeEncodeError:
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/python-ftfy-5.5.1/ftfy/fixes.py new/python-ftfy-5.6/ftfy/fixes.py
--- old/python-ftfy-5.5.1/ftfy/fixes.py 2018-09-24 17:39:09.000000000 +0200
+++ new/python-ftfy-5.6/ftfy/fixes.py   2019-08-07 22:32:36.000000000 +0200
@@ -3,17 +3,27 @@
 can perform.
 """
 
-import re
 import codecs
+import html
+import re
 import warnings
-from ftfy.chardata import (possible_encoding, CHARMAP_ENCODINGS,
-                           CONTROL_CHARS, LIGATURES, WIDTH_MAP,
-                           PARTIAL_UTF8_PUNCT_RE, ALTERED_UTF8_RE,
-                           LOSSY_UTF8_RE, SINGLE_QUOTE_RE, DOUBLE_QUOTE_RE,
-                           C1_CONTROL_RE)
-from ftfy.badness import text_cost
-from html import entities
 
+from ftfy.badness import text_cost
+from ftfy.chardata import (
+    ALTERED_UTF8_RE,
+    C1_CONTROL_RE,
+    CHARMAP_ENCODINGS,
+    CONTROL_CHARS,
+    DOUBLE_QUOTE_RE,
+    HTML_ENTITIES,
+    HTML_ENTITY_RE,
+    LIGATURES,
+    LOSSY_UTF8_RE,
+    PARTIAL_UTF8_PUNCT_RE,
+    SINGLE_QUOTE_RE,
+    WIDTH_MAP,
+    possible_encoding,
+)
 
 BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode.
 
@@ -52,54 +62,54 @@
     `fix_encoding` decodes text that looks like it was decoded incorrectly. It
     leaves alone text that doesn't.
 
-        >>> print(fix_encoding('único'))
-        único
+        >>> fix_encoding('único')
+        'único'
 
-        >>> print(fix_encoding('This text is fine already :þ'))
-        This text is fine already :þ
+        >>> fix_encoding('This text is fine already :þ')
+        'This text is fine already :þ'
 
     Because these characters often come from Microsoft products, we allow
     for the possibility that we get not just Unicode characters 128-255, but
     also Windows's conflicting idea of what characters 128-160 are.
 
-        >>> print(fix_encoding('This — should be an em dash'))
-        This — should be an em dash
+        >>> fix_encoding('This — should be an em dash')
+        'This — should be an em dash'
 
     We might have to deal with both Windows characters and raw control
     characters at the same time, especially when dealing with characters like
     0x81 that have no mapping in Windows. This is a string that Python's
     standard `.encode` and `.decode` methods cannot correct.
 
-        >>> print(fix_encoding('This text is sad .â\x81”.'))
-        This text is sad .⁔.
+        >>> fix_encoding('This text is sad .â\x81”.')
+        'This text is sad .⁔.'
 
     However, it has safeguards against fixing sequences of letters and
     punctuation that can occur in valid text. In the following example,
     the last three characters are not replaced with a Korean character,
     even though they could be.
 
-        >>> print(fix_encoding('not such a fan of Charlotte Brontë…”'))
-        not such a fan of Charlotte Brontë…”
+        >>> fix_encoding('not such a fan of Charlotte Brontë…”')
+        'not such a fan of Charlotte Brontë…”'
 
     This function can now recover some complex manglings of text, such as when
     UTF-8 mojibake has been normalized in a way that replaces U+A0 with a
     space:
 
-        >>> print(fix_encoding('The more you know 🌠'))
-        The more you know 🌠
+        >>> fix_encoding('The more you know 🌠')
+        'The more you know 🌠'
 
     Cases of genuine ambiguity can sometimes be addressed by finding other
     characters that are not double-encoded, and expecting the encoding to
     be consistent:
 
-        >>> print(fix_encoding('AHÅ™, the new sofa from IKEA®'))
-        AHÅ™, the new sofa from IKEA®
+        >>> fix_encoding('AHÅ™, the new sofa from IKEA®')
+        'AHÅ™, the new sofa from IKEA®'
 
     Finally, we handle the case where the text is in a single-byte encoding
     that was intended as Windows-1252 all along but read as Latin-1:
 
-        >>> print(fix_encoding('This text was never UTF-8 at all\x85'))
-        This text was never UTF-8 at all…
+        >>> fix_encoding('This text was never UTF-8 at all\x85')
+        'This text was never UTF-8 at all…'
 
     The best version of the text is found using
     :func:`ftfy.badness.text_cost`.
@@ -112,8 +122,7 @@
     """
     A deprecated name for :func:`ftfy.fixes.fix_encoding`.
     """
-    warnings.warn('fix_text_encoding is now known as fix_encoding',
-                  DeprecationWarning)
+    warnings.warn('fix_text_encoding is now known as fix_encoding', DeprecationWarning)
     return fix_encoding(text)
 
 
@@ -240,8 +249,7 @@
                 fixed = encoded.decode('windows-1252')
                 steps = []
                 if fixed != text:
-                    steps = [('encode', 'latin-1', 0),
-                             ('decode', 'windows-1252', 1)]
+                    steps = [('encode', 'latin-1', 0), ('decode', 'windows-1252', 1)]
                 return fixed, steps
             except UnicodeDecodeError:
                 # This text contained characters that don't even make sense
@@ -290,49 +298,71 @@
     return obj
 
 
-HTML_ENTITY_RE = re.compile(r"&#?\w{0,8};")
-
-
 def _unescape_fixup(match):
     """
     Replace one matched HTML entity with the character it represents,
     if possible.
     """
     text = match.group(0)
-    if text[:2] == "&#":
-        # character reference
-        try:
-            if text[:3] == "&#x":
-                codept = int(text[3:-1], 16)
-            else:
-                codept = int(text[2:-1])
-            if 0x80 <= codept < 0xa0:
-                # Decode this range of characters as Windows-1252, as Web
-                # browsers do in practice.
-                return bytes([codept]).decode('sloppy-windows-1252')
-            else:
-                return chr(codept)
-        except ValueError:
+    if text in HTML_ENTITIES:
+        return HTML_ENTITIES[text]
+    elif text.startswith('&#'):
+        unescaped = html.unescape(text)
+
+        # If html.unescape only decoded part of the string, that's not what
+        # we want. The semicolon should be consumed.
+        if ';' in unescaped:
             return text
+        else:
+            return unescaped
     else:
-        # This is a named entity; if it's a known HTML5 entity, replace
-        # it with the appropriate character.
-        try:
-            return entities.html5[text[1:]]
-        except KeyError:
-            return text
+        return text
 
 
 def unescape_html(text):
     """
-    Decode all three types of HTML entities/character references.
+    Decode HTML entities and character references, including some nonstandard
+    ones written in all-caps.
+
+    Python has a built-in called `html.unescape` that can decode HTML escapes,
+    including a bunch of messy edge cases such as decoding escapes without
+    semicolons such as "&amp".
+
+    If you know you've got HTML-escaped text, applying `html.unescape` is the
+    right way to convert it to plain text. But in ambiguous situations, that
+    would create false positives. For example, the informally written text
+    "this&not that" should not automatically be decoded as "this¬ that".
 
-    Code by Fredrik Lundh of effbot.org. Rob Speer made a slight change
-    to it for efficiency: it won't match entities longer than 8 characters,
-    because there are no valid entities like that.
+    In this function, we decode the escape sequences that appear in the
+    `html.entities.html5` dictionary, as long as they are the unambiguous ones
+    that end in semicolons.
 
-        >>> print(unescape_html('&lt;tag&gt;'))
-        <tag>
+    We also decode all-caps versions of Latin letters and common symbols.
+    If a database contains the name 'P&EACUTE;REZ', we can read that and intuit
+    that it was supposed to say 'PÉREZ'. This is limited to a smaller set of
+    entities, because there are many instances where entity names are
+    case-sensitive in complicated ways.
+
+        >>> unescape_html('&lt;tag&gt;')
+        '<tag>'
+
+        >>> unescape_html('&Jscr;ohn &HilbertSpace;ancock')
+        '𝒥ohn ℋancock'
+
+        >>> unescape_html('&checkmark;')
+        '✓'
+
+        >>> unescape_html('P&eacute;rez')
+        'Pérez'
+
+        >>> unescape_html('P&EACUTE;REZ')
+        'PÉREZ'
+
+        >>> unescape_html('BUNDESSTRA&SZLIG;E')
+        'BUNDESSTRASSE'
+
+        >>> unescape_html('&ntilde; &Ntilde; &NTILDE; &nTILDE;')
+        'ñ Ñ Ñ &nTILDE;'
     """
     return HTML_ENTITY_RE.sub(_unescape_fixup, text)
 
@@ -445,9 +475,13 @@
         >>> eprint(fix_line_breaks("What is this \x85 I don't even"))
         What is this \n I don't even
     """
-    return text.replace('\r\n', '\n').replace('\r', '\n')\
-               .replace('\u2028', '\n').replace('\u2029', '\n')\
-               .replace('\u0085', '\n')
+    return (
+        text.replace('\r\n', '\n')
+        .replace('\r', '\n')
+        .replace('\u2028', '\n')
+        .replace('\u2029', '\n')
+        .replace('\u0085', '\n')
+    )
 
 
 SURROGATE_RE = re.compile('[\ud800-\udfff]')
@@ -532,14 +566,17 @@
 
 
 # Define a regex to match valid escape sequences in Python string literals.
-ESCAPE_SEQUENCE_RE = re.compile(r'''
+ESCAPE_SEQUENCE_RE = re.compile(
+    r'''
     ( \\U........      # 8-digit hex escapes
     | \\u....          # 4-digit hex escapes
     | \\x..            # 2-digit hex escapes
     | \\[0-7]{1,3}     # Octal escapes
     | \\N\{[^}]+\}     # Unicode characters by name
     | \\[\\'"abfnrtv]  # Single-character escapes
-    )''', re.UNICODE | re.VERBOSE)
+    )''',
+    re.UNICODE | re.VERBOSE,
+)
 
 
 def decode_escapes(text):
@@ -572,6 +609,7 @@
     because escaped text is not necessarily a mistake, and there is no way
     to distinguish text that's supposed to be escaped from text that isn't.
     """
+
     def decode_match(match):
         "Given a regex match, decode the escape sequence it contains."
         return codecs.decode(match.group(0), 'unicode-escape')
@@ -593,6 +631,7 @@
 
     This is used as a step within `fix_encoding`.
     """
+
     def replacement(match):
         "The function to apply when this regex matches."
         return match.group(0).replace(b'\x20', b'\xa0')
@@ -652,6 +691,7 @@
 
     This is used as a transcoder within `fix_encoding`.
     """
+
     def latin1_to_w1252(match):
         "The function to apply when this regex matches."
         return match.group(0).encode('latin-1').decode('sloppy-windows-1252')
@@ -659,7 +699,7 @@
     def w1252_to_utf8(match):
         "The function to apply when this regex matches."
         return match.group(0).encode('sloppy-windows-1252').decode('utf-8')
-    
+
     text = C1_CONTROL_RE.sub(latin1_to_w1252, text)
     return PARTIAL_UTF8_PUNCT_RE.sub(w1252_to_utf8, text)
 
@@ -667,5 +707,5 @@
 TRANSCODERS = {
     'restore_byte_a0': restore_byte_a0,
     'replace_lossy_sequences': replace_lossy_sequences,
-    'fix_partial_utf8_punct_in_1252': fix_partial_utf8_punct_in_1252
+    'fix_partial_utf8_punct_in_1252': fix_partial_utf8_punct_in_1252,
 }
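
To see why `_unescape_fixup` above checks for a leftover semicolon: `html.unescape` may decode only a prefix of a malformed numeric reference, and in that case ftfy backs off rather than emit half-decoded text. The behavior below matches the assertions in this release's test suite:

```python
>>> import html
>>> from ftfy.fixes import unescape_html

>>> html.unescape('&#x80;')  # C1 code points decode as Windows-1252, per the HTML5 spec
'€'

>>> html.unescape('&#20x6;')  # only '&#20' decodes, leaving 'x6;' (and its ';') behind
'\x14x6;'

>>> unescape_html('not an entity &#20x6;')  # the leftover ';' trips the guard
'not an entity &#20x6;'
```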
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/python-ftfy-5.5.1/ftfy/formatting.py new/python-ftfy-5.6/ftfy/formatting.py
--- old/python-ftfy-5.5.1/ftfy/formatting.py    2018-09-24 17:39:09.000000000 +0200
+++ new/python-ftfy-5.6/ftfy/formatting.py      2019-08-07 22:32:36.000000000 +0200
@@ -6,7 +6,8 @@
 the 'wcwidth' library.
 """
 from unicodedata import normalize
-from wcwidth import wcwidth, wcswidth
+
+from wcwidth import wcswidth, wcwidth
 
 
 def character_width(char):
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/python-ftfy-5.5.1/setup.py new/python-ftfy-5.6/setup.py
--- old/python-ftfy-5.5.1/setup.py      2018-09-24 17:39:09.000000000 +0200
+++ new/python-ftfy-5.6/setup.py        2019-08-07 22:32:36.000000000 +0200
@@ -27,7 +27,7 @@
 
 setup(
     name="ftfy",
-    version='5.5.1',
+    version='5.6',
     maintainer='Luminoso Technologies, Inc.',
     maintainer_email='[email protected]',
     license="MIT",
@@ -40,14 +40,14 @@
     package_data={'ftfy': ['char_classes.dat']},
     install_requires=['wcwidth'],
     tests_require=['pytest'],
-    python_requires='>=3.3',
+    python_requires='>=3.4',
     classifiers=[
         "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.3",
         "Programming Language :: Python :: 3.4",
         "Programming Language :: Python :: 3.5",
         "Programming Language :: Python :: 3.6",
         "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
         "License :: OSI Approved :: MIT License",
         "Operating System :: OS Independent",
         "Topic :: Software Development :: Libraries :: Python Modules",
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/python-ftfy-5.5.1/tests/test_entities.py new/python-ftfy-5.6/tests/test_entities.py
--- old/python-ftfy-5.5.1/tests/test_entities.py        2018-09-24 17:39:09.000000000 +0200
+++ new/python-ftfy-5.6/tests/test_entities.py  2019-08-07 22:32:36.000000000 +0200
@@ -21,5 +21,15 @@
     assert fix_text_segment('ellipsis&#133;', normalization='NFKC') == 'ellipsis...'
     assert fix_text_segment('ellipsis&#x85;', normalization='NFKC') == 'ellipsis...'
     assert fix_text_segment('broken&#x81;') == 'broken\x81'
+    assert fix_text_segment('&amp;amp;amp;') == '&'
     assert unescape_html('euro &#x80;') == 'euro €'
+    assert unescape_html('EURO &EURO;') == 'EURO €'
     assert unescape_html('not an entity &#20x6;') == 'not an entity &#20x6;'
+    assert unescape_html('JEDNOCZE&SACUTE;NIE') == 'JEDNOCZEŚNIE'
+    assert unescape_html('V&SCARON;ICHNI') == 'VŠICHNI'
+    assert unescape_html('&#xffff;') == ''
+    assert unescape_html('&#xffffffff;') == '\ufffd'
+    assert (
+        fix_text_segment('this is just informal english &not html') ==
+        'this is just informal english &not html'
+    )
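
The `&#xffff;` and `&#xffffffff;` assertions above follow directly from `html.unescape`'s handling of invalid code points, which ftfy now inherits:

```python
>>> import html

>>> html.unescape('&#xffff;')  # Python drops noncharacters such as U+FFFF
''

>>> html.unescape('&#xffffffff;')  # values beyond U+10FFFF become U+FFFD
'\ufffd'
```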

