Hello community,
here is the log from the commit of package python-html2text for
openSUSE:Leap:15.2 checked in at 2020-04-14 14:22:32
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Leap:15.2/python-html2text (Old)
and /work/SRC/openSUSE:Leap:15.2/.python-html2text.new.3248 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-html2text"
Tue Apr 14 14:22:32 2020 rev:17 rq:793670 version:2020.1.16
Changes:
--------
--- /work/SRC/openSUSE:Leap:15.2/python-html2text/python-html2text.changes
2020-04-09 12:26:38.345292918 +0200
+++
/work/SRC/openSUSE:Leap:15.2/.python-html2text.new.3248/python-html2text.changes
2020-04-14 14:24:16.617422083 +0200
@@ -1,0 +2,21 @@
+Thu Apr 9 11:17:36 UTC 2020 - Marketa Calabkova <[email protected]>
+
+- Update to 2020.1.16
+ * Add type annotations.
+ * Add support for Python 3.8.
+ * Performance improvements when ``wrap_links`` is ``False`` (the default).
+ * Configure setuptools using setup.cfg.
+
+-------------------------------------------------------------------
+Fri Dec 13 13:43:47 UTC 2019 - Matthias Fehring <[email protected]>
+
+- Update to 2019.9.26:
+ * Fix long blockquotes wrapping.
+ * Remove the trailing whitespaces that were added after wrapping list items & blockquotes.
+ * Remove support for Python <= 3.4. Now requires Python 3.5+.
+ * Fix memory leak when processing a document containing a <abbr> tag.
+ * Fix AttributeError when reading text from stdin.
+ * Fix UnicodeEncodeError when writing output to stdout.
+- Disable build for Python 2
+
+-------------------------------------------------------------------
Old:
----
html2text-2019.8.11.tar.gz
New:
----
html2text-2020.1.16.tar.gz
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Other differences:
------------------
++++++ python-html2text.spec ++++++
--- /var/tmp/diff_new_pack.GNvOA4/_old 2020-04-14 14:24:16.961422340 +0200
+++ /var/tmp/diff_new_pack.GNvOA4/_new 2020-04-14 14:24:16.961422340 +0200
@@ -1,7 +1,7 @@
#
# spec file for package python-html2text
#
-# Copyright (c) 2019 SUSE LINUX GmbH, Nuernberg, Germany.
+# Copyright (c) 2020 SUSE LLC
#
# All modifications and additions to the file contributed by third parties
# remain the property of their copyright owners, unless otherwise agreed
@@ -17,9 +17,10 @@
%define upname html2text
+%define skip_python2 1
%{?!python_module:%define python_module() python-%{**} python3-%{**}}
Name: python-%{upname}
-Version: 2019.8.11
+Version: 2020.1.16
Release: 0
Summary: Python script for turning HTML into Markdown text
License: GPL-3.0-only
@@ -63,6 +64,8 @@
%python_uninstall_alternative html2text
%check
+# otherwise python 3.6 does not automatically select UTF-8 for console output
+export LANG=en_US.UTF-8
%pytest
%files %{python_files}
++++++ html2text-2019.8.11.tar.gz -> html2text-2020.1.16.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html2text-2019.8.11/ChangeLog.rst
new/html2text-2020.1.16/ChangeLog.rst
--- old/html2text-2019.8.11/ChangeLog.rst 2019-08-11 21:33:38.000000000
+0200
+++ new/html2text-2020.1.16/ChangeLog.rst 2020-01-16 15:20:17.000000000
+0100
@@ -1,3 +1,25 @@
+2020.1.16
+=========
+----
+
+* Add type annotations.
+* Add support for Python 3.8.
+* Performance improvements when ``wrap_links`` is ``False`` (the default).
+* Configure setuptools using setup.cfg.
+
+
+2019.9.26
+=========
+----
+
+* Fix long blockquotes wrapping.
+* Remove the trailing whitespaces that were added after wrapping list items & blockquotes.
+* Remove support for Python ≤ 3.4. Now requires Python 3.5+.
+* Fix memory leak when processing a document containing a ``<abbr>`` tag.
+* Fix ``AttributeError`` when reading text from stdin.
+* Fix ``UnicodeEncodeError`` when writing output to stdout.
+
+
2019.8.11
=========
----
@@ -10,13 +32,16 @@
* Add ``__main__.py`` module to allow running the CLI using ``python -m
html2text ...``.
* Fix #238: correct spacing when a HTML entity follows a non-stressed tags
which follow a stressed tag.
* Remove unused or deprecated:
+
* ``html2text.compat.escape()``
* ``html2text.config.RE_UNESCAPE``
* ``html2text.HTML2Text.replaceEntities()``
* ``html2text.HTML2Text.unescape()``
* ``html2text.unescape()``
+
* Fix #208: handle LEFT-TO-RIGHT MARK after a stressed tag.
+
2018.1.9
========
----
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html2text-2019.8.11/PKG-INFO
new/html2text-2020.1.16/PKG-INFO
--- old/html2text-2019.8.11/PKG-INFO 2019-08-11 21:36:00.000000000 +0200
+++ new/html2text-2020.1.16/PKG-INFO 2020-01-16 15:21:10.000000000 +0100
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: html2text
-Version: 2019.8.11
+Version: 2020.1.16
Summary: Turn HTML into equivalent Markdown-structured text.
Home-page: https://github.com/Alir3z4/html2text/
Author: Aaron Swartz
@@ -101,14 +101,13 @@
Classifier: License :: OSI Approved :: GNU General Public License (GPL)
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
-Classifier: Programming Language :: Python :: 2
-Classifier: Programming Language :: Python :: 2.7
Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.4
Classifier: Programming Language :: Python :: 3.5
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Programming Language :: Python :: Implementation :: CPython
Classifier: Programming Language :: Python :: Implementation :: PyPy
-Requires-Python: >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*
+Requires-Python: >=3.5
Description-Content-Type: text/markdown
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html2text-2019.8.11/html2text/__init__.py
new/html2text-2020.1.16/html2text/__init__.py
--- old/html2text-2019.8.11/html2text/__init__.py 2019-08-11
21:35:55.000000000 +0200
+++ new/html2text-2020.1.16/html2text/__init__.py 2020-01-16
15:20:17.000000000 +0100
@@ -1,14 +1,16 @@
-# coding: utf-8
"""html2text: Turn HTML into equivalent Markdown-structured text."""
-from __future__ import division, unicode_literals
+import html.entities
+import html.parser
import re
-import sys
+import urllib.parse as urlparse
from textwrap import wrap
+from typing import Dict, List, Optional, Tuple, Union
-from html2text import config
-from html2text.compat import HTMLParser, urlparse
-from html2text.utils import (
+from . import config
+from .elements import AnchorElement, ListElement
+from .typing import OutCallback
+from .utils import (
dumb_css_parser,
element_style,
escape_md,
@@ -19,38 +21,32 @@
google_text_emphasis,
hn,
list_numbering_start,
- name2cp,
pad_tables_in_text,
skipwrap,
unifiable_n,
)
-try:
- chr = unichr
- nochr = unicode("")
-except NameError:
- # python3 uses chr
- nochr = str("")
-
-__version__ = (2019, 8, 11)
+__version__ = (2020, 1, 16)
# TODO:
# Support decoded entities with UNIFIABLE.
-class HTML2Text(HTMLParser.HTMLParser):
- def __init__(self, out=None, baseurl="", bodywidth=config.BODY_WIDTH):
+class HTML2Text(html.parser.HTMLParser):
+ def __init__(
+ self,
+ out: Optional[OutCallback] = None,
+ baseurl: str = "",
+ bodywidth: int = config.BODY_WIDTH,
+ ) -> None:
"""
Input parameters:
out: possible custom replacement for self.outtextf (which
appends lines of text).
baseurl: base URL of the document we process
"""
- kwargs = {}
- if sys.version_info >= (3, 4):
- kwargs["convert_charrefs"] = False
- HTMLParser.HTMLParser.__init__(self, **kwargs)
+ super().__init__(convert_charrefs=False)
# Config options
self.split_next_td = False
@@ -94,20 +90,20 @@
self.out = out
# empty list to store output characters before they are "joined"
- self.outtextlist = []
+ self.outtextlist = [] # type: List[str]
self.quiet = 0
self.p_p = 0 # number of newline character to print before next output
self.outcount = 0
self.start = True
self.space = False
- self.a = []
- self.astack = []
- self.maybe_automatic_link = None
+ self.a = [] # type: List[AnchorElement]
+ self.astack = [] # type: List[Optional[Dict[str, Optional[str]]]]
+ self.maybe_automatic_link = None # type: Optional[str]
self.empty_link = False
self.absolute_url_matcher = re.compile(r"^[a-zA-Z+]+://")
self.acount = 0
- self.list = []
+ self.list = [] # type: List[ListElement]
self.blockquote = 0
self.pre = False
self.startpre = False
@@ -117,52 +113,57 @@
self.lastWasNL = False
self.lastWasList = False
self.style = 0
- self.style_def = {}
- self.tag_stack = []
+ self.style_def = {} # type: Dict[str, Dict[str, str]]
+ self.tag_stack = (
+ []
+ ) # type: List[Tuple[str, Dict[str, Optional[str]], Dict[str, str]]]
self.emphasis = 0
self.drop_white_space = 0
self.inheader = False
- self.abbr_title = None # current abbreviation definition
- self.abbr_data = None # last inner HTML (for abbr being defined)
- self.abbr_list = {} # stack of abbreviations to write later
+ # Current abbreviation definition
+ self.abbr_title = None # type: Optional[str]
+ # Last inner HTML (for abbr being defined)
+ self.abbr_data = None # type: Optional[str]
+ # Stack of abbreviations to write later
+ self.abbr_list = {} # type: Dict[str, str]
self.baseurl = baseurl
self.stressed = False
self.preceding_stressed = False
- self.preceding_data = None
- self.current_tag = None
+ self.preceding_data = ""
+ self.current_tag = ""
config.UNIFIABLE["nbsp"] = " _place_holder;"
- def feed(self, data):
+ def feed(self, data: str) -> None:
data = data.replace("</' + 'script>", "</ignore>")
- HTMLParser.HTMLParser.feed(self, data)
+ super().feed(data)
- def handle(self, data):
+ def handle(self, data: str) -> str:
self.feed(data)
self.feed("")
- markdown = self.optwrap(self.close())
+ markdown = self.optwrap(self.finish())
if self.pad_tables:
return pad_tables_in_text(markdown)
else:
return markdown
- def outtextf(self, s):
+ def outtextf(self, s: str) -> None:
self.outtextlist.append(s)
if s:
self.lastWasNL = s[-1] == "\n"
- def close(self):
- HTMLParser.HTMLParser.close(self)
+ def finish(self) -> str:
+ self.close()
self.pbr()
self.o("", force="end")
- outtext = nochr.join(self.outtextlist)
+ outtext = "".join(self.outtextlist)
if self.unicode_snob:
- nbsp = chr(name2cp("nbsp"))
+ nbsp = html.entities.html5["nbsp;"]
else:
- nbsp = chr(32)
+ nbsp = " "
outtext = outtext.replace(" _place_holder;", nbsp)
# Clear self.outtextlist to avoid memory leak of its content to
@@ -171,10 +172,10 @@
return outtext
- def handle_charref(self, c):
+ def handle_charref(self, c: str) -> None:
self.handle_data(self.charref(c), True)
- def handle_entityref(self, c):
+ def handle_entityref(self, c: str) -> None:
ref = self.entityref(c)
# ref may be an empty string (e.g. for ‎/‏ markers that should
@@ -186,13 +187,13 @@
if ref:
self.handle_data(ref, True)
- def handle_starttag(self, tag, attrs):
- self.handle_tag(tag, attrs, 1)
+ def handle_starttag(self, tag: str, attrs: List[Tuple[str,
Optional[str]]]) -> None:
+ self.handle_tag(tag, dict(attrs), start=True)
- def handle_endtag(self, tag):
- self.handle_tag(tag, None, 0)
+ def handle_endtag(self, tag: str) -> None:
+ self.handle_tag(tag, {}, start=False)
- def previousIndex(self, attrs):
+ def previousIndex(self, attrs: Dict[str, Optional[str]]) -> Optional[int]:
"""
:type attrs: dict
@@ -202,17 +203,15 @@
"""
if "href" not in attrs:
return None
- i = -1
- for a in self.a:
- i += 1
- match = False
- if "href" in a and a["href"] == attrs["href"]:
- if "title" in a or "title" in attrs:
+ match = False
+ for i, a in enumerate(self.a):
+ if "href" in a.attrs and a.attrs["href"] == attrs["href"]:
+ if "title" in a.attrs or "title" in attrs:
if (
- "title" in a
+ "title" in a.attrs
and "title" in attrs
- and a["title"] == attrs["title"]
+ and a.attrs["title"] == attrs["title"]
):
match = True
else:
@@ -220,8 +219,11 @@
if match:
return i
+ return None
- def handle_emphasis(self, start, tag_style, parent_style):
+ def handle_emphasis(
+ self, start: bool, tag_style: Dict[str, str], parent_style: Dict[str,
str]
+ ) -> None:
"""
Handles various text emphases
"""
@@ -292,13 +294,10 @@
if strikethrough:
self.quiet -= 1
- def handle_tag(self, tag, attrs, start):
+ def handle_tag(
+ self, tag: str, attrs: Dict[str, Optional[str]], start: bool
+ ) -> None:
self.current_tag = tag
- # attrs is None for endtags
- if attrs is None:
- attrs = {}
- else:
- attrs = dict(attrs)
if self.tag_callback is not None:
if self.tag_callback(self, tag, attrs, start) is True:
@@ -321,7 +320,7 @@
# need the attributes of the parent nodes in order to get a
# complete style description for the current element. we assume
# that google docs export well formed html.
- parent_style = {}
+ parent_style = {} # type: Dict[str, str]
if start:
if self.tag_stack:
parent_style = self.tag_stack[-1][2]
@@ -390,8 +389,10 @@
self.blockquote -= 1
self.p()
- def no_preceding_space(self):
- return self.preceding_data and re.match(r"[^\s]",
self.preceding_data[-1])
+ def no_preceding_space(self: HTML2Text) -> bool:
+ return bool(
+ self.preceding_data and re.match(r"[^\s]",
self.preceding_data[-1])
+ )
if tag in ["em", "i", "u"] and not self.ignore_emphasis:
if start and no_preceding_space(self):
@@ -440,9 +441,10 @@
self.abbr_title = attrs["title"]
else:
if self.abbr_title is not None:
+ assert self.abbr_data is not None
self.abbr_list[self.abbr_data] = self.abbr_title
self.abbr_title = None
- self.abbr_data = ""
+ self.abbr_data = None
if tag == "q":
if not self.quote:
@@ -451,7 +453,7 @@
self.o(self.close_quote)
self.quote = not self.quote
- def link_url(self, link, title=""):
+ def link_url(self: HTML2Text, link: str, title: str = "") -> None:
url = urlparse.urljoin(self.baseurl, link)
title = ' "{}"'.format(title) if title.strip() else ""
self.o("]({url}{title})".format(url=escape_md(url), title=title))
@@ -476,31 +478,28 @@
if self.maybe_automatic_link and not self.empty_link:
self.maybe_automatic_link = None
elif a:
+ assert a["href"] is not None
if self.empty_link:
self.o("[")
self.empty_link = False
self.maybe_automatic_link = None
if self.inline_links:
- try:
- title = a["title"] if a["title"] else ""
- title = escape_md(title)
- except KeyError:
- link_url(self, a["href"], "")
- else:
- link_url(self, a["href"], title)
+ title = a.get("title") or ""
+ title = escape_md(title)
+ link_url(self, a["href"], title)
else:
i = self.previousIndex(a)
if i is not None:
- a = self.a[i]
+ a_props = self.a[i]
else:
self.acount += 1
- a["count"] = self.acount
- a["outcount"] = self.outcount
- self.a.append(a)
- self.o("][" + str(a["count"]) + "]")
+ a_props = AnchorElement(a, self.acount,
self.outcount)
+ self.a.append(a_props)
+ self.o("][" + str(a_props.count) + "]")
if tag == "img" and start and not self.ignore_images:
if "src" in attrs:
+ assert attrs["src"] is not None
if not self.images_to_alt:
attrs["href"] = attrs["src"]
alt = attrs.get("alt") or self.default_image_alt
@@ -512,8 +511,10 @@
):
self.o("<img src='" + attrs["src"] + "' ")
if "width" in attrs:
+ assert attrs["width"] is not None
self.o("width='" + attrs["width"] + "' ")
if "height" in attrs:
+ assert attrs["height"] is not None
self.o("height='" + attrs["height"] + "' ")
if alt:
self.o("alt='" + alt + "' ")
@@ -550,13 +551,12 @@
else:
i = self.previousIndex(attrs)
if i is not None:
- attrs = self.a[i]
+ a_props = self.a[i]
else:
self.acount += 1
- attrs["count"] = self.acount
- attrs["outcount"] = self.outcount
- self.a.append(attrs)
- self.o("[" + str(attrs["count"]) + "]")
+ a_props = AnchorElement(attrs, self.acount,
self.outcount)
+ self.a.append(a_props)
+ self.o("[" + str(a_props.count) + "]")
if tag == "dl" and start:
self.p()
@@ -569,7 +569,7 @@
if tag in ["ol", "ul"]:
# Google Docs create sub lists as top level lists
- if (not self.list) and (not self.lastWasList):
+ if not self.list and not self.lastWasList:
self.p()
if start:
if self.google_doc:
@@ -577,11 +577,11 @@
else:
list_style = tag
numbering_start = list_numbering_start(attrs)
- self.list.append({"name": list_style, "num": numbering_start})
+ self.list.append(ListElement(list_style, numbering_start))
else:
if self.list:
self.list.pop()
- if (not self.google_doc) and (not self.list):
+ if not self.google_doc and not self.list:
self.o("\n")
self.lastWasList = True
else:
@@ -593,18 +593,18 @@
if self.list:
li = self.list[-1]
else:
- li = {"name": "ul", "num": 0}
+ li = ListElement("ul", 0)
if self.google_doc:
nest_count = self.google_nest_count(tag_style)
else:
nest_count = len(self.list)
# TODO: line up <ol><li>s > 9 correctly.
self.o(" " * nest_count)
- if li["name"] == "ul":
+ if li.name == "ul":
self.o(self.ul_item_mark + " ")
- elif li["name"] == "ol":
- li["num"] += 1
- self.o(str(li["num"]) + ". ")
+ elif li.name == "ol":
+ li.num += 1
+ self.o(str(li.num) + ". ")
self.start = True
if tag in ["table", "tr", "td", "th"]:
@@ -671,21 +671,23 @@
self.p()
# TODO: Add docstring for these one letter functions
- def pbr(self):
+ def pbr(self) -> None:
"Pretty print has a line break"
if self.p_p == 0:
self.p_p = 1
- def p(self):
+ def p(self) -> None:
"Set pretty print to 1 or 2 lines"
self.p_p = 1 if self.single_line_break else 2
- def soft_br(self):
+ def soft_br(self) -> None:
"Soft breaks"
self.pbr()
self.br_toggle = " "
- def o(self, data, puredata=False, force=False):
+ def o(
+ self, data: str, puredata: bool = False, force: Union[bool, str] =
False
+ ) -> None:
"""
Deal with indentation and whitespace
"""
@@ -730,8 +732,7 @@
if not self.list:
bq += " "
# else: list content is already partially indented
- for i in range(len(self.list)):
- bq += " "
+ bq += " " * len(self.list)
data = data.replace("\n", "\n" + bq)
if self.startpre:
@@ -769,15 +770,16 @@
newa = []
for link in self.a:
- if self.outcount > link["outcount"]:
+ if self.outcount > link.outcount:
self.out(
" ["
- + str(link["count"])
+ + str(link.count)
+ "]: "
- + urlparse.urljoin(self.baseurl, link["href"])
+ + urlparse.urljoin(self.baseurl,
link.attrs["href"])
)
- if "title" in link:
- self.out(" (" + link["title"] + ")")
+ if "title" in link.attrs:
+ assert link.attrs["title"] is not None
+ self.out(" (" + link.attrs["title"] + ")")
self.out("\n")
else:
newa.append(link)
@@ -796,7 +798,7 @@
self.out(data)
self.outcount += 1
- def handle_data(self, data, entity_char=False):
+ def handle_data(self, data: str, entity_char: bool = False) -> None:
if not data:
# Data may be empty for some HTML entities. For example,
# LEFT-TO-RIGHT MARK.
@@ -839,7 +841,7 @@
self.preceding_data = data
self.o(data, puredata=True)
- def charref(self, name):
+ def charref(self, name: str) -> str:
if name[0] in ["x", "X"]:
c = int(name[1:], 16)
else:
@@ -853,21 +855,16 @@
except ValueError: # invalid unicode
return ""
- def entityref(self, c):
+ def entityref(self, c: str) -> str:
if not self.unicode_snob and c in config.UNIFIABLE:
return config.UNIFIABLE[c]
- else:
- try:
- cp = name2cp(c)
- except KeyError:
- return "&" + c + ";"
- else:
- if c == "nbsp":
- return config.UNIFIABLE[c]
- else:
- return chr(cp)
+ try:
+ ch = html.entities.html5[c + ";"]
+ except KeyError:
+ return "&" + c + ";"
+ return config.UNIFIABLE[c] if c == "nbsp" else ch
- def google_nest_count(self, style):
+ def google_nest_count(self, style: Dict[str, str]) -> int:
"""
Calculate the nesting count of google doc lists
@@ -881,7 +878,7 @@
return nest_count
- def optwrap(self, text):
+ def optwrap(self, text: str) -> str:
"""
Wrap all paragraphs in the provided text.
@@ -904,7 +901,13 @@
if not skipwrap(para, self.wrap_links, self.wrap_list_items):
indent = ""
if para.startswith(" " + self.ul_item_mark):
- indent = " " # For list items.
+ # list item continuation: add a double indent to the
+ # new lines
+ indent = " "
+ elif para.startswith("> "):
+ # blockquote continuation: add the greater than symbol
+ # to the new lines
+ indent = "> "
wrapped = wrap(
para,
self.body_width,
@@ -912,9 +915,12 @@
subsequent_indent=indent,
)
result += "\n".join(wrapped)
- if indent or para.endswith(" "):
+ if para.endswith(" "):
result += " \n"
newlines = 1
+ elif indent:
+ result += "\n"
+ newlines = 1
else:
result += "\n\n"
newlines = 2
@@ -933,7 +939,7 @@
return result
-def html2text(html, baseurl="", bodywidth=None):
+def html2text(html: str, baseurl: str = "", bodywidth: Optional[int] = None)
-> str:
if bodywidth is None:
bodywidth = config.BODY_WIDTH
h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth)
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html2text-2019.8.11/html2text/__main__.py
new/html2text-2020.1.16/html2text/__main__.py
--- old/html2text-2019.8.11/html2text/__main__.py 2019-08-11
21:27:39.000000000 +0200
+++ new/html2text-2020.1.16/html2text/__main__.py 2019-10-12
17:55:30.000000000 +0200
@@ -1,3 +1,3 @@
-from html2text.cli import main
+from .cli import main
main()
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html2text-2019.8.11/html2text/cli.py
new/html2text-2020.1.16/html2text/cli.py
--- old/html2text-2019.8.11/html2text/cli.py 2019-08-11 21:27:39.000000000
+0200
+++ new/html2text-2020.1.16/html2text/cli.py 2019-10-12 18:20:41.000000000
+0200
@@ -1,10 +1,10 @@
import argparse
+import sys
-from html2text import HTML2Text, __version__, config
-from html2text.utils import wrap_read, wrapwrite
+from . import HTML2Text, __version__, config
-def main():
+def main() -> None:
baseurl = ""
class bcolors:
@@ -256,10 +256,10 @@
with open(args.filename, "rb") as fp:
data = fp.read()
else:
- data = wrap_read()
+ data = sys.stdin.buffer.read()
try:
- data = data.decode(args.encoding, args.decode_errors)
+ html = data.decode(args.encoding, args.decode_errors)
except UnicodeDecodeError as err:
warning = bcolors.WARNING + "Warning:" + bcolors.ENDC
warning += " Use the " + bcolors.OKGREEN
@@ -303,4 +303,4 @@
h.open_quote = args.open_quote
h.close_quote = args.close_quote
- wrapwrite(h.handle(data))
+ sys.stdout.write(h.handle(html))
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html2text-2019.8.11/html2text/compat.py
new/html2text-2020.1.16/html2text/compat.py
--- old/html2text-2019.8.11/html2text/compat.py 2019-08-11 21:27:39.000000000
+0200
+++ new/html2text-2020.1.16/html2text/compat.py 1970-01-01 01:00:00.000000000
+0100
@@ -1,12 +0,0 @@
-import sys
-
-if sys.version_info[0] == 2:
- import htmlentitydefs
- import urlparse
- import HTMLParser
-else:
- import urllib.parse as urlparse
- import html.entities as htmlentitydefs
- import html.parser as HTMLParser
-
-__all__ = ["HTMLParser", "htmlentitydefs", "urlparse"]
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html2text-2019.8.11/html2text/config.py
new/html2text-2020.1.16/html2text/config.py
--- old/html2text-2019.8.11/html2text/config.py 2019-08-11 21:27:39.000000000
+0200
+++ new/html2text-2020.1.16/html2text/config.py 2019-08-15 12:56:54.000000000
+0200
@@ -1,5 +1,3 @@
-from __future__ import unicode_literals
-
import re
# Use Unicode characters instead of their ascii pseudo-replacements
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html2text-2019.8.11/html2text/elements.py
new/html2text-2020.1.16/html2text/elements.py
--- old/html2text-2019.8.11/html2text/elements.py 1970-01-01
01:00:00.000000000 +0100
+++ new/html2text-2020.1.16/html2text/elements.py 2019-10-12
18:20:41.000000000 +0200
@@ -0,0 +1,18 @@
+from typing import Dict, Optional
+
+
+class AnchorElement:
+ __slots__ = ["attrs", "count", "outcount"]
+
+ def __init__(self, attrs: Dict[str, Optional[str]], count: int, outcount:
int):
+ self.attrs = attrs
+ self.count = count
+ self.outcount = outcount
+
+
+class ListElement:
+ __slots__ = ["name", "num"]
+
+ def __init__(self, name: str, num: int):
+ self.name = name
+ self.num = num
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html2text-2019.8.11/html2text/typing.py
new/html2text-2020.1.16/html2text/typing.py
--- old/html2text-2019.8.11/html2text/typing.py 1970-01-01 01:00:00.000000000
+0100
+++ new/html2text-2020.1.16/html2text/typing.py 2019-10-12 18:20:41.000000000
+0200
@@ -0,0 +1,3 @@
+class OutCallback:
+ def __call__(self, s: str) -> None:
+ ...
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html2text-2019.8.11/html2text/utils.py
new/html2text-2020.1.16/html2text/utils.py
--- old/html2text-2019.8.11/html2text/utils.py 2019-08-11 21:27:39.000000000
+0200
+++ new/html2text-2020.1.16/html2text/utils.py 2020-01-16 15:08:28.000000000
+0100
@@ -1,20 +1,16 @@
-import sys
+import html.entities
+from typing import Dict, List, Optional
-from html2text import config
-from html2text.compat import htmlentitydefs
+from . import config
+unifiable_n = {
+ html.entities.name2codepoint[k]: v
+ for k, v in config.UNIFIABLE.items()
+ if k != "nbsp"
+}
-def name2cp(k):
- """Return sname to codepoint"""
- if k == "apos":
- return ord("'")
- return htmlentitydefs.name2codepoint[k]
-
-unifiable_n = {name2cp(k): v for k, v in config.UNIFIABLE.items() if k !=
"nbsp"}
-
-
-def hn(tag):
+def hn(tag: str) -> int:
if tag[0] == "h" and len(tag) == 2:
n = tag[1]
if "0" < n <= "9":
@@ -22,7 +18,7 @@
return 0
-def dumb_property_dict(style):
+def dumb_property_dict(style: str) -> Dict[str, str]:
"""
:returns: A hash of css attributes
"""
@@ -32,7 +28,7 @@
}
-def dumb_css_parser(data):
+def dumb_css_parser(data: str) -> Dict[str, Dict[str, str]]:
"""
:type data: str
@@ -49,16 +45,20 @@
# parse the css. reverted from dictionary comprehension in order to
# support older pythons
- elements = [x.split("{") for x in data.split("}") if "{" in x.strip()]
+ pairs = [x.split("{") for x in data.split("}") if "{" in x.strip()]
try:
- elements = {a.strip(): dumb_property_dict(b) for a, b in elements}
+ elements = {a.strip(): dumb_property_dict(b) for a, b in pairs}
except ValueError:
elements = {} # not that important
return elements
-def element_style(attrs, style_def, parent_style):
+def element_style(
+ attrs: Dict[str, Optional[str]],
+ style_def: Dict[str, Dict[str, str]],
+ parent_style: Dict[str, str],
+) -> Dict[str, str]:
"""
:type attrs: dict
:type style_def: dict
@@ -69,17 +69,19 @@
"""
style = parent_style.copy()
if "class" in attrs:
+ assert attrs["class"] is not None
for css_class in attrs["class"].split():
css_style = style_def.get("." + css_class, {})
style.update(css_style)
if "style" in attrs:
+ assert attrs["style"] is not None
immediate_style = dumb_property_dict(attrs["style"])
style.update(immediate_style)
return style
-def google_list_style(style):
+def google_list_style(style: Dict[str, str]) -> str:
"""
Finds out whether this is an ordered or unordered list
@@ -95,7 +97,7 @@
return "ol"
-def google_has_height(style):
+def google_has_height(style: Dict[str, str]) -> bool:
"""
Check if the style of the element has the 'height' attribute
explicitly defined
@@ -107,7 +109,7 @@
return "height" in style
-def google_text_emphasis(style):
+def google_text_emphasis(style: Dict[str, str]) -> List[str]:
"""
:type style: dict
@@ -125,7 +127,7 @@
return emphasis
-def google_fixed_width_font(style):
+def google_fixed_width_font(style: Dict[str, str]) -> bool:
"""
Check if the css of the current element defines a fixed width font
@@ -139,7 +141,7 @@
return "courier new" == font_family or "consolas" == font_family
-def list_numbering_start(attrs):
+def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int:
"""
Extract numbering from list element attributes
@@ -148,6 +150,7 @@
:rtype: int or None
"""
if "start" in attrs:
+ assert attrs["start"] is not None
try:
return int(attrs["start"]) - 1
except ValueError:
@@ -156,10 +159,10 @@
return 0
-def skipwrap(para, wrap_links, wrap_list_items):
+def skipwrap(para: str, wrap_links: bool, wrap_list_items: bool) -> bool:
# If it appears to contain a link
# don't wrap
- if (len(config.RE_LINK.findall(para)) > 0) and not wrap_links:
+ if not wrap_links and config.RE_LINK.search(para):
return True
# If the text begins with four spaces or one tab, it's a code block;
# don't wrap
@@ -187,25 +190,7 @@
)
-def wrapwrite(text):
- text = text.encode("utf-8")
- try: # Python3
- sys.stdout.buffer.write(text)
- except AttributeError:
- sys.stdout.write(text)
-
-
-def wrap_read():
- """
- :rtype: str
- """
- try:
- return sys.stdin.read()
- except AttributeError:
- return sys.stdin.buffer.read()
-
-
-def escape_md(text):
+def escape_md(text: str) -> str:
"""
Escapes markdown-sensitive characters within other markdown
constructs.
@@ -213,7 +198,7 @@
return config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text)
-def escape_md_section(text, snob=False):
+def escape_md_section(text: str, snob: bool = False) -> str:
"""
Escapes markdown-sensitive characters across whole document sections.
"""
@@ -229,7 +214,7 @@
return text
-def reformat_table(lines, right_margin):
+def reformat_table(lines: List[str], right_margin: int) -> List[str]:
"""
Given the lines of a table
padds the cells and returns the new lines
@@ -272,12 +257,13 @@
return new_lines
-def pad_tables_in_text(text, right_margin=1):
+def pad_tables_in_text(text: str, right_margin: int = 1) -> str:
"""
Provide padding for tables in the text
"""
lines = text.split("\n")
- table_buffer, table_started = [], False
+ table_buffer = [] # type: List[str]
+ table_started = False
new_lines = []
for line in lines:
# Toggle table started
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html2text-2019.8.11/html2text.egg-info/PKG-INFO
new/html2text-2020.1.16/html2text.egg-info/PKG-INFO
--- old/html2text-2019.8.11/html2text.egg-info/PKG-INFO 2019-08-11
21:35:58.000000000 +0200
+++ new/html2text-2020.1.16/html2text.egg-info/PKG-INFO 2020-01-16
15:21:10.000000000 +0100
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: html2text
-Version: 2019.8.11
+Version: 2020.1.16
Summary: Turn HTML into equivalent Markdown-structured text.
Home-page: https://github.com/Alir3z4/html2text/
Author: Aaron Swartz
@@ -101,14 +101,13 @@
Classifier: License :: OSI Approved :: GNU General Public License (GPL)
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
-Classifier: Programming Language :: Python :: 2
-Classifier: Programming Language :: Python :: 2.7
Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.4
Classifier: Programming Language :: Python :: 3.5
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Programming Language :: Python :: Implementation :: CPython
Classifier: Programming Language :: Python :: Implementation :: PyPy
-Requires-Python: >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*
+Requires-Python: >=3.5
Description-Content-Type: text/markdown
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html2text-2019.8.11/html2text.egg-info/SOURCES.txt
new/html2text-2020.1.16/html2text.egg-info/SOURCES.txt
--- old/html2text-2019.8.11/html2text.egg-info/SOURCES.txt 2019-08-11
21:35:59.000000000 +0200
+++ new/html2text-2020.1.16/html2text.egg-info/SOURCES.txt 2020-01-16
15:21:10.000000000 +0100
@@ -9,13 +9,16 @@
html2text/__init__.py
html2text/__main__.py
html2text/cli.py
-html2text/compat.py
html2text/config.py
+html2text/elements.py
+html2text/py.typed
+html2text/typing.py
html2text/utils.py
html2text.egg-info/PKG-INFO
html2text.egg-info/SOURCES.txt
html2text.egg-info/dependency_links.txt
html2text.egg-info/entry_points.txt
+html2text.egg-info/not-zip-safe
html2text.egg-info/top_level.txt
test/GoogleDocMassDownload.html
test/GoogleDocMassDownload.md
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html2text-2019.8.11/html2text.egg-info/not-zip-safe
new/html2text-2020.1.16/html2text.egg-info/not-zip-safe
--- old/html2text-2019.8.11/html2text.egg-info/not-zip-safe 1970-01-01
01:00:00.000000000 +0100
+++ new/html2text-2020.1.16/html2text.egg-info/not-zip-safe 2020-01-16
15:21:10.000000000 +0100
@@ -0,0 +1 @@
+
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html2text-2019.8.11/setup.cfg
new/html2text-2020.1.16/setup.cfg
--- old/html2text-2019.8.11/setup.cfg 2019-08-11 21:36:00.000000000 +0200
+++ new/html2text-2020.1.16/setup.cfg 2020-01-16 15:21:10.000000000 +0100
@@ -1,5 +1,42 @@
-[bdist_wheel]
-universal = 1
+[metadata]
+name = html2text
+version = attr: html2text.__version__
+description = Turn HTML into equivalent Markdown-structured text.
+long_description = file: README.md
+long_description_content_type = text/markdown
+url = https://github.com/Alir3z4/html2text/
+author = Aaron Swartz
+author_email = [email protected]
+maintainer = Alireza Savand
+maintainer_email = [email protected]
+license = GNU GPL 3
+classifiers =
+ Development Status :: 5 - Production/Stable
+ Intended Audience :: Developers
+ License :: OSI Approved :: GNU General Public License (GPL)
+ Operating System :: OS Independent
+ Programming Language :: Python
+ Programming Language :: Python :: 3
+ Programming Language :: Python :: 3.5
+ Programming Language :: Python :: 3.6
+ Programming Language :: Python :: 3.7
+ Programming Language :: Python :: 3.8
+ Programming Language :: Python :: 3 :: Only
+ Programming Language :: Python :: Implementation :: CPython
+ Programming Language :: Python :: Implementation :: PyPy
+platform = OS Independent
+
+[options]
+zip_safe = False
+packages = html2text
+python_requires = >=3.5
+
+[options.entry_points]
+console_scripts =
+ html2text = html2text.cli:main
+
+[options.package_data]
+html2text = py.typed
[flake8]
max_line_length = 88
@@ -13,6 +50,9 @@
line_length = 88
multi_line_output = 3
+[mypy]
+python_version = 3.5
+
[egg_info]
tag_build =
tag_date = 0
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html2text-2019.8.11/setup.py
new/html2text-2020.1.16/setup.py
--- old/html2text-2019.8.11/setup.py 2019-08-11 21:27:39.000000000 +0200
+++ new/html2text-2020.1.16/setup.py 2019-10-31 19:37:31.000000000 +0100
@@ -1,42 +1,3 @@
-# coding: utf-8
from setuptools import setup
-
-def readall(f):
- with open(f) as fp:
- return fp.read()
-
-
-setup(
- name="html2text",
- version=".".join(map(str, __import__("html2text").__version__)),
- description="Turn HTML into equivalent Markdown-structured text.",
- long_description=readall("README.md"),
- long_description_content_type="text/markdown",
- author="Aaron Swartz",
- author_email="[email protected]",
- maintainer="Alireza Savand",
- maintainer_email="[email protected]",
- url="https://github.com/Alir3z4/html2text/",
- platforms="OS Independent",
- classifiers=[
- "Development Status :: 5 - Production/Stable",
- "Intended Audience :: Developers",
- "License :: OSI Approved :: GNU General Public License (GPL)",
- "Operating System :: OS Independent",
- "Programming Language :: Python",
- "Programming Language :: Python :: 2",
- "Programming Language :: Python :: 2.7",
- "Programming Language :: Python :: 3",
- "Programming Language :: Python :: 3.4",
- "Programming Language :: Python :: 3.5",
- "Programming Language :: Python :: 3.6",
- "Programming Language :: Python :: 3.7",
- "Programming Language :: Python :: Implementation :: CPython",
- "Programming Language :: Python :: Implementation :: PyPy",
- ],
- python_requires=">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*",
- entry_points={"console_scripts": ["html2text = html2text.cli:main"]},
- license="GNU GPL 3",
- packages=["html2text"],
-)
+setup()
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html2text-2019.8.11/test/blockquote_example.html
new/html2text-2020.1.16/test/blockquote_example.html
--- old/html2text-2019.8.11/test/blockquote_example.html 2019-08-11
21:27:39.000000000 +0200
+++ new/html2text-2020.1.16/test/blockquote_example.html 2019-08-15
12:56:54.000000000 +0200
@@ -1,3 +1,3 @@
<blockquote>
-The time has come, the Walrus said, to speak of many things.
+"The time has come", the Walrus said, "To talk of many things: Of shoes - and ships - and sealing wax - Of cabbages - and kings- And why the sea is boiling hot - And whether pigs have wings."
</blockquote>
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html2text-2019.8.11/test/blockquote_example.md
new/html2text-2020.1.16/test/blockquote_example.md
--- old/html2text-2019.8.11/test/blockquote_example.md 2019-08-11
21:27:39.000000000 +0200
+++ new/html2text-2020.1.16/test/blockquote_example.md 2019-09-25
10:07:55.000000000 +0200
@@ -1,2 +1,4 @@
-> The time has come, the Walrus said, to speak of many things.
+> "The time has come", the Walrus said, "To talk of many things: Of shoes -
+> and ships - and sealing wax - Of cabbages - and kings- And why the sea is
+> boiling hot - And whether pigs have wings."
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html2text-2019.8.11/test/test_html2text.py
new/html2text-2020.1.16/test/test_html2text.py
--- old/html2text-2019.8.11/test/test_html2text.py 2019-08-11
21:27:39.000000000 +0200
+++ new/html2text-2020.1.16/test/test_html2text.py 2020-01-16
15:08:28.000000000 +0100
@@ -1,4 +1,3 @@
-import codecs
import glob
import os
import re
@@ -41,8 +40,7 @@
if base_fn.find("unicode") >= 0:
module_args["unicode_snob"] = True
- # There is no command-line option to control unicode_snob.
- cmdline_args = skip
+ cmdline_args.append("--unicode-snob")
func_args = skip
if base_fn.find("flip_emphasis") >= 0:
@@ -189,7 +187,7 @@
result = get_baseline(fn)
out = subprocess.check_output(cmd)
- actual = out.decode("utf8")
+ actual = out.decode()
actual = cleanup_eol(actual)
@@ -210,7 +208,7 @@
def get_baseline(fn):
name = get_baseline_name(fn)
- with codecs.open(name, mode="r", encoding="utf8") as f:
+ with open(name, encoding="utf-8") as f:
out = f.read()
return cleanup_eol(out)
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html2text-2019.8.11/test/test_memleak.py
new/html2text-2020.1.16/test/test_memleak.py
--- old/html2text-2019.8.11/test/test_memleak.py 2019-08-11
21:27:39.000000000 +0200
+++ new/html2text-2020.1.16/test/test_memleak.py 2019-09-25
09:41:57.000000000 +0200
@@ -17,3 +17,10 @@
h2t.handle(INSTR)
# And even less when the input is empty.
assert h2t.handle("") == "\n\n"
+
+
+def test_abbr_data():
+ h2t = html2text.HTML2Text()
+ result = h2t.handle('<p>foo <abbr title="Three Letter Acronym">TLA</abbr> bar</p>')
+ assert result == "foo TLA bar\n\n *[TLA]: Three Letter Acronym\n\n"
+ assert h2t.abbr_data is None
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html2text-2019.8.11/test/wrap_list_items_example.md
new/html2text-2020.1.16/test/wrap_list_items_example.md
--- old/html2text-2019.8.11/test/wrap_list_items_example.md 2019-08-11
21:27:39.000000000 +0200
+++ new/html2text-2020.1.16/test/wrap_list_items_example.md 2019-09-25
10:07:55.000000000 +0200
@@ -1,14 +1,14 @@
* One two three four five six seven eight nine ten eleven twelve thirteen
- fourteen fifteen sixteen seventeen eighteen nineteen twenty.
+ fourteen fifteen sixteen seventeen eighteen nineteen twenty.
* One two three four five six seven eight nine ten eleven twelve thirteen
- fourteen fifteen sixteen seventeen eighteen nineteen twenty.
+ fourteen fifteen sixteen seventeen eighteen nineteen twenty.
Text between lists.
* One two three four five six seven eight nine ten eleven twelve thirteen
- fourteen fifteen sixteen seventeen eighteen nineteen twenty.
+ fourteen fifteen sixteen seventeen eighteen nineteen twenty.
* One two three four five six seven eight nine ten eleven twelve thirteen
- fourteen fifteen sixteen seventeen eighteen nineteen twenty.
+ fourteen fifteen sixteen seventeen eighteen nineteen twenty.
Text after list.
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html2text-2019.8.11/tox.ini
new/html2text-2020.1.16/tox.ini
--- old/html2text-2019.8.11/tox.ini 2019-08-11 21:27:39.000000000 +0200
+++ new/html2text-2020.1.16/tox.ini 2019-10-31 19:37:31.000000000 +0100
@@ -3,7 +3,8 @@
black
flake8
isort
- py{27,34,35,36,37,py,py3}
+ mypy
+ py{35,36,37,38,py3}
minversion = 1.9
[testenv]
@@ -16,7 +17,7 @@
[testenv:black]
basepython = python3
commands =
- black --check --diff .
+ black --target-version py35 --check --diff .
deps =
black
skip_install = true
@@ -36,3 +37,8 @@
deps =
isort
skip_install = true
+
+[testenv:mypy]
+commands = mypy --strict html2text
+deps = mypy
+skip_install = true