Hello community, here is the log from the commit of package python-html2text for openSUSE:Factory checked in at 2018-08-24 17:04:32 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/python-html2text (Old) and /work/SRC/openSUSE:Factory/.python-html2text.new (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-html2text" Fri Aug 24 17:04:32 2018 rev:18 rq:629541 version:2018.1.9 Changes: -------- --- /work/SRC/openSUSE:Factory/python-html2text/python-html2text.changes 2017-05-02 08:54:48.643017336 +0200 +++ /work/SRC/openSUSE:Factory/.python-html2text.new/python-html2text.changes 2018-08-24 17:04:34.322104714 +0200 @@ -1,0 +2,36 @@ +Thu Aug 16 10:23:31 UTC 2018 - jeng...@inai.de + +- Use noun phrase for summary. Ensure accuracy and + neutrality of description. + +------------------------------------------------------------------- +Mon Aug 13 11:39:14 UTC 2018 - mc...@suse.com + +- Upgrade to 2018.1.9: + Fix #188: Non-ASCII in title attribute causes encode error. + Feature #194: Add support for the <kbd> tag. + Feature #193: Add support for the <q> tag. + Fix #157: Fix images link with div wrap + Fix #55: Fix error when empty title tags + Fix #160: The html2text tests are failing on Windows and on + Cygwin due to differences in eol handling between + Windows/*nix + Feature #164: Housekeeping: Add flake8 to the travis build, + cleanup existing flake8 violations, add py3.6 and pypy3 + to the travis build + Fix #109: Fix for unexpanded < > & + Fix #143: Fix line wrapping for the lines starting with bold + Adds support for numeric bold text indication in font-weight, + as used by Google (and presumably others.) + Fix #173 and #142: Stripping whitespace in crucial markdown + and adding whitespace as necessary + Don't drop any cell data on tables uneven row lengths + (e.g. 
colspan in use) + +------------------------------------------------------------------- +Mon Aug 13 08:38:04 UTC 2018 - mc...@suse.com + +- Remove dependency on unittetst2 + Added remove_unittest2.patch to facilitate that + +------------------------------------------------------------------- Old: ---- html2text-2016.9.19.tar.gz New: ---- html2text-2018.1.9.tar.gz remove_unittest2.patch ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ python-html2text.spec ++++++ --- /var/tmp/diff_new_pack.GorObB/_old 2018-08-24 17:04:35.334105915 +0200 +++ /var/tmp/diff_new_pack.GorObB/_new 2018-08-24 17:04:35.338105920 +0200 @@ -1,7 +1,7 @@ # # spec file for package python-html2text # -# Copyright (c) 2017 SUSE LINUX GmbH, Nuernberg, Germany. +# Copyright (c) 2018 SUSE LINUX GmbH, Nuernberg, Germany. # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -16,39 +16,38 @@ # -%bcond_without tests +# +%define upname html2text %{?!python_module:%define python_module() python-%{**} python3-%{**}} -Name: python-html2text -Version: 2016.9.19 +%bcond_without tests +Name: python-%{upname} +Version: 2018.1.9 Release: 0 -Url: https://github.com/Alir3z4/html2text/ -Summary: Turn HTML into equivalent Markdown-structured text -License: GPL-3.0 +Summary: Python script for turning HTML into Markdown text +License: GPL-3.0-only Group: Development/Languages/Python -Source: https://files.pythonhosted.org/packages/source/h/html2text/html2text-%{version}.tar.gz -BuildRoot: %{_tmppath}/%{name}-%{version}-build -BuildRequires: fdupes -BuildRequires: python-rpm-macros +URL: https://github.com/Alir3z4/html2text/ +Source: https://files.pythonhosted.org/packages/source/h/%{upname}/%{upname}-%{version}.tar.gz +Patch0: remove_unittest2.patch BuildRequires: %{python_module devel} BuildRequires: %{python_module setuptools} -%if %{with tests} 
-BuildRequires: python2-unittest2 -%endif +BuildRequires: fdupes +BuildRequires: python-rpm-macros Requires(post): update-alternatives Requires(preun): update-alternatives BuildArch: noarch %python_subpackages %description -html2text is a Python script that converts a page of HTML into clean, -easy-to-read plain ASCII text. Better yet, that ASCII also happens to -be valid Markdown (a text-to-HTML format). +html2text is a Python script that converts a page of HTML into +Markdown (a text-to-HTML format). %prep -%setup -q -n html2text-%{version} +%setup -q -n %{upname}-%{version} +%patch0 -p1 # remove useless shebang -sed -i '/^#!/d' html2text/__init__.py +sed -i '/^#!/d' %{upname}/__init__.py %build %python_build @@ -57,29 +56,26 @@ %python_install %python_expand %fdupes %{buildroot}%{$python_sitelib} -# To avoid conflicts with the rst2html5 package -mv %{buildroot}%{_bindir}/html2text %{buildroot}%{_bindir}/html2text-python -ln -s -f %{_sysconfdir}/alternatives/html2text %{buildroot}%{_bindir}/html2text +%python_clone -a %{buildroot}%{_bindir}/%{upname} + +# remove executable bits from egg files +%python_expand chmod -x %{buildroot}%{$python_sitelib}/%{upname}-*.egg-info/* %post -update-alternatives --install %{_bindir}/html2text html2text %{_bindir}/html2text-python 15 +%python_install_alternative html2text -%preun -if [ ! 
-f %{_bindir}/html2text-python ] ; then - update-alternatives --remove html2text %{_bindir}/html2text-python -fi +%postun +%python_uninstall_alternative html2text %if %{with tests} %check %python_exec setup.py test %endif -%files %python_files -%defattr(-,root,root,-) -%doc COPYING README.md AUTHORS.rst ChangeLog.rst -%python3_only %{_bindir}/html2text -%python3_only %{_bindir}/html2text-python -%python3_only %ghost %{_sysconfdir}/alternatives/html2text +%files %{python_files} +%license COPYING +%doc README.md AUTHORS.rst ChangeLog.rst +%python_alternative %{_bindir}/%{upname} %{python_sitelib}/* %changelog ++++++ html2text-2016.9.19.tar.gz -> html2text-2018.1.9.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/AUTHORS.rst new/html2text-2018.1.9/AUTHORS.rst --- old/html2text-2016.9.19/AUTHORS.rst 2016-05-29 18:08:48.000000000 +0200 +++ new/html2text-2018.1.9/AUTHORS.rst 2018-01-09 05:43:43.000000000 +0100 @@ -20,7 +20,13 @@ * Etienne Millon <m...@emillon.org> * John C F <gh: critiqjo> * Mikhail Melnik <by.zumz...@gmail.com> - +* Andres Rey +* Ciprian Miclaus +* Toshihiro Kamiya <kam...@mbj.nifty.com> +* Matt Dennewitz <mattdennew...@gmail.com> +* Jonathan Sundqvist <sundqvist.jonat...@gmail.com> +* Simon Meers <gh: DrMeers> +* Kurt McKee <contac...@kurtmckee.org> Maintainer: diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/ChangeLog.rst new/html2text-2018.1.9/ChangeLog.rst --- old/html2text-2016.9.19/ChangeLog.rst 2016-09-19 00:03:35.000000000 +0200 +++ new/html2text-2018.1.9/ChangeLog.rst 2018-01-09 05:43:43.000000000 +0100 @@ -1,3 +1,28 @@ +2018.9.1 +======== +---- + +* Fix #188: Non-ASCII in title attribute causes encode error. +* Feature #194: Add support for the <kbd> tag. +* Feature #193: Add support for the <q> tag. 
+ + +2017.10.4 +========== +---- + +* Fix #157: Fix images link with div wrap +* Fix #55: Fix error when empty title tags +* Fix #160: The html2text tests are failing on Windows and on Cygwin due to differences in eol handling between windows/*nix +* Feature #164: Housekeeping: Add flake8 to the travis build, cleanup existing flake8 violations, add py3.6 and pypy3 to the travis build +* Fix #109: Fix for unexpanded < > & +* Fix #143: Fix line wrapping for the lines starting with bold +* Adds support for numeric bold text indication in `font-weight`, + as used by Google (and presumably others.) +* Fix #173 and #142: Stripping whitespace in crucial markdown and adding whitespace as necessary +* Don't drop any cell data on tables uneven row lengths (e.g. colspan in use) + + 2016.9.19 ========= ---- @@ -124,7 +149,7 @@ ========== ---- -* Feature #49: Added a images_to_alt option to discard images and keep only their alt. +* Feature #49: Added an images_to_alt option to discard images and keep only their alt. * Feature #50: Protect links, surrounding them with angle brackets to avoid breaking... * Feature: Add ``setup.cfg`` file. diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/PKG-INFO new/html2text-2018.1.9/PKG-INFO --- old/html2text-2016.9.19/PKG-INFO 2016-09-19 00:08:46.000000000 +0200 +++ new/html2text-2018.1.9/PKG-INFO 2018-01-10 07:03:39.000000000 +0100 @@ -1,6 +1,6 @@ Metadata-Version: 1.1 Name: html2text -Version: 2016.9.19 +Version: 2018.1.9 Summary: Turn HTML into equivalent Markdown-structured text. 
Home-page: https://github.com/Alir3z4/html2text/ Author: Alireza Savand @@ -141,3 +141,4 @@ Classifier: Programming Language :: Python :: 3.3 Classifier: Programming Language :: Python :: 3.4 Classifier: Programming Language :: Python :: 3.5 +Classifier: Programming Language :: Python :: 3.6 diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/html2text/__init__.py new/html2text-2018.1.9/html2text/__init__.py --- old/html2text-2016.9.19/html2text/__init__.py 2016-09-19 00:03:55.000000000 +0200 +++ new/html2text-2018.1.9/html2text/__init__.py 2018-01-10 06:58:34.000000000 +0100 @@ -2,6 +2,7 @@ # coding: utf-8 """html2text: Turn HTML into equivalent Markdown-structured text.""" from __future__ import division +from __future__ import unicode_literals import re import sys @@ -10,7 +11,7 @@ except ImportError: # pragma: no cover pass -from html2text.compat import urlparse, HTMLParser, html_escape +from html2text.compat import urlparse, HTMLParser from html2text import config from html2text.utils import ( @@ -30,7 +31,14 @@ pad_tables_in_text ) -__version__ = (2016, 9, 19) +try: + chr = unichr + nochr = unicode('') +except NameError: + # python3 uses chr + nochr = str('') + +__version__ = (2018, 1, 9) # TODO: @@ -81,6 +89,8 @@ self.pad_tables = config.PAD_TABLES # covered in cli self.default_image_alt = config.DEFAULT_IMAGE_ALT # covered in cli self.tag_callback = None + self.open_quote = config.OPEN_QUOTE # covered in cli + self.close_quote = config.CLOSE_QUOTE # covered in cli if out is None: # pragma: no cover self.out = self.outtextf @@ -106,6 +116,7 @@ self.pre = 0 self.startpre = 0 self.code = False + self.quote = False self.br_toggle = '' self.lastWasNL = 0 self.lastWasList = False @@ -119,6 +130,10 @@ self.abbr_data = None # last inner HTML (for abbr being defined) self.abbr_list = {} # stack of abbreviations to write later self.baseurl = baseurl + self.stressed = False + self.preceding_stressed = False 
+ self.preceding_data = None + self.current_tag = None try: del unifiable_n[name2cp('nbsp')] @@ -147,22 +162,15 @@ def close(self): HTMLParser.HTMLParser.close(self) - try: - nochr = unicode('') - unicode_character = unichr - except NameError: - nochr = str('') - unicode_character = chr - self.pbr() self.o('', 0, 'end') outtext = nochr.join(self.outtextlist) if self.unicode_snob: - nbsp = unicode_character(name2cp('nbsp')) + nbsp = chr(name2cp('nbsp')) else: - nbsp = unicode_character(32) + nbsp = chr(32) try: outtext = outtext.replace(unicode(' _place_holder;'), nbsp) except NameError: @@ -175,17 +183,10 @@ return outtext def handle_charref(self, c): - charref = self.charref(c) - if not self.code and not self.pre: - charref = html_escape(charref) - self.handle_data(charref, True) + self.handle_data(self.charref(c), True) def handle_entityref(self, c): - entityref = self.entityref(c) - if (not self.code and not self.pre - and entityref != ' _place_holder;'): - entityref = html_escape(entityref) - self.handle_data(entityref, True) + self.handle_data(self.entityref(c), True) def handle_starttag(self, tag, attrs): self.handle_tag(tag, attrs, 1) @@ -208,10 +209,11 @@ i += 1 match = 0 - if ('href' in a) and a['href'] == attrs['href']: - if ('title' in a) or ('title' in attrs): - if (('title' in a) and ('title' in attrs) and - a['title'] == attrs['title']): + if 'href' in a and a['href'] == attrs['href']: + if 'title' in a or 'title' in attrs: + if 'title' in a and \ + 'title' in attrs and \ + a['title'] == attrs['title']: match = True else: match = True @@ -229,8 +231,16 @@ # handle Google's text emphasis strikethrough = 'line-through' in \ tag_emphasis and self.hide_strikethrough - bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis - italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis + + # google and others may mark a font's weight as `bold` or `700` + bold = False + for bold_marker in config.BOLD_TEXT_STYLE_VALUES: + bold = (bold_marker 
in tag_emphasis + and bold_marker not in parent_emphasis) + if bold: + break + + italic = 'italic' in tag_emphasis and 'italic' not in parent_emphasis fixed = google_fixed_width_font(tag_style) and not \ google_fixed_width_font(parent_style) and not self.pre @@ -282,6 +292,7 @@ self.quiet -= 1 def handle_tag(self, tag, attrs, start): + self.current_tag = tag # attrs is None for endtags if attrs is None: attrs = {} @@ -292,10 +303,11 @@ if self.tag_callback(self, tag, attrs, start) is True: return - # first thing inside the anchor tag is another tag that produces some output - if (start and not self.maybe_automatic_link is None - and tag not in ['p', 'div', 'style', 'dl', 'dt'] - and (tag != "img" or self.ignore_images)): + # first thing inside the anchor tag is another tag + # that produces some output + if (start and self.maybe_automatic_link is not None and + tag not in ['p', 'div', 'style', 'dl', 'dt'] and + (tag != "img" or self.ignore_images)): self.o("[") self.maybe_automatic_link = None self.empty_link = False @@ -312,7 +324,8 @@ tag_style = element_style(attrs, self.style_def, parent_style) self.tag_stack.append((tag, attrs, tag_style)) else: - dummy, attrs, tag_style = self.tag_stack.pop() if self.tag_stack else (None, {}, {}) + dummy, attrs, tag_style = self.tag_stack.pop() \ + if self.tag_stack else (None, {}, {}) if self.tag_stack: parent_style = self.tag_stack[-1][2] @@ -331,6 +344,8 @@ self.p() else: self.soft_br() + elif self.astack and tag == 'div': + pass else: self.p() @@ -370,24 +385,49 @@ self.blockquote -= 1 self.p() + def no_preceding_space(self): + return (self.preceding_data + and re.match(r'[^\s]', self.preceding_data[-1])) + if tag in ['em', 'i', 'u'] and not self.ignore_emphasis: - self.o(self.emphasis_mark) + if start and no_preceding_space(self): + emphasis = ' ' + self.emphasis_mark + else: + emphasis = self.emphasis_mark + + self.o(emphasis) + if start: + self.stressed = True + if tag in ['strong', 'b'] and not self.ignore_emphasis: - 
self.o(self.strong_mark) - if tag in ['del', 'strike', 's']: + if start and no_preceding_space(self): + strong = ' ' + self.strong_mark + else: + strong = self.strong_mark + + self.o(strong) if start: - self.o('~~') + self.stressed = True + + if tag in ['del', 'strike', 's']: + if start and no_preceding_space(self): + strike = ' ~~' else: - self.o('~~') + strike = '~~' + + self.o(strike) + if start: + self.stressed = True if self.google_doc: if not self.inheader: # handle some font attributes, but leave headers clean self.handle_emphasis(start, tag_style, parent_style) - if tag in ["code", "tt"] and not self.pre: + if tag in ["kbd", "code", "tt"] and not self.pre: self.o('`') # TODO: `` `this` `` self.code = not self.code + if tag == "abbr": if start: self.abbr_title = None @@ -400,17 +440,30 @@ self.abbr_title = None self.abbr_data = '' + if tag == "q": + if not self.quote: + self.o(self.open_quote) + else: + self.o(self.close_quote) + self.quote = not self.quote + + def link_url(self, link, title=""): + url = urlparse.urljoin(self.baseurl, link) + title = ' "{0}"'.format(title) if title.strip() else '' + self.o(']({url}{title})'.format(url=escape_md(url), + title=title)) + if tag == "a" and not self.ignore_links: if start: - if ('href' in attrs) and \ - (attrs['href'] is not None) and \ - not (self.skip_internal_links and - attrs['href'].startswith('#')): + if 'href' in attrs and \ + attrs['href'] is not None and not \ + (self.skip_internal_links and + attrs['href'].startswith('#')): self.astack.append(attrs) self.maybe_automatic_link = attrs['href'] self.empty_link = True if self.protect_links: - attrs['href'] = '<'+attrs['href']+'>' + attrs['href'] = '<' + attrs['href'] + '>' else: self.astack.append(None) else: @@ -425,12 +478,12 @@ self.maybe_automatic_link = None if self.inline_links: try: - title = escape_md(a['title']) + title = a['title'] if a['title'] else '' + title = escape_md(title) except KeyError: - self.o("](" + 
escape_md(urlparse.urljoin(self.baseurl, a['href'])) + ")") + link_url(self, a['href'], '') else: - self.o("](" + escape_md(urlparse.urljoin(self.baseurl, a['href'])) - + ' "' + title + '" )') + link_url(self, a['href'], title) else: i = self.previousIndex(a) if i is not None: @@ -463,7 +516,7 @@ return # If we have a link to create, output the start - if not self.maybe_automatic_link is None: + if self.maybe_automatic_link is not None: href = self.maybe_automatic_link if self.images_to_alt and escape_md(alt) == href and \ self.absolute_url_matcher.match(href): @@ -483,7 +536,16 @@ self.o("![" + escape_md(alt) + "]") if self.inline_links: href = attrs.get('href') or '' - self.o("(" + escape_md(urlparse.urljoin(self.baseurl, href)) + ")") + self.o( + "(" + + escape_md( + urlparse.urljoin( + self.baseurl, + href + ) + ) + + ")" + ) else: i = self.previousIndex(attrs) if i is not None: @@ -576,11 +638,11 @@ if start: self.table_start = True if self.pad_tables: - self.o("<"+config.TABLE_MARKER_FOR_PAD+">") + self.o("<" + config.TABLE_MARKER_FOR_PAD + ">") self.o(" \n") else: if self.pad_tables: - self.o("</"+config.TABLE_MARKER_FOR_PAD+">") + self.o("</" + config.TABLE_MARKER_FOR_PAD + ">") self.o(" \n") if tag in ["td", "th"] and start: if self.split_next_td: @@ -654,8 +716,9 @@ return if self.startpre: - #self.out(" :") #TODO: not output when already one there - if not data.startswith("\n"): # <pre>stuff... + # self.out(" :") #TODO: not output when already one there + if not data.startswith("\n") and not data.startswith("\r\n"): + # <pre>stuff... 
data = "\n" + data if self.mark_code: self.out("\n[code]") @@ -668,7 +731,7 @@ if self.pre: if not self.list: bq += " " - #else: list content is already partially indented + # else: list content is already partially indented for i in range(len(self.list)): bq += " " data = data.replace("\n", "\n" + bq) @@ -700,8 +763,8 @@ self.out(' ') self.space = 0 - if self.a and ((self.p_p == 2 and self.links_each_paragraph) - or force == "end"): + if self.a and ((self.p_p == 2 and self.links_each_paragraph) or + force == "end"): if force == "end": self.out("\n") @@ -731,13 +794,25 @@ self.outcount += 1 def handle_data(self, data, entity_char=False): + if self.stressed: + data = data.strip() + self.stressed = False + self.preceding_stressed = True + elif (self.preceding_stressed + and re.match(r'[^\s.!?]', data[0]) + and not hn(self.current_tag) + and self.current_tag not in ['a', 'code', 'pre']): + # should match a letter or common punctuation + data = ' ' + data + self.preceding_stressed = False + if self.style: self.style_def.update(dumb_css_parser(data)) - if not self.maybe_automatic_link is None: + if self.maybe_automatic_link is not None: href = self.maybe_automatic_link - if (href == data and self.absolute_url_matcher.match(href) - and self.use_automatic_links): + if (href == data and self.absolute_url_matcher.match(href) and + self.use_automatic_links): self.o("<" + data + ">") self.empty_link = False return @@ -748,6 +823,7 @@ if not self.code and not self.pre and not entity_char: data = escape_md_section(data, snob=self.escape_snob) + self.preceding_data = data self.o(data, 1) def unknown_decl(self, data): # pragma: no cover @@ -764,10 +840,7 @@ return unifiable_n[c] else: try: - try: - return unichr(c) - except NameError: # Python3 - return chr(c) + return chr(c) except ValueError: # invalid unicode return '' @@ -783,10 +856,7 @@ if c == 'nbsp': return config.UNIFIABLE[c] else: - try: - return unichr(name2cp(c)) - except NameError: # Python3 - return chr(name2cp(c)) 
+ return chr(name2cp(c)) def replaceEntities(self, s): s = s.group(1) @@ -809,7 +879,7 @@ nest_count = 0 if 'margin-left' in style: nest_count = int(style['margin-left'][:-2]) \ - // self.google_list_indent + // self.google_list_indent return nest_count diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/html2text/cli.py new/html2text-2018.1.9/html2text/cli.py --- old/html2text-2016.9.19/html2text/cli.py 2016-09-18 23:51:18.000000000 +0200 +++ new/html2text-2018.1.9/html2text/cli.py 2018-01-09 05:43:43.000000000 +0100 @@ -158,7 +158,8 @@ action="store_true", dest="ignore_tables", default=config.IGNORE_TABLES, - help="Ignore table-related tags (table, th, td, tr) while keeping rows." + help="Ignore table-related tags (table, th, td, tr) " + "while keeping rows." ) p.add_option( "--single-line-break", @@ -211,7 +212,24 @@ action="store", type="string", default=config.DECODE_ERRORS, - help="What to do in case of decode errors.'ignore', 'strict' and 'replace' are acceptable values" + help="What to do in case of decode errors.'ignore', 'strict' and " + "'replace' are acceptable values" + ) + p.add_option( + "--open-quote", + dest="open_quote", + action="store", + type="str", + default=config.OPEN_QUOTE, + help="The character used to open quotes", + ) + p.add_option( + "--close-quote", + dest="close_quote", + action="store", + type="str", + default=config.CLOSE_QUOTE, + help="The character used to close quotes", ) (options, args) = p.parse_args() @@ -226,8 +244,11 @@ file_ = args[0] if file_.startswith('http://') or file_.startswith('https://'): - warnings.warn("Support for retrieving html over network is set for deprecation by version (2017, 1, x)", - DeprecationWarning) + warnings.warn( + "Support for retrieving html over network is set for " + "deprecation by version (2017, 1, x)", + DeprecationWarning + ) baseurl = file_ j = urllib.urlopen(baseurl) data = j.read() @@ -235,7 +256,8 @@ try: from feedparser 
import _getCharacterEncoding as enc except ImportError: - enc = lambda x, y: ('utf-8', 1) + def enc(x, y): + return ('utf-8', 1) encoding = enc(j.headers, data)[0] if encoding == 'us-ascii': encoding = 'utf-8' @@ -245,7 +267,8 @@ try: from chardet import detect except ImportError: - detect = lambda x: {'encoding': 'utf-8'} + def detect(x): + return {'encoding': 'utf-8'} encoding = detect(data)['encoding'] else: data = wrap_read() @@ -295,5 +318,7 @@ h.wrap_links = options.wrap_links h.pad_tables = options.pad_tables h.default_image_alt = options.default_image_alt + h.open_quote = options.open_quote + h.close_quote = options.close_quote wrapwrite(h.handle(data)) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/html2text/compat.py new/html2text-2018.1.9/html2text/compat.py --- old/html2text-2016.9.19/html2text/compat.py 2016-09-18 23:51:18.000000000 +0200 +++ new/html2text-2018.1.9/html2text/compat.py 2017-10-04 08:29:40.000000000 +0200 @@ -13,5 +13,9 @@ import html.parser as HTMLParser import urllib.request as urllib from html import escape + def html_escape(s): return escape(s, quote=False) + + +__all__ = ['HTMLParser', 'html_escape', 'htmlentitydefs', 'urllib', 'urlparse'] diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/html2text/config.py new/html2text-2018.1.9/html2text/config.py --- old/html2text-2016.9.19/html2text/config.py 2016-09-18 23:51:18.000000000 +0200 +++ new/html2text-2018.1.9/html2text/config.py 2018-01-09 05:43:43.000000000 +0100 @@ -1,6 +1,8 @@ +from __future__ import unicode_literals + import re -# Use Unicode characters instead of their ascii psuedo-replacements +# Use Unicode characters instead of their ascii pseudo-replacements UNICODE_SNOB = 0 # Marker to use for marking tables for padding post processing @@ -31,6 +33,9 @@ # Number of pixels Google indents nested lists GOOGLE_LIST_INDENT = 36 +# Values Google and 
others may use to indicate bold text +BOLD_TEXT_STYLE_VALUES = ('bold', '700', '800', '900') + IGNORE_ANCHORS = False IGNORE_IMAGES = False IMAGES_TO_ALT = False @@ -41,7 +46,8 @@ DEFAULT_IMAGE_ALT = '' PAD_TABLES = False -# Convert links with same href and text to <href> format if they are absolute links +# Convert links with same href and text to <href> format +# if they are absolute links USE_AUTOMATIC_LINKS = True # For checking space-only lines on line 771 @@ -52,7 +58,10 @@ RE_UNORDERED_LIST_MATCHER = re.compile(r'[-\*\+]\s') RE_MD_CHARS_MATCHER = re.compile(r"([\\\[\]\(\)])") RE_MD_CHARS_MATCHER_ALL = re.compile(r"([`\*_{}\[\]\(\)#!])") -RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)") # to find links in the text + +# to find links in the text +RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)") + RE_MD_DOT_MATCHER = re.compile(r""" ^ # start of line (\s*\d+) # optional whitespace and a number @@ -126,6 +135,11 @@ IGNORE_TABLES = False -# Use a single line break after a block element rather an two line breaks. +# Use a single line break after a block element rather than two line breaks. # NOTE: Requires body width setting to be 0. SINGLE_LINE_BREAK = False + + +# Use double quotation marks when converting the <q> tag. +OPEN_QUOTE = '"' +CLOSE_QUOTE = '"' diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/html2text/utils.py new/html2text-2018.1.9/html2text/utils.py --- old/html2text-2016.9.19/html2text/utils.py 2016-09-18 23:51:18.000000000 +0200 +++ new/html2text-2018.1.9/html2text/utils.py 2017-10-04 08:29:40.000000000 +0200 @@ -12,7 +12,6 @@ unifiable_n = {} - for k in config.UNIFIABLE.keys(): unifiable_n[name2cp(k)] = config.UNIFIABLE[k] @@ -191,7 +190,7 @@ # I'm not sure what this is for; I thought it was to detect lists, # but there's a <br>-inside-<span> case in one of the tests that # also depends upon it. 
- if stripped[0:1] == '-' or stripped[0:1] == '*': + if stripped[0:1] in ('-', '*') and not stripped[0:2] == '**': return True # If the text begins with a single -, *, or +, followed by a space, @@ -245,6 +244,7 @@ return text + def reformat_table(lines, right_margin): """ Given the lines of a table @@ -252,11 +252,24 @@ """ # find the maximum width of the columns max_width = [len(x.rstrip()) + right_margin for x in lines[0].split('|')] + max_cols = len(max_width) for line in lines: cols = [x.rstrip() for x in line.split('|')] + num_cols = len(cols) + + # don't drop any data if colspan attributes result in unequal lengths + if num_cols < max_cols: + cols += [''] * (max_cols - num_cols) + elif max_cols < num_cols: + max_width += [ + len(x) + right_margin for x in + cols[-(num_cols - max_cols):] + ] + max_cols = num_cols + max_width = [max(len(x) + right_margin, old_len) for x, old_len in zip(cols, max_width)] - + # reformat new_lines = [] for line in lines: @@ -272,15 +285,16 @@ new_lines.append('|'.join(new_cols)) return new_lines + def pad_tables_in_text(text, right_margin=1): """ Provide padding for tables in the text """ lines = text.split('\n') - table_buffer, altered_lines, table_widths, table_started = [], [], [], False + table_buffer, table_started = [], False new_lines = [] for line in lines: - # Toogle table started + # Toggle table started if (config.TABLE_MARKER_FOR_PAD in line): table_started = not table_started if not table_started: diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/html2text.egg-info/PKG-INFO new/html2text-2018.1.9/html2text.egg-info/PKG-INFO --- old/html2text-2016.9.19/html2text.egg-info/PKG-INFO 2016-09-19 00:08:46.000000000 +0200 +++ new/html2text-2018.1.9/html2text.egg-info/PKG-INFO 2018-01-10 07:03:39.000000000 +0100 @@ -1,6 +1,6 @@ Metadata-Version: 1.1 Name: html2text -Version: 2016.9.19 +Version: 2018.1.9 Summary: Turn HTML into equivalent Markdown-structured text. 
Home-page: https://github.com/Alir3z4/html2text/ Author: Alireza Savand @@ -141,3 +141,4 @@ Classifier: Programming Language :: Python :: 3.3 Classifier: Programming Language :: Python :: 3.4 Classifier: Programming Language :: Python :: 3.5 +Classifier: Programming Language :: Python :: 3.6 diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/html2text.egg-info/SOURCES.txt new/html2text-2018.1.9/html2text.egg-info/SOURCES.txt --- old/html2text-2016.9.19/html2text.egg-info/SOURCES.txt 2016-09-19 00:08:46.000000000 +0200 +++ new/html2text-2018.1.9/html2text.egg-info/SOURCES.txt 2018-01-10 07:03:39.000000000 +0100 @@ -35,6 +35,8 @@ test/bodywidth_newline.md test/bold_inside_link.html test/bold_inside_link.md +test/bold_long_line.html +test/bold_long_line.md test/break_preserved_in_blockquote.html test/break_preserved_in_blockquote.md test/css_import_no_semicolon.html @@ -49,8 +51,12 @@ test/doc_with_table_bypass.md test/emdash-para.html test/emdash-para.md +test/emphasis_preserved_whitespace.html +test/emphasis_preserved_whitespace.md test/empty-link.html test/empty-link.md +test/empty-title-tag.html +test/empty-title-tag.md test/flip_emphasis.html test/flip_emphasis.md test/google-like_font-properties.html @@ -65,6 +71,8 @@ test/html_entities_out_of_text.md test/images_to_alt.html test/images_to_alt.md +test/images_with_div_wrap.html +test/images_with_div_wrap.md test/images_with_size.html test/images_with_size.md test/img-tag-with-link.html @@ -75,6 +83,8 @@ test/invalid_start.md test/invalid_unicode.html test/invalid_unicode.md +test/kbd_tag.html +test/kbd_tag.md test/link_titles.html test/link_titles.md test/list_tags_example.html @@ -109,6 +119,8 @@ test/preformatted_in_list.md test/protect_links.html test/protect_links.md +test/q_tag.html +test/q_tag.md test/single_line_break.html test/single_line_break.md test/table_ignore.html diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/html2text-2016.9.19/setup.py new/html2text-2018.1.9/setup.py --- old/html2text-2016.9.19/setup.py 2016-05-29 18:13:44.000000000 +0200 +++ new/html2text-2018.1.9/setup.py 2017-10-04 08:31:57.000000000 +0200 @@ -3,11 +3,20 @@ from setuptools import setup, Command, find_packages + +def read_md_convert(f): + return convert(f, 'rst') + + +def read_md_open(f): + return open(f, 'r').read() + + try: from pypandoc import convert - read_md = lambda f: convert(f, 'rst') + read_md = read_md_convert except ImportError: - read_md = lambda f: open(f, 'r').read() + read_md = read_md_open requires_list = [] try: @@ -69,6 +78,7 @@ 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', ], entry_points=""" [console_scripts] diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/test/bold_long_line.html new/html2text-2018.1.9/test/bold_long_line.html --- old/html2text-2016.9.19/test/bold_long_line.html 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2018.1.9/test/bold_long_line.html 2017-10-04 08:29:40.000000000 +0200 @@ -0,0 +1,3 @@ +<p> +<b>text</b> and a very long long long long long long long long long long long long long long long long long long long long line +</p> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/test/bold_long_line.md new/html2text-2018.1.9/test/bold_long_line.md --- old/html2text-2016.9.19/test/bold_long_line.md 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2018.1.9/test/bold_long_line.md 2017-10-04 08:29:40.000000000 +0200 @@ -0,0 +1,3 @@ +**text** and a very long long long long long long long long long long long +long long long long long long long long long line + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' 
old/html2text-2016.9.19/test/emphasis_preserved_whitespace.html new/html2text-2018.1.9/test/emphasis_preserved_whitespace.html --- old/html2text-2016.9.19/test/emphasis_preserved_whitespace.html 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2018.1.9/test/emphasis_preserved_whitespace.html 2017-10-04 08:29:40.000000000 +0200 @@ -0,0 +1,20 @@ +<p><em> emphasis </em></p> +<p><em>emphasis: </em>some text</p> +<p><em>repeat: </em>again</p> + +<p><b> bold </b></p> +<p><b>bold: </b>some text</p> +<p><b>repeat: </b>again</p> + +<p><strike> strike </strike></p> +<p><strike>strike: </strike>some text</p> +<p><strike>strike: </strike>again</p> + +<p>separate<em> emphasis</em> some more text</p> + +<!-- Various punctuation has no space --> +<p><em>emphasis</em>.</p> +<p><em>emphasis</em>?</p> +<p><em>emphasis</em>!</p> + +<p><em>em1</em><em>em2</em></p> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/test/emphasis_preserved_whitespace.md new/html2text-2018.1.9/test/emphasis_preserved_whitespace.md --- old/html2text-2016.9.19/test/emphasis_preserved_whitespace.md 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2018.1.9/test/emphasis_preserved_whitespace.md 2017-10-04 08:29:40.000000000 +0200 @@ -0,0 +1,28 @@ +_emphasis_ + +_emphasis:_ some text + +_repeat:_ again + +**bold** + +**bold:** some text + +**repeat:** again + +~~strike~~ + +~~strike:~~ some text + +~~strike:~~ again + +separate _emphasis_ some more text + +_emphasis_. + +_emphasis_? + +_emphasis_! 
+ +_em1_ _em2_ + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/test/empty-link.html new/html2text-2018.1.9/test/empty-link.html --- old/html2text-2016.9.19/test/empty-link.html 2016-05-29 18:08:43.000000000 +0200 +++ new/html2text-2018.1.9/test/empty-link.html 2017-10-04 08:29:40.000000000 +0200 @@ -1,6 +1,6 @@ <h1>Processing empty hyperlinks</h1> -<p>This test checks wheter empty hyperlinks still appear in the markdown result.</p> +<p>This test checks whether empty hyperlinks still appear in the markdown result.</p> <a href="http://some.link"></a> <a href="http://some.link"><p></p></a> \ No newline at end of file diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/test/empty-link.md new/html2text-2018.1.9/test/empty-link.md --- old/html2text-2016.9.19/test/empty-link.md 2016-05-29 18:08:43.000000000 +0200 +++ new/html2text-2018.1.9/test/empty-link.md 2017-10-04 08:29:40.000000000 +0200 @@ -1,6 +1,6 @@ # Processing empty hyperlinks -This test checks wheter empty hyperlinks still appear in the markdown result. +This test checks whether empty hyperlinks still appear in the markdown result. 
[](http://some.link) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/test/empty-title-tag.html new/html2text-2018.1.9/test/empty-title-tag.html --- old/html2text-2016.9.19/test/empty-title-tag.html 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2018.1.9/test/empty-title-tag.html 2017-10-04 08:29:40.000000000 +0200 @@ -0,0 +1 @@ +<a href="test.html" title>This is an A tag with an empty title property</a> \ No newline at end of file diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/test/empty-title-tag.md new/html2text-2018.1.9/test/empty-title-tag.md --- old/html2text-2016.9.19/test/empty-title-tag.md 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2018.1.9/test/empty-title-tag.md 2017-10-04 08:29:40.000000000 +0200 @@ -0,0 +1,2 @@ +[This is an A tag with an empty title property](test.html) + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/test/google-like_font-properties.html new/html2text-2018.1.9/test/google-like_font-properties.html --- old/html2text-2016.9.19/test/google-like_font-properties.html 2016-09-18 23:51:18.000000000 +0200 +++ new/html2text-2018.1.9/test/google-like_font-properties.html 2017-10-04 08:29:40.000000000 +0200 @@ -5,6 +5,12 @@ <BODY> <p><span style="font-weight: bold">font-weight: bold</span></p> <P><SPAN STYLE="FONT-WEIGHT: BOLD">FONT-WEIGHT: BOLD</SPAN></P> + <P><SPAN STYLE="font-weight: 700">font-weight: 700</SPAN></P> + <P><SPAN STYLE="FONT-WEIGHT: 700">FONT-WEIGHT: 700</SPAN></P> + <P><SPAN STYLE="font-weight: 800">font-weight: 800</SPAN></P> + <P><SPAN STYLE="FONT-WEIGHT: 800">FONT-WEIGHT: 800</SPAN></P> + <P><SPAN STYLE="font-weight: 900">font-weight: 900</SPAN></P> + <P><SPAN STYLE="FONT-WEIGHT: 900">FONT-WEIGHT: 900</SPAN></P> <p><span style="font-style: italic">font-style: italic</span></p> <P><SPAN STYLE="FONT-STYLE: 
ITALIC">FONT-STYLE: ITALIC</SPAN></P> <p><span style="font-weight: bold;font-style: italic"> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/test/google-like_font-properties.md new/html2text-2018.1.9/test/google-like_font-properties.md --- old/html2text-2016.9.19/test/google-like_font-properties.md 2016-09-18 23:51:18.000000000 +0200 +++ new/html2text-2018.1.9/test/google-like_font-properties.md 2017-10-04 08:29:40.000000000 +0200 @@ -1,5 +1,11 @@ **font-weight: bold** **FONT-WEIGHT: BOLD** +**font-weight: 700** +**FONT-WEIGHT: 700** +**font-weight: 800** +**FONT-WEIGHT: 800** +**font-weight: 900** +**FONT-WEIGHT: 900** _font-style: italic_ _FONT-STYLE: ITALIC_ _**font-weight: bold;font-style: italic**_ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/test/html-escaping.html new/html2text-2018.1.9/test/html-escaping.html --- old/html2text-2016.9.19/test/html-escaping.html 2016-05-29 18:08:43.000000000 +0200 +++ new/html2text-2018.1.9/test/html-escaping.html 2017-10-04 08:29:40.000000000 +0200 @@ -1,3 +1,3 @@ -<p>Escaped HTML like &lt;div&gt; or &amp; should remain escaped on output</p> -<pre>...unless that escaped HTML is in a &lt;pre&gt; tag</pre> +<p>Escaped HTML like &lt;div&gt; or &amp; should NOT remain escaped on output</p> +<pre>...even when that escaped HTML is in a &lt;pre&gt; tag</pre> <code>...or a &lt;code&gt; tag</code> \ No newline at end of file diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/test/html-escaping.md new/html2text-2018.1.9/test/html-escaping.md --- old/html2text-2016.9.19/test/html-escaping.md 2016-05-29 18:08:43.000000000 +0200 +++ new/html2text-2018.1.9/test/html-escaping.md 2017-10-04 08:29:40.000000000 +0200 @@ -1,8 +1,8 @@ -Escaped HTML like &lt;div&gt; or &amp; should remain escaped on output +Escaped HTML like <div> or & should NOT remain escaped on output - ...unless that escaped HTML
is in a <pre> tag + ...even when that escaped HTML is in a <pre> tag `...or a <code> tag` diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/test/images_with_div_wrap.html new/html2text-2018.1.9/test/images_with_div_wrap.html --- old/html2text-2016.9.19/test/images_with_div_wrap.html 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2018.1.9/test/images_with_div_wrap.html 2017-10-04 08:29:40.000000000 +0200 @@ -0,0 +1 @@ +<a href="http://example.com"><div><img src="http://example.com/img.png"/></div></a> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/test/images_with_div_wrap.md new/html2text-2018.1.9/test/images_with_div_wrap.md --- old/html2text-2016.9.19/test/images_with_div_wrap.md 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2018.1.9/test/images_with_div_wrap.md 2017-10-04 08:29:40.000000000 +0200 @@ -0,0 +1,2 @@ +[![](http://example.com/img.png)](http://example.com) + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/test/kbd_tag.html new/html2text-2018.1.9/test/kbd_tag.html --- old/html2text-2016.9.19/test/kbd_tag.html 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2018.1.9/test/kbd_tag.html 2018-01-09 05:43:43.000000000 +0100 @@ -0,0 +1 @@ +Press <kbd>[CTRL]+c</kbd> to copy. diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/test/kbd_tag.md new/html2text-2018.1.9/test/kbd_tag.md --- old/html2text-2016.9.19/test/kbd_tag.md 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2018.1.9/test/kbd_tag.md 2018-01-09 05:43:43.000000000 +0100 @@ -0,0 +1,2 @@ +Press `[CTRL]+c` to copy. 
+ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/test/link_titles.md new/html2text-2018.1.9/test/link_titles.md --- old/html2text-2016.9.19/test/link_titles.md 2016-05-29 18:08:43.000000000 +0200 +++ new/html2text-2018.1.9/test/link_titles.md 2017-10-04 08:29:40.000000000 +0200 @@ -1,3 +1,3 @@ -[ first example](http://example.com "MyTitle" ) +[ first example](http://example.com "MyTitle") [ second example](http://example.com) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/test/list_tags_example.html new/html2text-2018.1.9/test/list_tags_example.html --- old/html2text-2016.9.19/test/list_tags_example.html 2016-05-29 18:08:43.000000000 +0200 +++ new/html2text-2018.1.9/test/list_tags_example.html 2017-10-04 08:29:40.000000000 +0200 @@ -34,6 +34,6 @@ </ol> <ul style="list-style-type:ordered;"> -<li>somthing else here</li> +<li>something else here</li> <li>some item</li> </ul> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/test/list_tags_example.md new/html2text-2018.1.9/test/list_tags_example.md --- old/html2text-2016.9.19/test/list_tags_example.md 2016-05-29 18:08:48.000000000 +0200 +++ new/html2text-2018.1.9/test/list_tags_example.md 2017-10-04 08:29:40.000000000 +0200 @@ -33,6 +33,6 @@ 2. some item 3. 
some item - * somthing else here + * something else here * some item diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/test/pad_table.html new/html2text-2018.1.9/test/pad_table.html --- old/html2text-2016.9.19/test/pad_table.html 2016-05-29 18:08:48.000000000 +0200 +++ new/html2text-2018.1.9/test/pad_table.html 2017-10-04 08:29:40.000000000 +0200 @@ -22,5 +22,30 @@ <tr> <td>Content 1</td> <td>Content 2 longer</td> <td><img src="http://lorempixel.com/200/200" alt="200"/> Image!</td> </tr> </table> -something else entirely +something else entirely<br> + + <table> + <thead> + <tr><th>One</th><th>Two</th><th>Three</th></tr> + </thead> + <tbody> + <tr><td>A</td><td>B</td><th>C</th></tr> + <tr><td>A</td><td colspan="2">B+C</td></tr> + <tr><td colspan="2">A+B</td><td>C</td></tr> + <tr><td colspan="3">A+B+C</td></tr> + </tbody> + </table> + + <table> + <thead> + <tr><th colspan="2">One+Two</th><th>Three</th></tr> + </thead> + <tbody> + <tr><td>A</td><td>B</td><th>C</th></tr> + <tr><td>A</td><td colspan="2">B+C</td></tr> + <tr><td colspan="2">A+B</td><td>C</td></tr> + <tr><td colspan="3">A+B+C</td></tr> + </tbody> + </table> + </body> </html> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/test/pad_table.md new/html2text-2018.1.9/test/pad_table.md --- old/html2text-2016.9.19/test/pad_table.md 2016-05-29 18:08:48.000000000 +0200 +++ new/html2text-2018.1.9/test/pad_table.md 2017-10-04 08:29:40.000000000 +0200 @@ -24,5 +24,19 @@ Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image! Content 1 | Content 2 longer | ![200](http://lorempixel.com/200/200) Image! 
-something else entirely +something else entirely +One | Two | Three +------|-----|------- +A | B | C +A | B+C +A+B | C +A+B+C + +One+Two | Three +--------|------- +A | B | C +A | B+C +A+B | C +A+B+C + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/test/pre.html new/html2text-2018.1.9/test/pre.html --- old/html2text-2016.9.19/test/pre.html 2016-05-29 18:08:43.000000000 +0200 +++ new/html2text-2018.1.9/test/pre.html 2017-10-04 08:29:40.000000000 +0200 @@ -1,6 +1,6 @@ <html> <head> - <title>initial crowsed pre handling test #1</title> + <title>initial crowded pre handling test #1</title> </head> <body> <pre>a diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/test/q_tag.html new/html2text-2018.1.9/test/q_tag.html --- old/html2text-2016.9.19/test/q_tag.html 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2018.1.9/test/q_tag.html 2018-01-09 05:43:43.000000000 +0100 @@ -0,0 +1 @@ +<q>If this is a test,</q> he said, <q>then it should pass</q>. diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/test/q_tag.md new/html2text-2018.1.9/test/q_tag.md --- old/html2text-2016.9.19/test/q_tag.md 1970-01-01 01:00:00.000000000 +0100 +++ new/html2text-2018.1.9/test/q_tag.md 2018-01-09 05:43:43.000000000 +0100 @@ -0,0 +1,2 @@ +"If this is a test," he said, "then it should pass". 
+ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/test/test_html2text.py new/html2text-2018.1.9/test/test_html2text.py --- old/html2text-2016.9.19/test/test_html2text.py 2016-09-18 23:51:18.000000000 +0200 +++ new/html2text-2018.1.9/test/test_html2text.py 2017-10-04 08:29:40.000000000 +0200 @@ -1,5 +1,7 @@ import codecs import glob +import html2text +import logging import os import re import subprocess @@ -9,12 +11,20 @@ import unittest2 as unittest else: import unittest -import logging + logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s', level=logging.DEBUG) -import html2text + +def cleanup_eol(clean_str): + if os.name == 'nt' or sys.platform == 'cygwin': + # Fix the unwanted CR to CRCRLF replacement + # during text pipelining on Windows/cygwin + # on cygwin, os.name == 'posix', not nt + clean_str = re.sub(r'\r+', '\r', clean_str) + clean_str = clean_str.replace('\r\n', '\n') + return clean_str def test_module(fn, google_doc=False, **kwargs): @@ -31,9 +41,9 @@ setattr(h, k, v) result = get_baseline(fn) - inf = open(fn) - actual = h.handle(inf.read()) - inf.close() + with open(fn) as inf: + actual = cleanup_eol(inf.read()) + actual = h.handle(actual) return result, actual @@ -56,11 +66,7 @@ actual = out.decode('utf8') - if os.name == 'nt': - # Fix the unwanted CR to CRCRLF replacement - # during text pipelining on Windows/cygwin - actual = re.sub(r'\r+', '\r', actual) - actual = actual.replace('\r\n', '\n') + actual = cleanup_eol(actual) return result, actual @@ -82,30 +88,62 @@ def get_baseline(fn): name = get_baseline_name(fn) - f = codecs.open(name, mode='r', encoding='utf8') - out = f.read() - f.close() + with codecs.open(name, mode='r', encoding='utf8') as f: + out = f.read() + out = cleanup_eol(out) return out class TestHTML2Text(unittest.TestCase): - pass + + def test_html_escape(self): + self.assertEqual( + html2text.compat.html_escape('<pre>and then<div> & other tags'), + 
'&lt;pre&gt;and then&lt;div&gt; &amp; other tags' + ) + + def test_unescape(self): + self.assertEqual( + '<pre>and then<div> & other tags', + html2text.unescape( + '&lt;pre&gt;and then&lt;div&gt; &amp; other tags' + ) + ) + + def _skip_certain_tags(self, h2t, tag, attrs, start): + if tag == 'b': + return True + + def test_tag_callback(self): + h = html2text.HTML2Text() + h.tag_callback = self._skip_certain_tags + ret = h.handle( + 'this is a <b>txt</b> and this is a' + ' <b class="skip">with text</b> and ' + 'some <i>italics</i> too.' + ) + self.assertEqual( + ret, + 'this is a txt and this is a' + ' with text and ' + 'some _italics_ too.\n\n' + ) def generate_test(fn): - def test_mod(self): + def _test_mod(self): self.maxDiff = None result, actual = test_module(fn, **module_args) self.assertEqual(result, actual) - def test_cmd(self): + def _test_cmd(self): # Because there is no command-line option to control unicode_snob if 'unicode_snob' not in module_args: self.maxDiff = None result, actual = test_command(fn, *cmdline_args) self.assertEqual(result, actual) - def test_func(self): + def _test_func(self): result, actual = test_function(fn, **func_args) self.assertEqual(result, actual) @@ -184,14 +222,19 @@ if base_fn not in ['bodywidth_newline.html', 'abbr_tag.html']: test_func = None + else: + test_func = _test_func if base_fn == 'inplace_baseurl_substitution.html': module_args['baseurl'] = 'http://brettterpstra.com' module_args['body_width'] = 0 # there is no way to specify baseurl in cli :( test_cmd = None + else: + test_cmd = _test_cmd + + return _test_mod, test_cmd, test_func - return test_mod, test_cmd, test_func # Originally from http://stackoverflow.com/questions/32899/\ # how-to-generate-dynamic-parametrized-unit-tests-in-python diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html2text-2016.9.19/test/test_memleak.py new/html2text-2018.1.9/test/test_memleak.py --- old/html2text-2016.9.19/test/test_memleak.py 2016-05-29 18:08:43.000000000 +0200
+++ new/html2text-2018.1.9/test/test_memleak.py 2017-10-04 08:29:40.000000000 +0200 @@ -1,14 +1,15 @@ +import html2text +import logging import sys if sys.version_info[:2] < (2, 7): import unittest2 as unittest else: import unittest -import logging + + logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s', level=logging.DEBUG) -import html2text - class TestMemleak(unittest.TestCase): """ ++++++ remove_unittest2.patch ++++++ --- a/setup.py +++ b/setup.py @@ -18,14 +18,7 @@ try: except ImportError: read_md = read_md_open -requires_list = [] -try: - import unittest2 as unittest -except ImportError: - import unittest -else: - if sys.version_info <= (2, 6): - requires_list.append("unittest2") +import unittest class RunTests(Command): @@ -68,9 +61,6 @@ setup( 'Operating System :: OS Independent', 'Programming Language :: Python', 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.4', - 'Programming Language :: Python :: 2.5', - 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.0', @@ -85,7 +75,6 @@ setup( html2text=html2text.cli:main """, license='GNU GPL 3', - requires=requires_list, packages=find_packages(exclude=['test']), include_package_data=True, zip_safe=False, --- a/test/test_html2text.py +++ b/test/test_html2text.py @@ -7,10 +7,7 @@ import re import subprocess import sys -if sys.version_info[:2] < (2, 7): - import unittest2 as unittest -else: - import unittest +import unittest logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s', --- a/test/test_memleak.py +++ b/test/test_memleak.py @@ -1,10 +1,7 @@ import html2text import logging import sys -if sys.version_info[:2] < (2, 7): - import unittest2 as unittest -else: - import unittest +import unittest logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s',