Hello community,

here is the log from the commit of package python-w3lib for openSUSE:Factory checked in at 2019-09-04 09:10:05

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/python-w3lib (Old)
 and      /work/SRC/openSUSE:Factory/.python-w3lib.new.7948 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-w3lib"

Wed Sep  4 09:10:05 2019 rev:5 rq:727099 version:1.21.0

Changes:
--------
--- /work/SRC/openSUSE:Factory/python-w3lib/python-w3lib.changes       2019-03-29 20:43:30.906679078 +0100
+++ /work/SRC/openSUSE:Factory/.python-w3lib.new.7948/python-w3lib.changes     2019-09-04 09:10:08.478981525 +0200
@@ -1,0 +2,13 @@
+Thu Aug 29 13:15:56 UTC 2019 - Marketa Calabkova <[email protected]>
+
+- update to 1.21.0
+  * Add the "encoding" and "path_encoding" parameters to
+    w3lib.url.safe_download_url (issue #118)
+  * w3lib.url.safe_url_string now also removes tabs and new lines
+    (issue #133)
+  * w3lib.html.remove_comments now also removes truncated comments
+    (issue #129)
+  * w3lib.html.remove_tags_with_content no longer removes tags which
+    start with the same text as one of the specified tags (issue #114)
+
+-------------------------------------------------------------------
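The four upstream changes listed above can be exercised directly. Below is a
minimal sketch of the two URL-related fixes; it assumes w3lib 1.21.0 is
importable, and the expected outputs are copied from the new test cases shown
further down in this log.

    from w3lib.url import safe_url_string, safe_download_url

    # issue #133: ASCII tab and newline characters are now stripped
    # from the URL before it is split and escaped
    print(safe_url_string("http://example.com/test\r\n.html\t"))
    # -> http://example.com/test.html

    # issue #118: bytes input can be decoded with an explicit encoding,
    # with the path percent-encoded separately via path_encoding
    print(safe_download_url(b'http://www.example.org?\xc2\xa3',
                            encoding='utf-8', path_encoding='utf-8'))
    # -> http://www.example.org/?%C2%A3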
Old:
----
  w3lib-1.20.0.tar.gz

New:
----
  w3lib-1.21.0.tar.gz

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ python-w3lib.spec ++++++
--- /var/tmp/diff_new_pack.s3qdWQ/_old  2019-09-04 09:10:10.030981309 +0200
+++ /var/tmp/diff_new_pack.s3qdWQ/_new  2019-09-04 09:10:10.062981305 +0200
@@ -18,7 +18,7 @@
 %{?!python_module:%define python_module() python-%{**} python3-%{**}}
 Name:           python-w3lib
-Version:        1.20.0
+Version:        1.21.0
 Release:        0
 Summary:        Library of Web-Related Functions
 License:        BSD-3-Clause

++++++ w3lib-1.20.0.tar.gz -> w3lib-1.21.0.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/w3lib-1.20.0/PKG-INFO new/w3lib-1.21.0/PKG-INFO
--- old/w3lib-1.20.0/PKG-INFO   2019-01-11 15:01:52.000000000 +0100
+++ new/w3lib-1.21.0/PKG-INFO   2019-08-09 13:00:36.000000000 +0200
@@ -1,6 +1,6 @@
 Metadata-Version: 1.1
 Name: w3lib
-Version: 1.20.0
+Version: 1.21.0
 Summary: Library of web-related functions
 Home-page: https://github.com/scrapy/w3lib
 Author: Scrapy project
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/w3lib-1.20.0/docs/conf.py new/w3lib-1.21.0/docs/conf.py
--- old/w3lib-1.20.0/docs/conf.py       2019-01-11 15:01:17.000000000 +0100
+++ new/w3lib-1.21.0/docs/conf.py       2019-08-09 13:00:00.000000000 +0200
@@ -53,7 +53,7 @@
 # built documents.
 #
 # The full version, including alpha/beta/rc tags.
-release = '1.20.0'
+release = '1.21.0'
 # The short X.Y version.
 version = '.'.join(release.split('.')[:2])
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/w3lib-1.20.0/docs/index.rst new/w3lib-1.21.0/docs/index.rst
--- old/w3lib-1.20.0/docs/index.rst     2019-01-11 15:01:17.000000000 +0100
+++ new/w3lib-1.21.0/docs/index.rst     2019-08-09 13:00:00.000000000 +0200
@@ -39,7 +39,7 @@
 Tests
 =====
 
-`nose`_ is the preferred way to run tests. Just run: ``nosetests`` from the
+`pytest`_ is the preferred way to run tests. Just run: ``pytest`` from the
 root directory to execute tests using the default Python interpreter.
 
 `tox`_ could be used to run tests for all supported Python versions.
@@ -48,7 +48,7 @@
 Python interpreters.
 
 .. _tox: http://tox.testrun.org
-.. _nose: http://readthedocs.org/docs/nose/en/latest/
+.. _pytest: https://docs.pytest.org/en/latest/
 
 Changelog
@@ -74,4 +74,3 @@
 * :ref:`genindex`
 * :ref:`modindex`
 * :ref:`search`
-
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/w3lib-1.20.0/setup.py new/w3lib-1.21.0/setup.py
--- old/w3lib-1.20.0/setup.py   2019-01-11 15:01:17.000000000 +0100
+++ new/w3lib-1.21.0/setup.py   2019-08-09 13:00:00.000000000 +0200
@@ -3,7 +3,7 @@
 
 setup(
     name='w3lib',
-    version='1.20.0',
+    version='1.21.0',
     license='BSD',
     description='Library of web-related functions',
     author='Scrapy project',
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/w3lib-1.20.0/tests/test_html.py new/w3lib-1.21.0/tests/test_html.py
--- old/w3lib-1.20.0/tests/test_html.py 2019-01-11 15:01:17.000000000 +0100
+++ new/w3lib-1.21.0/tests/test_html.py 2019-08-09 13:00:00.000000000 +0200
@@ -106,6 +106,8 @@
         self.assertEqual(remove_comments(b"test <!--textcoment--> whatever"), u'test whatever')
         self.assertEqual(remove_comments(b"test <!--\ntextcoment\n--> whatever"), u'test whatever')
 
+        self.assertEqual(remove_comments(b"test <!--"), u'test ')
+
 
 class RemoveTagsTest(unittest.TestCase):
     def test_returns_unicode(self):
@@ -184,6 +186,10 @@
         # text with empty tags
         self.assertEqual(remove_tags_with_content(u'<br/>a<br />', which_ones=('br',)), u'a')
 
+    def test_tags_with_shared_prefix(self):
+        # https://github.com/scrapy/w3lib/issues/114
+        self.assertEqual(remove_tags_with_content(u'<span></span><s></s>', which_ones=('s',)), u'<span></span>')
+
 
 class ReplaceEscapeCharsTest(unittest.TestCase):
     def test_returns_unicode(self):
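The two new test cases above translate directly into the following usage
sketch for the HTML-related fixes (outputs are copied from the assertions):

    from w3lib.html import remove_comments, remove_tags_with_content

    # issue #129: a truncated (unclosed) comment is now removed as well
    print(remove_comments(b"test <!--"))
    # -> 'test '

    # issue #114: requesting removal of 's' no longer swallows <span>,
    # which merely starts with the same text as the requested tag
    print(remove_tags_with_content(u'<span></span><s></s>', which_ones=('s',)))
    # -> '<span></span>'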
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/w3lib-1.20.0/tests/test_url.py new/w3lib-1.21.0/tests/test_url.py
--- old/w3lib-1.20.0/tests/test_url.py  2019-01-11 15:01:17.000000000 +0100
+++ new/w3lib-1.21.0/tests/test_url.py  2019-08-09 13:00:00.000000000 +0200
@@ -59,6 +59,20 @@
 
         self.assertTrue(isinstance(safe_url_string(b'http://example.com/'), str))
 
+    def test_safe_url_string_remove_ascii_tab_and_newlines(self):
+        self.assertEqual(safe_url_string("http://example.com/test\n.html"),
+                         "http://example.com/test.html")
+        self.assertEqual(safe_url_string("http://example.com/test\t.html"),
+                         "http://example.com/test.html")
+        self.assertEqual(safe_url_string("http://example.com/test\r.html"),
+                         "http://example.com/test.html")
+        self.assertEqual(safe_url_string("http://example.com/test\r.html\n"),
+                         "http://example.com/test.html")
+        self.assertEqual(safe_url_string("http://example.com/test\r\n.html\t"),
+                         "http://example.com/test.html")
+        self.assertEqual(safe_url_string("http://example.com/test\a\n.html"),
+                         "http://example.com/test%07.html")
+
     def test_safe_url_string_unsafe_chars(self):
         safeurl = safe_url_string(r"http://localhost:8001/unwise{,},|,\,^,[,],`?|=[]&[]=|")
         self.assertEqual(safeurl, r"http://localhost:8001/unwise%7B,%7D,|,%5C,%5E,[,],%60?|=[]&[]=|")
@@ -203,6 +217,19 @@
                          'http://www.example.org/image')
         self.assertEqual(safe_download_url('http://www.example.org/dir/'),
                          'http://www.example.org/dir/')
+        self.assertEqual(safe_download_url(b'http://www.example.org/dir/'),
+                         'http://www.example.org/dir/')
+
+        # Encoding related tests
+        self.assertEqual(safe_download_url(b'http://www.example.org?\xa3',
+                                           encoding='latin-1', path_encoding='latin-1'),
+                         'http://www.example.org/?%A3')
+        self.assertEqual(safe_download_url(b'http://www.example.org?\xc2\xa3',
+                                           encoding='utf-8', path_encoding='utf-8'),
+                         'http://www.example.org/?%C2%A3')
+        self.assertEqual(safe_download_url(b'http://www.example.org/\xc2\xa3?\xc2\xa3',
+                                           encoding='utf-8', path_encoding='latin-1'),
+                         'http://www.example.org/%A3?%C2%A3')
 
     def test_is_url(self):
         self.assertTrue(is_url('http://www.example.org'))
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/w3lib-1.20.0/w3lib/__init__.py new/w3lib-1.21.0/w3lib/__init__.py
--- old/w3lib-1.20.0/w3lib/__init__.py  2019-01-11 15:01:17.000000000 +0100
+++ new/w3lib-1.21.0/w3lib/__init__.py  2019-08-09 13:00:00.000000000 +0200
@@ -1,3 +1,3 @@
-__version__ = "1.20.0"
+__version__ = "1.21.0"
 version_info = tuple(int(v) if v.isdigit() else v
                      for v in __version__.split('.'))
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/w3lib-1.20.0/w3lib/html.py new/w3lib-1.21.0/w3lib/html.py
--- old/w3lib-1.20.0/w3lib/html.py      2019-01-11 15:01:17.000000000 +0100
+++ new/w3lib-1.21.0/w3lib/html.py      2019-08-09 13:00:00.000000000 +0200
@@ -122,7 +122,7 @@
     return _tag_re.sub(token, to_unicode(text, encoding))
 
 
-_REMOVECOMMENTS_RE = re.compile(u'<!--.*?-->', re.DOTALL)
+_REMOVECOMMENTS_RE = re.compile(u'<!--.*?(?:-->|$)', re.DOTALL)
 def remove_comments(text, encoding=None):
     """ Remove HTML Comments.
 
@@ -220,7 +220,7 @@
 
     text = to_unicode(text, encoding)
     if which_ones:
-        tags = '|'.join([r'<%s.*?</%s>|<%s\s*/>' % (tag, tag, tag) for tag in which_ones])
+        tags = '|'.join([r'<%s\b.*?</%s>|<%s\s*/>' % (tag, tag, tag) for tag in which_ones])
         retags = re.compile(tags, re.DOTALL | re.IGNORECASE)
         text = retags.sub(u'', text)
     return text
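To see why the two small regex changes in w3lib/html.py above fix issues #129
and #114, here is a standalone demonstration with plain re, using the old and
new patterns from the hunks (the variable names are illustrative only):

    import re

    # remove_comments: the old pattern required a closing '-->'; the new
    # alternation '(?:-->|$)' also consumes a comment truncated at the
    # end of the input.
    old_c = re.compile(u'<!--.*?-->', re.DOTALL)
    new_c = re.compile(u'<!--.*?(?:-->|$)', re.DOTALL)
    print(old_c.sub(u'', u'test <!--'))  # -> 'test <!--' (comment kept)
    print(new_c.sub(u'', u'test <!--'))  # -> 'test '

    # remove_tags_with_content: without \b, the pattern built for tag 's'
    # matched from '<span' all the way to the closing '</s>', removing
    # everything in between; \b anchors the match to the complete tag name.
    old_t = re.compile(r'<s.*?</s>|<s\s*/>', re.DOTALL | re.IGNORECASE)
    new_t = re.compile(r'<s\b.*?</s>|<s\s*/>', re.DOTALL | re.IGNORECASE)
    print(old_t.sub(u'', u'<span></span><s></s>'))  # -> '' (everything removed)
    print(new_t.sub(u'', u'<span></span><s></s>'))  # -> '<span></span>'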
""" - safe_url = safe_url_string(url) + safe_url = safe_url_string(url, encoding, path_encoding) scheme, netloc, path, query, _ = urlsplit(safe_url) if path: path = _parent_dirs.sub('', posixpath.normpath(path)) - if url.endswith('/') and not path.endswith('/'): + if safe_url.endswith('/') and not path.endswith('/'): path += '/' else: path = '/' diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/w3lib-1.20.0/w3lib.egg-info/PKG-INFO new/w3lib-1.21.0/w3lib.egg-info/PKG-INFO --- old/w3lib-1.20.0/w3lib.egg-info/PKG-INFO 2019-01-11 15:01:52.000000000 +0100 +++ new/w3lib-1.21.0/w3lib.egg-info/PKG-INFO 2019-08-09 13:00:36.000000000 +0200 @@ -1,6 +1,6 @@ Metadata-Version: 1.1 Name: w3lib -Version: 1.20.0 +Version: 1.21.0 Summary: Library of web-related functions Home-page: https://github.com/scrapy/w3lib Author: Scrapy project
