Hello community,

here is the log from the commit of package python-w3lib for openSUSE:Factory checked in at 2019-09-04 09:10:05

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/python-w3lib (Old)
 and      /work/SRC/openSUSE:Factory/.python-w3lib.new.7948 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-w3lib"

Wed Sep  4 09:10:05 2019 rev:5 rq:727099 version:1.21.0

Changes:
--------
--- /work/SRC/openSUSE:Factory/python-w3lib/python-w3lib.changes       2019-03-29 20:43:30.906679078 +0100
+++ /work/SRC/openSUSE:Factory/.python-w3lib.new.7948/python-w3lib.changes     2019-09-04 09:10:08.478981525 +0200
@@ -1,0 +2,13 @@
+Thu Aug 29 13:15:56 UTC 2019 - Marketa Calabkova <[email protected]>
+
+- update to 1.21.0
+  * Add the "encoding" and "path_encoding" parameters to
+    w3lib.url.safe_download_url (issue #118)
+  * w3lib.url.safe_url_string now also removes tabs and new lines
+    (issue #133)
+  * w3lib.html.remove_comments now also removes truncated comments
+    (issue #129)
+  * w3lib.html.remove_tags_with_content no longer removes tags which
+    start with the same text as one of the specified tags (issue #114)
+
+-------------------------------------------------------------------
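The four upstream changes listed above can be exercised directly. Below is a
minimal sketch of the two URL-related fixes; it assumes w3lib 1.21.0 is
importable, and the expected outputs are copied from the new test cases shown
further down in this log.

    from w3lib.url import safe_url_string, safe_download_url

    # issue #133: ASCII tab and newline characters are now stripped
    # from the URL before it is split and escaped
    print(safe_url_string("http://example.com/test\r\n.html\t"))
    # -> http://example.com/test.html

    # issue #118: bytes input can be decoded with an explicit encoding,
    # with the path percent-encoded separately via path_encoding
    print(safe_download_url(b'http://www.example.org?\xc2\xa3',
                            encoding='utf-8', path_encoding='utf-8'))
    # -> http://www.example.org/?%C2%A3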
Old:
----
  w3lib-1.20.0.tar.gz

New:
----
  w3lib-1.21.0.tar.gz

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ python-w3lib.spec ++++++
--- /var/tmp/diff_new_pack.s3qdWQ/_old  2019-09-04 09:10:10.030981309 +0200
+++ /var/tmp/diff_new_pack.s3qdWQ/_new  2019-09-04 09:10:10.062981305 +0200
@@ -18,7 +18,7 @@
 %{?!python_module:%define python_module() python-%{**} python3-%{**}}
 Name:           python-w3lib
-Version:        1.20.0
+Version:        1.21.0
 Release:        0
 Summary:        Library of Web-Related Functions
 License:        BSD-3-Clause

++++++ w3lib-1.20.0.tar.gz -> w3lib-1.21.0.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/w3lib-1.20.0/PKG-INFO new/w3lib-1.21.0/PKG-INFO
--- old/w3lib-1.20.0/PKG-INFO   2019-01-11 15:01:52.000000000 +0100
+++ new/w3lib-1.21.0/PKG-INFO   2019-08-09 13:00:36.000000000 +0200
@@ -1,6 +1,6 @@
 Metadata-Version: 1.1
 Name: w3lib
-Version: 1.20.0
+Version: 1.21.0
 Summary: Library of web-related functions
 Home-page: https://github.com/scrapy/w3lib
 Author: Scrapy project
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/w3lib-1.20.0/docs/conf.py new/w3lib-1.21.0/docs/conf.py
--- old/w3lib-1.20.0/docs/conf.py       2019-01-11 15:01:17.000000000 +0100
+++ new/w3lib-1.21.0/docs/conf.py       2019-08-09 13:00:00.000000000 +0200
@@ -53,7 +53,7 @@
 # built documents.
 #
 # The full version, including alpha/beta/rc tags.
-release = '1.20.0'
+release = '1.21.0'
 # The short X.Y version.
 version = '.'.join(release.split('.')[:2])
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/w3lib-1.20.0/docs/index.rst new/w3lib-1.21.0/docs/index.rst
--- old/w3lib-1.20.0/docs/index.rst     2019-01-11 15:01:17.000000000 +0100
+++ new/w3lib-1.21.0/docs/index.rst     2019-08-09 13:00:00.000000000 +0200
@@ -39,7 +39,7 @@
 Tests
 =====
 
-`nose`_ is the preferred way to run tests. Just run: ``nosetests`` from the
+`pytest`_ is the preferred way to run tests. Just run: ``pytest`` from the
 root directory to execute tests using the default Python interpreter.
 
 `tox`_ could be used to run tests for all supported Python versions.
@@ -48,7 +48,7 @@
 Python interpreters.
 
 .. _tox: http://tox.testrun.org
-.. _nose: http://readthedocs.org/docs/nose/en/latest/
+.. _pytest: https://docs.pytest.org/en/latest/
 
 Changelog
@@ -74,4 +74,3 @@
 * :ref:`genindex`
 * :ref:`modindex`
 * :ref:`search`
-
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/w3lib-1.20.0/setup.py new/w3lib-1.21.0/setup.py
--- old/w3lib-1.20.0/setup.py   2019-01-11 15:01:17.000000000 +0100
+++ new/w3lib-1.21.0/setup.py   2019-08-09 13:00:00.000000000 +0200
@@ -3,7 +3,7 @@
 
 setup(
     name='w3lib',
-    version='1.20.0',
+    version='1.21.0',
     license='BSD',
     description='Library of web-related functions',
     author='Scrapy project',
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/w3lib-1.20.0/tests/test_html.py new/w3lib-1.21.0/tests/test_html.py
--- old/w3lib-1.20.0/tests/test_html.py 2019-01-11 15:01:17.000000000 +0100
+++ new/w3lib-1.21.0/tests/test_html.py 2019-08-09 13:00:00.000000000 +0200
@@ -106,6 +106,8 @@
         self.assertEqual(remove_comments(b"test <!--textcoment--> whatever"), u'test whatever')
         self.assertEqual(remove_comments(b"test <!--\ntextcoment\n--> whatever"), u'test whatever')
 
+        self.assertEqual(remove_comments(b"test <!--"), u'test ')
+
 
 class RemoveTagsTest(unittest.TestCase):
     def test_returns_unicode(self):
@@ -184,6 +186,10 @@
         # text with empty tags
         self.assertEqual(remove_tags_with_content(u'<br/>a<br />', which_ones=('br',)), u'a')
 
+    def test_tags_with_shared_prefix(self):
+        # https://github.com/scrapy/w3lib/issues/114
+        self.assertEqual(remove_tags_with_content(u'<span></span><s></s>', which_ones=('s',)), u'<span></span>')
+
 
 class ReplaceEscapeCharsTest(unittest.TestCase):
     def test_returns_unicode(self):
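The two new test cases above translate directly into the following usage
sketch for the HTML-related fixes (outputs are copied from the assertions):

    from w3lib.html import remove_comments, remove_tags_with_content

    # issue #129: a truncated (unclosed) comment is now removed as well
    print(remove_comments(b"test <!--"))
    # -> 'test '

    # issue #114: requesting removal of 's' no longer swallows <span>,
    # which merely starts with the same text as the requested tag
    print(remove_tags_with_content(u'<span></span><s></s>', which_ones=('s',)))
    # -> '<span></span>'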
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/w3lib-1.20.0/tests/test_url.py new/w3lib-1.21.0/tests/test_url.py
--- old/w3lib-1.20.0/tests/test_url.py  2019-01-11 15:01:17.000000000 +0100
+++ new/w3lib-1.21.0/tests/test_url.py  2019-08-09 13:00:00.000000000 +0200
@@ -59,6 +59,20 @@
 
         self.assertTrue(isinstance(safe_url_string(b'http://example.com/'), str))
 
+    def test_safe_url_string_remove_ascii_tab_and_newlines(self):
+        self.assertEqual(safe_url_string("http://example.com/test\n.html"),
+                         "http://example.com/test.html")
+        self.assertEqual(safe_url_string("http://example.com/test\t.html"),
+                         "http://example.com/test.html")
+        self.assertEqual(safe_url_string("http://example.com/test\r.html"),
+                         "http://example.com/test.html")
+        self.assertEqual(safe_url_string("http://example.com/test\r.html\n"),
+                         "http://example.com/test.html")
+        self.assertEqual(safe_url_string("http://example.com/test\r\n.html\t"),
+                         "http://example.com/test.html")
+        self.assertEqual(safe_url_string("http://example.com/test\a\n.html"),
+                         "http://example.com/test%07.html")
+
     def test_safe_url_string_unsafe_chars(self):
         safeurl = safe_url_string(r"http://localhost:8001/unwise{,},|,\,^,[,],`?|=[]&[]=|")
         self.assertEqual(safeurl, r"http://localhost:8001/unwise%7B,%7D,|,%5C,%5E,[,],%60?|=[]&[]=|")
@@ -203,6 +217,19 @@
                          'http://www.example.org/image')
         self.assertEqual(safe_download_url('http://www.example.org/dir/'),
                          'http://www.example.org/dir/')
+        self.assertEqual(safe_download_url(b'http://www.example.org/dir/'),
+                         'http://www.example.org/dir/')
+
+        # Encoding related tests
+        self.assertEqual(safe_download_url(b'http://www.example.org?\xa3',
+                                           encoding='latin-1', path_encoding='latin-1'),
+                         'http://www.example.org/?%A3')
+        self.assertEqual(safe_download_url(b'http://www.example.org?\xc2\xa3',
+                                           encoding='utf-8', path_encoding='utf-8'),
+                         'http://www.example.org/?%C2%A3')
+        self.assertEqual(safe_download_url(b'http://www.example.org/\xc2\xa3?\xc2\xa3',
+                                           encoding='utf-8', path_encoding='latin-1'),
+                         'http://www.example.org/%A3?%C2%A3')
 
     def test_is_url(self):
         self.assertTrue(is_url('http://www.example.org'))
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/w3lib-1.20.0/w3lib/__init__.py new/w3lib-1.21.0/w3lib/__init__.py
--- old/w3lib-1.20.0/w3lib/__init__.py  2019-01-11 15:01:17.000000000 +0100
+++ new/w3lib-1.21.0/w3lib/__init__.py  2019-08-09 13:00:00.000000000 +0200
@@ -1,3 +1,3 @@
-__version__ = "1.20.0"
+__version__ = "1.21.0"
 version_info = tuple(int(v) if v.isdigit() else v
                      for v in __version__.split('.'))
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/w3lib-1.20.0/w3lib/html.py new/w3lib-1.21.0/w3lib/html.py
--- old/w3lib-1.20.0/w3lib/html.py      2019-01-11 15:01:17.000000000 +0100
+++ new/w3lib-1.21.0/w3lib/html.py      2019-08-09 13:00:00.000000000 +0200
@@ -122,7 +122,7 @@
     return _tag_re.sub(token, to_unicode(text, encoding))
 
 
-_REMOVECOMMENTS_RE = re.compile(u'<!--.*?-->', re.DOTALL)
+_REMOVECOMMENTS_RE = re.compile(u'<!--.*?(?:-->|$)', re.DOTALL)
 def remove_comments(text, encoding=None):
     """ Remove HTML Comments.
 
@@ -220,7 +220,7 @@
 
     text = to_unicode(text, encoding)
     if which_ones:
-        tags = '|'.join([r'<%s.*?</%s>|<%s\s*/>' % (tag, tag, tag) for tag in which_ones])
+        tags = '|'.join([r'<%s\b.*?</%s>|<%s\s*/>' % (tag, tag, tag) for tag in which_ones])
         retags = re.compile(tags, re.DOTALL | re.IGNORECASE)
         text = retags.sub(u'', text)
     return text
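To see why the two small regex changes in w3lib/html.py above fix issues #129
and #114, here is a standalone demonstration with plain re, using the old and
new patterns from the hunks (the variable names are illustrative only):

    import re

    # remove_comments: the old pattern required a closing '-->'; the new
    # alternation '(?:-->|$)' also consumes a comment truncated at the
    # end of the input.
    old_c = re.compile(u'<!--.*?-->', re.DOTALL)
    new_c = re.compile(u'<!--.*?(?:-->|$)', re.DOTALL)
    print(old_c.sub(u'', u'test <!--'))  # -> 'test <!--' (comment kept)
    print(new_c.sub(u'', u'test <!--'))  # -> 'test '

    # remove_tags_with_content: without \b, the pattern built for tag 's'
    # matched from '<span' all the way to the closing '</s>', removing
    # everything in between; \b anchors the match to the complete tag name.
    old_t = re.compile(r'<s.*?</s>|<s\s*/>', re.DOTALL | re.IGNORECASE)
    new_t = re.compile(r'<s\b.*?</s>|<s\s*/>', re.DOTALL | re.IGNORECASE)
    print(old_t.sub(u'', u'<span></span><s></s>'))  # -> '' (everything removed)
    print(new_t.sub(u'', u'<span></span><s></s>'))  # -> '<span></span>'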
""" - safe_url = safe_url_string(url) + safe_url = safe_url_string(url, encoding, path_encoding) scheme, netloc, path, query, _ = urlsplit(safe_url) if path: path = _parent_dirs.sub('', posixpath.normpath(path)) - if url.endswith('/') and not path.endswith('/'): + if safe_url.endswith('/') and not path.endswith('/'): path += '/' else: path = '/' diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/w3lib-1.20.0/w3lib.egg-info/PKG-INFO new/w3lib-1.21.0/w3lib.egg-info/PKG-INFO --- old/w3lib-1.20.0/w3lib.egg-info/PKG-INFO 2019-01-11 15:01:52.000000000 +0100 +++ new/w3lib-1.21.0/w3lib.egg-info/PKG-INFO 2019-08-09 13:00:36.000000000 +0200 @@ -1,6 +1,6 @@ Metadata-Version: 1.1 Name: w3lib -Version: 1.20.0 +Version: 1.21.0 Summary: Library of web-related functions Home-page: https://github.com/scrapy/w3lib Author: Scrapy project
