Script 'mail_helper' called by obssrc
Hello community,
here is the log from the commit of package python-urlextract for
openSUSE:Factory checked in at 2022-10-30 18:29:09
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/python-urlextract (Old)
and /work/SRC/openSUSE:Factory/.python-urlextract.new.2275 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-urlextract"
Sun Oct 30 18:29:09 2022 rev:4 rq:1032275 version:1.7.0
Changes:
--------
--- /work/SRC/openSUSE:Factory/python-urlextract/python-urlextract.changes 2022-08-17 18:26:24.095620815 +0200
+++ /work/SRC/openSUSE:Factory/.python-urlextract.new.2275/python-urlextract.changes 2022-10-30 18:29:35.274628167 +0100
@@ -1,0 +2,9 @@
+Sat Oct 29 16:25:27 UTC 2022 - Yogalakshmi Arunachalam <[email protected]>
+
+- Update to v1.7.0
+ * correct handling when authority starts with @ symbol
+ * remove unreserved characters from the beginning of found URL
+ * added typing and mypy checks - by mimi89999
+ * updated list of TLDs
+
+-------------------------------------------------------------------
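The two URL-handling fixes listed above are easiest to see from the new test cases further down in this diff. A minimal sketch of the expected 1.7.0 behaviour (API names as published on PyPI; the expected outputs are copied from the updated tests in test_find_urls.py below):

  from urlextract import URLExtract

  extractor = URLExtract()

  # Leading unreserved characters ("-", ".", "~", "_") are now stripped
  # from the beginning of a found URL (see the _complete_url hunk below).
  print(extractor.find_urls("* test link -https://www.example.com"))
  # expected: ['https://www.example.com']

  # A candidate whose authority starts with "@" is rejected, so only the
  # hostname-like token before the space survives (new test case below).
  print(extractor.find_urls("bad.email @address.net>"))
  # expected: ['bad.email']
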
Old:
----
urlextract-1.6.0.tar.gz
New:
----
urlextract-1.7.0.tar.gz
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Other differences:
------------------
++++++ python-urlextract.spec ++++++
--- /var/tmp/diff_new_pack.p5I030/_old 2022-10-30 18:29:35.698630458 +0100
+++ /var/tmp/diff_new_pack.p5I030/_new 2022-10-30 18:29:35.706630500 +0100
@@ -19,7 +19,7 @@
%{?!python_module:%define python_module() python-%{**} python3-%{**}}
%define skip_python2 1
Name: python-urlextract
-Version: 1.6.0
+Version: 1.7.0
Release: 0
Summary: Collects and extracts URLs from given text
License: MIT
++++++ urlextract-1.6.0.tar.gz -> urlextract-1.7.0.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/.bumpversion.cfg new/URLExtract-1.7.0/.bumpversion.cfg
--- old/URLExtract-1.6.0/.bumpversion.cfg 2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/.bumpversion.cfg 2022-10-22 19:41:56.000000000 +0200
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 1.6.0
+current_version = 1.7.0
commit = True
tag = True
message = Version {new_version}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/CHANGELOG.rst new/URLExtract-1.7.0/CHANGELOG.rst
--- old/URLExtract-1.6.0/CHANGELOG.rst 2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/CHANGELOG.rst 2022-10-22 19:41:56.000000000 +0200
@@ -2,6 +2,12 @@
~~~~~~~~~
- N/A
+- 1.7.0 (2022-10-22)
+ - correct handling when authority starts with @ symbol
+ - remove unreserved characters from the beginning of found URL
+ - added typing and mypy checks - by mimi89999
+ - updated list of TLDs
+
- 1.6.0 (2022-05-17)
- Add a list of URLs allowed to extract (issue #125) - by khoben
- correct order of actual and expected in tests
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/README.rst new/URLExtract-1.7.0/README.rst
--- old/URLExtract-1.6.0/README.rst 2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/README.rst 2022-10-22 19:41:56.000000000 +0200
@@ -4,8 +4,8 @@
URLExtract is python class for collecting (extracting) URLs from given
text based on locating TLD.
-.. image:: https://img.shields.io/travis/lipoja/URLExtract/master.svg
- :target: https://travis-ci.org/lipoja/URLExtract
+.. image:: https://img.shields.io/github/workflow/status/lipoja/URLExtract/Upload%20Python%20Package
+ :target: https://github.com/lipoja/URLExtract/actions/workflows/python-publish.yml
:alt: Build Status
.. image:: https://img.shields.io/github/tag/lipoja/URLExtract.svg
:target: https://github.com/lipoja/URLExtract/tags
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/setup.py new/URLExtract-1.7.0/setup.py
--- old/URLExtract-1.6.0/setup.py 2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/setup.py 2022-10-22 19:41:56.000000000 +0200
@@ -16,7 +16,7 @@
# version of URLExtract
# (do not forget to change it in urlextract_core.py as well)
-__version__ = "1.6.0"
+__version__ = "1.7.0"
def read(readme):
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/tests/unit/test_extract_email.py new/URLExtract-1.7.0/tests/unit/test_extract_email.py
--- old/URLExtract-1.6.0/tests/unit/test_extract_email.py 2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/tests/unit/test_extract_email.py 2022-10-22 19:41:56.000000000 +0200
@@ -34,6 +34,7 @@
[
("Do not extract emails by default [email protected]",
["[email protected]"]),
("<[email protected]>", ["[email protected]"]),
+ ("whitespace @address.net>", []),
("Given URIs are not mail [email protected]/asdasd
[email protected]:1234", []),
("Given URIs are not mail [email protected]?not [email protected]#not",
[]),
],
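The new parametrized case above ("whitespace @address.net>" -> []) covers the @-authority fix for e-mail extraction. A small sketch of what that test exercises, assuming the public extract_email property shown later in this diff; the empty result is taken straight from the added test row:

  from urlextract import URLExtract

  extractor = URLExtract()
  extractor.extract_email = True  # e-mails are not extracted by default

  # "@address.net" has an authority that starts with "@", which 1.7.0
  # rejects, so nothing is returned for this input.
  print(extractor.find_urls("whitespace @address.net>"))
  # expected: []
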
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/tests/unit/test_find_urls.py new/URLExtract-1.7.0/tests/unit/test_find_urls.py
--- old/URLExtract-1.6.0/tests/unit/test_find_urls.py 2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/tests/unit/test_find_urls.py 2022-10-22 19:41:56.000000000 +0200
@@ -37,8 +37,8 @@
["https://example.com/what.com"],
),
(
- "https://i2.wp.com/siliconfilter.com/2011/06/example.jpg",
- ["https://i2.wp.com/siliconfilter.com/2011/06/example.jpg"],
+ "* test link -https://www.example.com",
+ ["https://www.example.com"],
),
(
"https://www.test.org/paper/apostrophe'in-url",
@@ -57,6 +57,7 @@
"<script src='//www.example.com/somejsfile.js'>",
["www.example.com/somejsfile.js"],
),
+ ("bad.email @address.net>", ['bad.email']),
],
)
def test_find_urls(urlextract, text, expected):
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/tox.ini new/URLExtract-1.7.0/tox.ini
--- old/URLExtract-1.6.0/tox.ini 2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/tox.ini 2022-10-22 19:41:56.000000000 +0200
@@ -3,6 +3,7 @@
py-{nocache,cache}
black
flake8
+ mypy
skip_missing_interpreters = true
[testenv]
@@ -30,3 +31,9 @@
black urlextract --check --skip-string-normalization
black tests --check --skip-string-normalization
black setup.py --check --skip-string-normalization
+
+[testenv:mypy]
+deps =
+ mypy
+commands =
+ mypy --install-types --non-interactive --namespace-packages urlextract
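The new tox environment above just wraps the single mypy invocation shown. For anyone reproducing the check without tox, mypy can also be driven from Python; a rough sketch using mypy's documented api.run() entry point, with the same arguments as the tox command (assumes mypy is installed):

  from mypy import api

  # Equivalent of: mypy --install-types --non-interactive --namespace-packages urlextract
  stdout, stderr, exit_status = api.run(
      ["--install-types", "--non-interactive", "--namespace-packages", "urlextract"]
  )
  print(stdout or stderr)
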
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/urlextract/cachefile.py new/URLExtract-1.7.0/urlextract/cachefile.py
--- old/URLExtract-1.6.0/urlextract/cachefile.py 2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/urlextract/cachefile.py 2022-10-22 19:41:56.000000000 +0200
@@ -12,10 +12,12 @@
import os
import tempfile
import urllib.request
+from typing import Set, Iterable, Tuple, List, Union, NoReturn
+
from datetime import datetime
from urllib.error import URLError, HTTPError
-import idna
+import idna # type: ignore
import filelock
from platformdirs import user_cache_dir
@@ -61,7 +63,7 @@
self._tld_list_path = self._get_default_cache_file_path()
self._default_cache_file = True
- def _get_default_cache_dir(self):
+ def _get_default_cache_dir(self) -> str:
"""
Returns default cache directory (data directory)
@@ -72,7 +74,7 @@
return os.path.join(os.path.dirname(__file__), self._DATA_DIR)
- def _get_default_cache_file_path(self):
+ def _get_default_cache_file_path(self) -> str:
"""
Returns default cache file path
@@ -91,7 +93,7 @@
return default_list_path
- def _get_writable_cache_dir(self):
+ def _get_writable_cache_dir(self) -> str:
"""
Get writable cache directory with fallback to user's cache directory
and global temp directory
@@ -124,7 +126,7 @@
raise CacheFileError("Cache directories are not writable.")
- def _get_cache_file_path(self):
+ def _get_cache_file_path(self) -> str:
"""
Get path for cache file
@@ -148,7 +150,7 @@
# get path for cached file
return os.path.join(cache_dir, self._CACHE_FILE_NAME)
- def _get_cache_lock_file_path(self):
+ def _get_cache_lock_file_path(self) -> str:
"""
Get path for cache file lock
@@ -158,7 +160,7 @@
"""
return self._get_cache_file_path() + ".lock"
- def _download_tlds_list(self):
+ def _download_tlds_list(self) -> bool:
"""
Function downloads list of TLDs from IANA.
LINK: https://data.iana.org/TLD/tlds-alpha-by-domain.txt
@@ -215,7 +217,7 @@
return True
- def _load_cached_tlds(self):
+ def _load_cached_tlds(self) -> Set[str]:
"""
Loads TLDs from cached file to set.
@@ -231,7 +233,7 @@
)
raise CacheFileError("Cached file is not readable for current user.")
- set_of_tlds = set()
+ set_of_tlds: Set[str] = set()
with filelock.FileLock(self._get_cache_lock_file_path()):
with open(self._tld_list_path, "r") as f_cache_tld:
@@ -249,7 +251,7 @@
return set_of_tlds
- def _get_last_cachefile_modification(self):
+ def _get_last_cachefile_modification(self) -> Union[datetime, None]:
"""
Get last modification of cache file with TLDs.
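The annotations added in cachefile.py (and in urlextract_core.py further below) mainly pay off for users who run mypy over code that calls URLExtract. A brief sketch of a caller annotated against the new find_urls() return type; the annotation itself is the one added later in this diff, everything else is illustrative:

  from typing import List, Tuple, Union

  from urlextract import URLExtract

  extractor = URLExtract()

  # find_urls() is now annotated to return
  # List[Union[str, Tuple[str, Tuple[int, int]]]]: plain URLs, or
  # (url, (start, end)) pairs when get_indices=True is passed.
  urls: List[Union[str, Tuple[str, Tuple[int, int]]]] = extractor.find_urls(
      "Read https://example.com and visit http://test.org"
  )
  print(urls)
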
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/urlextract/data/tlds-alpha-by-domain.txt new/URLExtract-1.7.0/urlextract/data/tlds-alpha-by-domain.txt
--- old/URLExtract-1.6.0/urlextract/data/tlds-alpha-by-domain.txt 2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/urlextract/data/tlds-alpha-by-domain.txt 2022-10-22 19:41:56.000000000 +0200
@@ -1,4 +1,4 @@
-# Version 2022051700, Last Updated Tue May 17 07:07:01 2022 UTC
+# Version 2022102200, Last Updated Sat Oct 22 07:07:01 2022 UTC
AAA
AARP
ABARTH
@@ -176,7 +176,6 @@
BRUSSELS
BS
BT
-BUGATTI
BUILD
BUILDERS
BUSINESS
@@ -196,7 +195,6 @@
CAM
CAMERA
CAMP
-CANCERRESEARCH
CANON
CAPETOWN
CAPITAL
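The bundled tlds-alpha-by-domain.txt is only a cached snapshot of the IANA list, so the refresh above will age again. It can be renewed at runtime with the existing update helper whose new annotation appears in the urlextract_core.py hunks below; a short sketch:

  from urlextract import URLExtract

  extractor = URLExtract()

  # Re-download the IANA TLD list if the cached copy is older than 7 days.
  # Returns True on success (annotated as -> bool in 1.7.0).
  if extractor.update_when_older(days=7):
      print("TLD cache refreshed or already fresh")
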
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/urlextract/urlextract_core.py new/URLExtract-1.7.0/urlextract/urlextract_core.py
--- old/URLExtract-1.6.0/urlextract/urlextract_core.py 2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/urlextract/urlextract_core.py 2022-10-22 19:41:56.000000000 +0200
@@ -8,22 +8,24 @@
.. codeauthor:: Jan Lipovský <[email protected]>, janlipovsky.cz
.. contributors: https://github.com/lipoja/URLExtract/graphs/contributors
"""
+from argparse import Namespace
import functools
import ipaddress
import logging
import re
import socket
+from typing import Set, Iterable, Tuple, List, Union, NoReturn, Generator
import string
import sys
from collections import OrderedDict
from datetime import datetime, timedelta
-import uritools
+import uritools # type: ignore
from urlextract.cachefile import CacheFile, CacheFileError
# version of URLExtract (do not forget to change it in setup.py as well)
-__version__ = "1.6.0"
+__version__ = "1.7.0"
# default value for maximum count of processed URLs by find_url
DEFAULT_LIMIT = 10000
@@ -67,8 +69,8 @@
}
_ipv4_tld = [".{}".format(ip) for ip in reversed(range(256))]
- _ignore_list = set()
- _permit_list = set()
+ _ignore_list: Set[str] = set()
+ _permit_list: Set[str] = set()
_limit = DEFAULT_LIMIT
@@ -116,7 +118,7 @@
# characters that are allowed to be right after TLD
self._after_tld_chars = self._get_after_tld_chars()
- def _get_after_tld_chars(self):
+ def _get_after_tld_chars(self) -> Set[str]:
"""Initialize after tld characters"""
after_tld_chars = set(string.whitespace)
after_tld_chars |= {"/", '"', "'", "<", ">", "?", ":", ".", ","}
@@ -142,7 +144,7 @@
self._tlds_re = re.compile("|".join(re_escaped), flags=re.IGNORECASE)
@property
- def extract_email(self):
+ def extract_email(self) -> bool:
"""
If set to True email will be extracted from text
@@ -151,7 +153,7 @@
return self._extract_email
@extract_email.setter
- def extract_email(self, extract):
+ def extract_email(self, extract: bool):
"""
Set if emails will be extracted from text
@@ -160,7 +162,7 @@
self._extract_email = extract
@property
- def extract_localhost(self):
+ def extract_localhost(self) -> bool:
"""
If set to True 'localhost' will be extracted as URL from text
@@ -169,7 +171,7 @@
return self._extract_localhost
@extract_localhost.setter
- def extract_localhost(self, enable):
+ def extract_localhost(self, enable: bool):
"""
Set if 'localhost' will be extracted as URL from text
@@ -179,7 +181,7 @@
self._extract_localhost = enable
@property
- def ignore_list(self):
+ def ignore_list(self) -> Set[str]:
"""
Returns set of URLs on ignore list
@@ -189,7 +191,7 @@
return self._ignore_list
@ignore_list.setter
- def ignore_list(self, ignore_list):
+ def ignore_list(self, ignore_list: Set[str]):
"""
Set of URLs to be ignored (not returned) while extracting from text
@@ -256,7 +258,7 @@
return True
- def update_when_older(self, days):
+ def update_when_older(self, days: int) -> bool:
"""
Update TLD list cache file if the list is older than
number of days given in parameter `days` or if does not exist.
@@ -278,7 +280,7 @@
return True
@staticmethod
- def get_version():
+ def get_version() -> str:
"""
Returns version number.
@@ -288,7 +290,7 @@
return __version__
- def get_after_tld_chars(self):
+ def get_after_tld_chars(self) -> List[str]:
"""
Returns list of chars that are allowed after TLD
@@ -298,7 +300,7 @@
return list(self._after_tld_chars)
- def set_after_tld_chars(self, after_tld_chars):
+ def set_after_tld_chars(self, after_tld_chars: Iterable[str]):
"""
Set chars that are allowed after TLD.
@@ -307,7 +309,7 @@
self._after_tld_chars = set(after_tld_chars)
- def get_stop_chars_left(self):
+ def get_stop_chars_left(self) -> Set[str]:
"""
Returns set of stop chars for text on left from TLD.
@@ -316,7 +318,7 @@
"""
return self._stop_chars_left
- def set_stop_chars_left(self, stop_chars):
+ def set_stop_chars_left(self, stop_chars: Set[str]):
"""
Set stop characters for text on left from TLD.
Stop characters are used when determining end of URL.
@@ -332,7 +334,7 @@
self._stop_chars_left = stop_chars
- def get_stop_chars_right(self):
+ def get_stop_chars_right(self) -> Set[str]:
"""
Returns set of stop chars for text on right from TLD.
@@ -341,7 +343,7 @@
"""
return self._stop_chars_right
- def set_stop_chars_right(self, stop_chars):
+ def set_stop_chars_right(self, stop_chars: Set[str]):
"""
Set stop characters for text on right from TLD.
Stop characters are used when determining end of URL.
@@ -357,7 +359,7 @@
self._stop_chars_right = stop_chars
- def get_enclosures(self):
+ def get_enclosures(self) -> Set[Tuple[str, str]]:
"""
Returns set of enclosure pairs that might be used to enclosure URL.
For example brackets (example.com), [example.com], {example.com}
@@ -367,7 +369,7 @@
"""
return self._enclosure
- def add_enclosure(self, left_char, right_char):
+ def add_enclosure(self, left_char: str, right_char: str):
"""
Add new enclosure pair of characters. That and should be removed
when their presence is detected at beginning and end of found URL
@@ -381,7 +383,7 @@
self._after_tld_chars = self._get_after_tld_chars()
- def remove_enclosure(self, left_char, right_char):
+ def remove_enclosure(self, left_char: str, right_char: str):
"""
Remove enclosure pair from set of enclosures.
@@ -397,8 +399,8 @@
self._after_tld_chars = self._get_after_tld_chars()
def _complete_url(
- self, text, tld_pos, tld, check_dns=False, with_schema_only=False
- ):
+ self, text: str, tld_pos: int, tld: str, check_dns=False, with_schema_only=False
+ ) -> str:
"""
Expand string in both sides to match whole URL.
@@ -486,6 +488,9 @@
# URL should not start with two backslashes
if complete_url.startswith("//"):
complete_url = complete_url[2:]
+ # URL should not start with unreserved characters
+ if complete_url.startswith(("-", ".", "~", "_")):
+ complete_url = complete_url[1:]
if not self._is_domain_valid(
complete_url, tld, check_dns=check_dns,
with_schema_only=with_schema_only
):
@@ -493,7 +498,7 @@
return complete_url
- def _validate_tld_match(self, text, matched_tld, tld_pos):
+ def _validate_tld_match(self, text: str, matched_tld: str, tld_pos: int) -> bool:
"""
Validate TLD match - tells if at found position is really TLD.
@@ -517,7 +522,9 @@
return False
- def _is_domain_valid(self, url, tld, check_dns=False, with_schema_only=False):
+ def _is_domain_valid(
+ self, url: str, tld: str, check_dns=False, with_schema_only=False
+ ):
"""
Checks if given URL has valid domain name (ignores subdomains)
@@ -570,6 +577,10 @@
url_parts = uritools.urisplit(url)
# <scheme>://<authority>/<path>?<query>#<fragment>
+ # authority can't start with @
+ if url_parts.authority.startswith('@'):
+ return False
+
# if URI contains user info and schema was automatically added
# the url is probably an email
if url_parts.getuserinfo() and added_schema:
@@ -653,7 +664,7 @@
return True
- def _remove_enclosure_from_url(self, text_url, tld_pos, tld):
+ def _remove_enclosure_from_url(self, text_url: str, tld_pos: int, tld: str) -> str:
"""
Removes enclosure characters from URL given in text_url.
For example: (example.com) -> example.com
@@ -707,7 +718,7 @@
return new_url
@staticmethod
- def _split_markdown(text_url, tld_pos):
+ def _split_markdown(text_url: str, tld_pos: int) -> str:
"""
Split markdown URL. There is an issue wen Markdown URL is found.
Parsing of the URL does not stop on right place so wrongly found URL
@@ -736,7 +747,8 @@
return text_url
@staticmethod
- def _get_tld_pos(url, tld):
+ # TODO: fix DOC to accomodate to return value
+ def _get_tld_pos(url: str, tld: str) -> int:
"""
Return position of TLD in hostname.
@@ -751,9 +763,11 @@
offset = url.find(host)
return host.rfind(tld) + offset
+ # TODO: move type assertion to be Generator based
+ # found https://stackoverflow.com/a/38423388/14669675
def gen_urls(
- self, text, check_dns=False, get_indices=False, with_schema_only=False
- ):
+ self, text: str, check_dns=False, get_indices=False, with_schema_only=False
+ ) -> Generator[Union[str, Tuple[str, Tuple[int, int]]], None, None]:
"""
Creates generator over found URLs in given text.
@@ -814,12 +828,12 @@
def find_urls(
self,
- text,
+ text: str,
only_unique=False,
check_dns=False,
get_indices=False,
with_schema_only=False,
- ):
+ ) -> List[Union[str, Tuple[str, Tuple[int, int]]]]:
"""
Find all URLs in given text.
@@ -847,7 +861,7 @@
return list(OrderedDict.fromkeys(urls))
return list(urls)
- result_urls = []
+ result_urls: List[Union[str, Tuple[str, Tuple[int, int]]]] = []
url = next(urls, "")
url_count = 1
while url:
@@ -867,7 +881,7 @@
return list(OrderedDict.fromkeys(result_urls))
return result_urls
- def has_urls(self, text, check_dns=False, with_schema_only=False):
+ def has_urls(self, text: str, check_dns=False, with_schema_only=False) -> bool:
"""
Checks if text contains any valid URL.
Returns True if text contains at least one URL.
@@ -928,7 +942,8 @@
"""
import argparse
- def get_args():
+ # TODO: add type checking here
+ def get_args() -> Namespace:
"""Parse programs arguments"""
parser = argparse.ArgumentParser(
description="urlextract - prints out all URLs that were "
@@ -1046,10 +1061,10 @@
args.input_file.close()
-def dns_cache_install():
+def dns_cache_install() -> None:
try:
- from dns import resolver as dnspython_resolver_module
- from dns_cache.resolver import ExceptionCachingResolver
+ from dns import resolver as dnspython_resolver_module # type: ignore
+ from dns_cache.resolver import ExceptionCachingResolver # type: ignore
if not dnspython_resolver_module.default_resolver:
dnspython_resolver_module.default_resolver = ExceptionCachingResolver()
@@ -1058,7 +1073,7 @@
pass
try:
- from dns.resolver import (
+ from dns.resolver import ( # type: ignore
LRUCache,
Resolver,
_resolver,