Script 'mail_helper' called by obssrc
Hello community,
here is the log from the commit of package python-urlextract for
openSUSE:Factory checked in at 2022-10-30 18:29:09
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/python-urlextract (Old)
and /work/SRC/openSUSE:Factory/.python-urlextract.new.2275 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-urlextract"
Sun Oct 30 18:29:09 2022 rev:4 rq:1032275 version:1.7.0
Changes:
--------
--- /work/SRC/openSUSE:Factory/python-urlextract/python-urlextract.changes 2022-08-17 18:26:24.095620815 +0200
+++ /work/SRC/openSUSE:Factory/.python-urlextract.new.2275/python-urlextract.changes 2022-10-30 18:29:35.274628167 +0100
@@ -1,0 +2,9 @@
+Sat Oct 29 16:25:27 UTC 2022 - Yogalakshmi Arunachalam <[email protected]>
+
+- Update to v1.7.0
+ * correct handling when authority starts with @ symbol
+ * remove unreserved characters from the beginning of found URL
+ * added typing and mypy checks - by mimi89999
+ * updated list of TLDs
+
+-------------------------------------------------------------------
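The two URL-handling fixes listed above are easiest to see from the new test cases further down in this diff. A minimal sketch of the expected 1.7.0 behaviour (API names as published on PyPI; the expected outputs are copied from the updated tests in test_find_urls.py below):

  from urlextract import URLExtract

  extractor = URLExtract()

  # Leading unreserved characters ("-", ".", "~", "_") are now stripped
  # from the beginning of a found URL (see the _complete_url hunk below).
  print(extractor.find_urls("* test link -https://www.example.com"))
  # expected: ['https://www.example.com']

  # A candidate whose authority starts with "@" is rejected, so only the
  # hostname-like token before the space survives (new test case below).
  print(extractor.find_urls("bad.email @address.net>"))
  # expected: ['bad.email']
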
Old:
----
urlextract-1.6.0.tar.gz
New:
----
urlextract-1.7.0.tar.gz
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Other differences:
------------------
++++++ python-urlextract.spec ++++++
--- /var/tmp/diff_new_pack.p5I030/_old 2022-10-30 18:29:35.698630458 +0100
+++ /var/tmp/diff_new_pack.p5I030/_new 2022-10-30 18:29:35.706630500 +0100
@@ -19,7 +19,7 @@
%{?!python_module:%define python_module() python-%{**} python3-%{**}}
%define skip_python2 1
Name: python-urlextract
-Version: 1.6.0
+Version: 1.7.0
Release: 0
Summary: Collects and extracts URLs from given text
License: MIT
++++++ urlextract-1.6.0.tar.gz -> urlextract-1.7.0.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/.bumpversion.cfg new/URLExtract-1.7.0/.bumpversion.cfg
--- old/URLExtract-1.6.0/.bumpversion.cfg 2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/.bumpversion.cfg 2022-10-22 19:41:56.000000000 +0200
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 1.6.0
+current_version = 1.7.0
commit = True
tag = True
message = Version {new_version}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/CHANGELOG.rst new/URLExtract-1.7.0/CHANGELOG.rst
--- old/URLExtract-1.6.0/CHANGELOG.rst 2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/CHANGELOG.rst 2022-10-22 19:41:56.000000000 +0200
@@ -2,6 +2,12 @@
~~~~~~~~~
- N/A
+- 1.7.0 (2022-10-22)
+ - correct handling when authority starts with @ symbol
+ - remove unreserved characters from the beginning of found URL
+ - added typing and mypy checks - by mimi89999
+ - updated list of TLDs
+
- 1.6.0 (2022-05-17)
- Add a list of URLs allowed to extract (issue #125) - by khoben
- correct order of actual and expected in tests
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/README.rst new/URLExtract-1.7.0/README.rst
--- old/URLExtract-1.6.0/README.rst 2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/README.rst 2022-10-22 19:41:56.000000000 +0200
@@ -4,8 +4,8 @@
URLExtract is python class for collecting (extracting) URLs from given
text based on locating TLD.
-.. image:: https://img.shields.io/travis/lipoja/URLExtract/master.svg
- :target: https://travis-ci.org/lipoja/URLExtract
+.. image:: https://img.shields.io/github/workflow/status/lipoja/URLExtract/Upload%20Python%20Package
+ :target: https://github.com/lipoja/URLExtract/actions/workflows/python-publish.yml
:alt: Build Status
.. image:: https://img.shields.io/github/tag/lipoja/URLExtract.svg
:target: https://github.com/lipoja/URLExtract/tags
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/setup.py new/URLExtract-1.7.0/setup.py
--- old/URLExtract-1.6.0/setup.py 2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/setup.py 2022-10-22 19:41:56.000000000 +0200
@@ -16,7 +16,7 @@
# version of URLExtract
# (do not forget to change it in urlextract_core.py as well)
-__version__ = "1.6.0"
+__version__ = "1.7.0"
def read(readme):
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/tests/unit/test_extract_email.py new/URLExtract-1.7.0/tests/unit/test_extract_email.py
--- old/URLExtract-1.6.0/tests/unit/test_extract_email.py 2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/tests/unit/test_extract_email.py 2022-10-22 19:41:56.000000000 +0200
@@ -34,6 +34,7 @@
[
("Do not extract emails by default [email protected]",
["[email protected]"]),
("<[email protected]>", ["[email protected]"]),
+ ("whitespace @address.net>", []),
("Given URIs are not mail [email protected]/asdasd
[email protected]:1234", []),
("Given URIs are not mail [email protected]?not [email protected]#not",
[]),
],
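The new parametrized case above ("whitespace @address.net>" -> []) covers the @-authority fix for e-mail extraction. A small sketch of what that test exercises, assuming the public extract_email property shown later in this diff; the empty result is taken straight from the added test row:

  from urlextract import URLExtract

  extractor = URLExtract()
  extractor.extract_email = True  # e-mails are not extracted by default

  # "@address.net" has an authority that starts with "@", which 1.7.0
  # rejects, so nothing is returned for this input.
  print(extractor.find_urls("whitespace @address.net>"))
  # expected: []
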
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/tests/unit/test_find_urls.py new/URLExtract-1.7.0/tests/unit/test_find_urls.py
--- old/URLExtract-1.6.0/tests/unit/test_find_urls.py 2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/tests/unit/test_find_urls.py 2022-10-22 19:41:56.000000000 +0200
@@ -37,8 +37,8 @@
["https://example.com/what.com"],
),
(
- "https://i2.wp.com/siliconfilter.com/2011/06/example.jpg",
- ["https://i2.wp.com/siliconfilter.com/2011/06/example.jpg"],
+ "* test link -https://www.example.com",
+ ["https://www.example.com"],
),
(
"https://www.test.org/paper/apostrophe'in-url",
@@ -57,6 +57,7 @@
"<script src='//www.example.com/somejsfile.js'>",
["www.example.com/somejsfile.js"],
),
+ ("bad.email @address.net>", ['bad.email']),
],
)
def test_find_urls(urlextract, text, expected):
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/tox.ini new/URLExtract-1.7.0/tox.ini
--- old/URLExtract-1.6.0/tox.ini 2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/tox.ini 2022-10-22 19:41:56.000000000 +0200
@@ -3,6 +3,7 @@
py-{nocache,cache}
black
flake8
+ mypy
skip_missing_interpreters = true
[testenv]
@@ -30,3 +31,9 @@
black urlextract --check --skip-string-normalization
black tests --check --skip-string-normalization
black setup.py --check --skip-string-normalization
+
+[testenv:mypy]
+deps =
+ mypy
+commands =
+ mypy --install-types --non-interactive --namespace-packages urlextract
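The new tox environment above just wraps the single mypy invocation shown. For anyone reproducing the check without tox, mypy can also be driven from Python; a rough sketch using mypy's documented api.run() entry point, with the same arguments as the tox command (assumes mypy is installed):

  from mypy import api

  # Equivalent of: mypy --install-types --non-interactive --namespace-packages urlextract
  stdout, stderr, exit_status = api.run(
      ["--install-types", "--non-interactive", "--namespace-packages", "urlextract"]
  )
  print(stdout or stderr)
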
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/urlextract/cachefile.py new/URLExtract-1.7.0/urlextract/cachefile.py
--- old/URLExtract-1.6.0/urlextract/cachefile.py 2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/urlextract/cachefile.py 2022-10-22 19:41:56.000000000 +0200
@@ -12,10 +12,12 @@
import os
import tempfile
import urllib.request
+from typing import Set, Iterable, Tuple, List, Union, NoReturn
+
from datetime import datetime
from urllib.error import URLError, HTTPError
-import idna
+import idna # type: ignore
import filelock
from platformdirs import user_cache_dir
@@ -61,7 +63,7 @@
self._tld_list_path = self._get_default_cache_file_path()
self._default_cache_file = True
- def _get_default_cache_dir(self):
+ def _get_default_cache_dir(self) -> str:
"""
Returns default cache directory (data directory)
@@ -72,7 +74,7 @@
return os.path.join(os.path.dirname(__file__), self._DATA_DIR)
- def _get_default_cache_file_path(self):
+ def _get_default_cache_file_path(self) -> str:
"""
Returns default cache file path
@@ -91,7 +93,7 @@
return default_list_path
- def _get_writable_cache_dir(self):
+ def _get_writable_cache_dir(self) -> str:
"""
Get writable cache directory with fallback to user's cache directory
and global temp directory
@@ -124,7 +126,7 @@
raise CacheFileError("Cache directories are not writable.")
- def _get_cache_file_path(self):
+ def _get_cache_file_path(self) -> str:
"""
Get path for cache file
@@ -148,7 +150,7 @@
# get path for cached file
return os.path.join(cache_dir, self._CACHE_FILE_NAME)
- def _get_cache_lock_file_path(self):
+ def _get_cache_lock_file_path(self) -> str:
"""
Get path for cache file lock
@@ -158,7 +160,7 @@
"""
return self._get_cache_file_path() + ".lock"
- def _download_tlds_list(self):
+ def _download_tlds_list(self) -> bool:
"""
Function downloads list of TLDs from IANA.
LINK: https://data.iana.org/TLD/tlds-alpha-by-domain.txt
@@ -215,7 +217,7 @@
return True
- def _load_cached_tlds(self):
+ def _load_cached_tlds(self) -> Set[str]:
"""
Loads TLDs from cached file to set.
@@ -231,7 +233,7 @@
)
raise CacheFileError("Cached file is not readable for current user.")
- set_of_tlds = set()
+ set_of_tlds: Set[str] = set()
with filelock.FileLock(self._get_cache_lock_file_path()):
with open(self._tld_list_path, "r") as f_cache_tld:
@@ -249,7 +251,7 @@
return set_of_tlds
- def _get_last_cachefile_modification(self):
+ def _get_last_cachefile_modification(self) -> Union[datetime, None]:
"""
Get last modification of cache file with TLDs.
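The annotations added in cachefile.py (and in urlextract_core.py further below) mainly pay off for users who run mypy over code that calls URLExtract. A brief sketch of a caller annotated against the new find_urls() return type; the annotation itself is the one added later in this diff, everything else is illustrative:

  from typing import List, Tuple, Union

  from urlextract import URLExtract

  extractor = URLExtract()

  # find_urls() is now annotated to return
  # List[Union[str, Tuple[str, Tuple[int, int]]]]: plain URLs, or
  # (url, (start, end)) pairs when get_indices=True is passed.
  urls: List[Union[str, Tuple[str, Tuple[int, int]]]] = extractor.find_urls(
      "Read https://example.com and visit http://test.org"
  )
  print(urls)
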
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/urlextract/data/tlds-alpha-by-domain.txt new/URLExtract-1.7.0/urlextract/data/tlds-alpha-by-domain.txt
--- old/URLExtract-1.6.0/urlextract/data/tlds-alpha-by-domain.txt 2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/urlextract/data/tlds-alpha-by-domain.txt 2022-10-22 19:41:56.000000000 +0200
@@ -1,4 +1,4 @@
-# Version 2022051700, Last Updated Tue May 17 07:07:01 2022 UTC
+# Version 2022102200, Last Updated Sat Oct 22 07:07:01 2022 UTC
AAA
AARP
ABARTH
@@ -176,7 +176,6 @@
BRUSSELS
BS
BT
-BUGATTI
BUILD
BUILDERS
BUSINESS
@@ -196,7 +195,6 @@
CAM
CAMERA
CAMP
-CANCERRESEARCH
CANON
CAPETOWN
CAPITAL
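The bundled tlds-alpha-by-domain.txt is only a cached snapshot of the IANA list, so the refresh above will age again. It can be renewed at runtime with the existing update helper whose new annotation appears in the urlextract_core.py hunks below; a short sketch:

  from urlextract import URLExtract

  extractor = URLExtract()

  # Re-download the IANA TLD list if the cached copy is older than 7 days.
  # Returns True on success (annotated as -> bool in 1.7.0).
  if extractor.update_when_older(days=7):
      print("TLD cache refreshed or already fresh")
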
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/URLExtract-1.6.0/urlextract/urlextract_core.py new/URLExtract-1.7.0/urlextract/urlextract_core.py
--- old/URLExtract-1.6.0/urlextract/urlextract_core.py 2022-05-17 21:56:52.000000000 +0200
+++ new/URLExtract-1.7.0/urlextract/urlextract_core.py 2022-10-22 19:41:56.000000000 +0200
@@ -8,22 +8,24 @@
.. codeauthor:: Jan Lipovský <[email protected]>, janlipovsky.cz
.. contributors: https://github.com/lipoja/URLExtract/graphs/contributors
"""
+from argparse import Namespace
import functools
import ipaddress
import logging
import re
import socket
+from typing import Set, Iterable, Tuple, List, Union, NoReturn, Generator
import string
import sys
from collections import OrderedDict
from datetime import datetime, timedelta
-import uritools
+import uritools # type: ignore
from urlextract.cachefile import CacheFile, CacheFileError
# version of URLExtract (do not forget to change it in setup.py as well)
-__version__ = "1.6.0"
+__version__ = "1.7.0"
# default value for maximum count of processed URLs by find_url
DEFAULT_LIMIT = 10000
@@ -67,8 +69,8 @@
}
_ipv4_tld = [".{}".format(ip) for ip in reversed(range(256))]
- _ignore_list = set()
- _permit_list = set()
+ _ignore_list: Set[str] = set()
+ _permit_list: Set[str] = set()
_limit = DEFAULT_LIMIT
@@ -116,7 +118,7 @@
# characters that are allowed to be right after TLD
self._after_tld_chars = self._get_after_tld_chars()
- def _get_after_tld_chars(self):
+ def _get_after_tld_chars(self) -> Set[str]:
"""Initialize after tld characters"""
after_tld_chars = set(string.whitespace)
after_tld_chars |= {"/", '"', "'", "<", ">", "?", ":", ".", ","}
@@ -142,7 +144,7 @@
self._tlds_re = re.compile("|".join(re_escaped), flags=re.IGNORECASE)
@property
- def extract_email(self):
+ def extract_email(self) -> bool:
"""
If set to True email will be extracted from text
@@ -151,7 +153,7 @@
return self._extract_email
@extract_email.setter
- def extract_email(self, extract):
+ def extract_email(self, extract: bool):
"""
Set if emails will be extracted from text
@@ -160,7 +162,7 @@
self._extract_email = extract
@property
- def extract_localhost(self):
+ def extract_localhost(self) -> bool:
"""
If set to True 'localhost' will be extracted as URL from text
@@ -169,7 +171,7 @@
return self._extract_localhost
@extract_localhost.setter
- def extract_localhost(self, enable):
+ def extract_localhost(self, enable: bool):
"""
Set if 'localhost' will be extracted as URL from text
@@ -179,7 +181,7 @@
self._extract_localhost = enable
@property
- def ignore_list(self):
+ def ignore_list(self) -> Set[str]:
"""
Returns set of URLs on ignore list
@@ -189,7 +191,7 @@
return self._ignore_list
@ignore_list.setter
- def ignore_list(self, ignore_list):
+ def ignore_list(self, ignore_list: Set[str]):
"""
Set of URLs to be ignored (not returned) while extracting from text
@@ -256,7 +258,7 @@
return True
- def update_when_older(self, days):
+ def update_when_older(self, days: int) -> bool:
"""
Update TLD list cache file if the list is older than
number of days given in parameter `days` or if does not exist.
@@ -278,7 +280,7 @@
return True
@staticmethod
- def get_version():
+ def get_version() -> str:
"""
Returns version number.
@@ -288,7 +290,7 @@
return __version__
- def get_after_tld_chars(self):
+ def get_after_tld_chars(self) -> List[str]:
"""
Returns list of chars that are allowed after TLD
@@ -298,7 +300,7 @@
return list(self._after_tld_chars)
- def set_after_tld_chars(self, after_tld_chars):
+ def set_after_tld_chars(self, after_tld_chars: Iterable[str]):
"""
Set chars that are allowed after TLD.
@@ -307,7 +309,7 @@
self._after_tld_chars = set(after_tld_chars)
- def get_stop_chars_left(self):
+ def get_stop_chars_left(self) -> Set[str]:
"""
Returns set of stop chars for text on left from TLD.
@@ -316,7 +318,7 @@
"""
return self._stop_chars_left
- def set_stop_chars_left(self, stop_chars):
+ def set_stop_chars_left(self, stop_chars: Set[str]):
"""
Set stop characters for text on left from TLD.
Stop characters are used when determining end of URL.
@@ -332,7 +334,7 @@
self._stop_chars_left = stop_chars
- def get_stop_chars_right(self):
+ def get_stop_chars_right(self) -> Set[str]:
"""
Returns set of stop chars for text on right from TLD.
@@ -341,7 +343,7 @@
"""
return self._stop_chars_right
- def set_stop_chars_right(self, stop_chars):
+ def set_stop_chars_right(self, stop_chars: Set[str]):
"""
Set stop characters for text on right from TLD.
Stop characters are used when determining end of URL.
@@ -357,7 +359,7 @@
self._stop_chars_right = stop_chars
- def get_enclosures(self):
+ def get_enclosures(self) -> Set[Tuple[str, str]]:
"""
Returns set of enclosure pairs that might be used to enclosure URL.
For example brackets (example.com), [example.com], {example.com}
@@ -367,7 +369,7 @@
"""
return self._enclosure
- def add_enclosure(self, left_char, right_char):
+ def add_enclosure(self, left_char: str, right_char: str):
"""
Add new enclosure pair of characters. That and should be removed
when their presence is detected at beginning and end of found URL
@@ -381,7 +383,7 @@
self._after_tld_chars = self._get_after_tld_chars()
- def remove_enclosure(self, left_char, right_char):
+ def remove_enclosure(self, left_char: str, right_char: str):
"""
Remove enclosure pair from set of enclosures.
@@ -397,8 +399,8 @@
self._after_tld_chars = self._get_after_tld_chars()
def _complete_url(
- self, text, tld_pos, tld, check_dns=False, with_schema_only=False
- ):
+ self, text: str, tld_pos: int, tld: str, check_dns=False, with_schema_only=False
+ ) -> str:
"""
Expand string in both sides to match whole URL.
@@ -486,6 +488,9 @@
# URL should not start with two backslashes
if complete_url.startswith("//"):
complete_url = complete_url[2:]
+ # URL should not start with unreserved characters
+ if complete_url.startswith(("-", ".", "~", "_")):
+ complete_url = complete_url[1:]
if not self._is_domain_valid(
complete_url, tld, check_dns=check_dns,
with_schema_only=with_schema_only
):
@@ -493,7 +498,7 @@
return complete_url
- def _validate_tld_match(self, text, matched_tld, tld_pos):
+ def _validate_tld_match(self, text: str, matched_tld: str, tld_pos: int) -> bool:
"""
Validate TLD match - tells if at found position is really TLD.
@@ -517,7 +522,9 @@
return False
- def _is_domain_valid(self, url, tld, check_dns=False, with_schema_only=False):
+ def _is_domain_valid(
+ self, url: str, tld: str, check_dns=False, with_schema_only=False
+ ):
"""
Checks if given URL has valid domain name (ignores subdomains)
@@ -570,6 +577,10 @@
url_parts = uritools.urisplit(url)
# <scheme>://<authority>/<path>?<query>#<fragment>
+ # authority can't start with @
+ if url_parts.authority.startswith('@'):
+ return False
+
# if URI contains user info and schema was automatically added
# the url is probably an email
if url_parts.getuserinfo() and added_schema:
@@ -653,7 +664,7 @@
return True
- def _remove_enclosure_from_url(self, text_url, tld_pos, tld):
+ def _remove_enclosure_from_url(self, text_url: str, tld_pos: int, tld: str) -> str:
"""
Removes enclosure characters from URL given in text_url.
For example: (example.com) -> example.com
@@ -707,7 +718,7 @@
return new_url
@staticmethod
- def _split_markdown(text_url, tld_pos):
+ def _split_markdown(text_url: str, tld_pos: int) -> str:
"""
Split markdown URL. There is an issue wen Markdown URL is found.
Parsing of the URL does not stop on right place so wrongly found URL
@@ -736,7 +747,8 @@
return text_url
@staticmethod
- def _get_tld_pos(url, tld):
+ # TODO: fix DOC to accomodate to return value
+ def _get_tld_pos(url: str, tld: str) -> int:
"""
Return position of TLD in hostname.
@@ -751,9 +763,11 @@
offset = url.find(host)
return host.rfind(tld) + offset
+ # TODO: move type assertion to be Generator based
+ # found https://stackoverflow.com/a/38423388/14669675
def gen_urls(
- self, text, check_dns=False, get_indices=False, with_schema_only=False
- ):
+ self, text: str, check_dns=False, get_indices=False, with_schema_only=False
+ ) -> Generator[Union[str, Tuple[str, Tuple[int, int]]], None, None]:
"""
Creates generator over found URLs in given text.
@@ -814,12 +828,12 @@
def find_urls(
self,
- text,
+ text: str,
only_unique=False,
check_dns=False,
get_indices=False,
with_schema_only=False,
- ):
+ ) -> List[Union[str, Tuple[str, Tuple[int, int]]]]:
"""
Find all URLs in given text.
@@ -847,7 +861,7 @@
return list(OrderedDict.fromkeys(urls))
return list(urls)
- result_urls = []
+ result_urls: List[Union[str, Tuple[str, Tuple[int, int]]]] = []
url = next(urls, "")
url_count = 1
while url:
@@ -867,7 +881,7 @@
return list(OrderedDict.fromkeys(result_urls))
return result_urls
- def has_urls(self, text, check_dns=False, with_schema_only=False):
+ def has_urls(self, text: str, check_dns=False, with_schema_only=False) -> bool:
"""
Checks if text contains any valid URL.
Returns True if text contains at least one URL.
@@ -928,7 +942,8 @@
"""
import argparse
- def get_args():
+ # TODO: add type checking here
+ def get_args() -> Namespace:
"""Parse programs arguments"""
parser = argparse.ArgumentParser(
description="urlextract - prints out all URLs that were "
@@ -1046,10 +1061,10 @@
args.input_file.close()
-def dns_cache_install():
+def dns_cache_install() -> None:
try:
- from dns import resolver as dnspython_resolver_module
- from dns_cache.resolver import ExceptionCachingResolver
+ from dns import resolver as dnspython_resolver_module # type: ignore
+ from dns_cache.resolver import ExceptionCachingResolver # type: ignore
if not dnspython_resolver_module.default_resolver:
dnspython_resolver_module.default_resolver = ExceptionCachingResolver()
@@ -1058,7 +1073,7 @@
pass
try:
- from dns.resolver import (
+ from dns.resolver import ( # type: ignore
LRUCache,
Resolver,
_resolver,