Hello community,

here is the log from the commit of package urlwatch for openSUSE:Factory checked in at 2018-05-18 14:28:26
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/urlwatch (Old)
 and      /work/SRC/openSUSE:Factory/.urlwatch.new (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "urlwatch" Fri May 18 14:28:26 2018 rev:9 rq:610232 version:2.10 Changes: -------- --- /work/SRC/openSUSE:Factory/urlwatch/urlwatch.changes 2018-04-01 17:27:50.488663884 +0200 +++ /work/SRC/openSUSE:Factory/.urlwatch.new/urlwatch.changes 2018-05-18 14:28:32.252973402 +0200 @@ -1,0 +2,9 @@ +Fri May 18 07:33:54 UTC 2018 - [email protected] + +- Update to 2.10: + * File editing: Fix issue when $EDITOR contains spaces + * Browser: Add support for browser jobs using requests-html + * Retry: Add support for optional retry count in job list + * HTTP: Add support for specifying optional headers + +------------------------------------------------------------------- Old: ---- urlwatch-2.9.tar.gz New: ---- urlwatch-2.10.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ urlwatch.spec ++++++ --- /var/tmp/diff_new_pack.BsJ2X9/_old 2018-05-18 14:28:32.700956963 +0200 +++ /var/tmp/diff_new_pack.BsJ2X9/_new 2018-05-18 14:28:32.704956816 +0200 @@ -17,7 +17,7 @@ Name: urlwatch -Version: 2.9 +Version: 2.10 Release: 0 Summary: A tool for monitoring webpages for updates License: BSD-3-Clause ++++++ urlwatch-2.9.tar.gz -> urlwatch-2.10.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/urlwatch-2.9/ChangeLog new/urlwatch-2.10/ChangeLog --- old/urlwatch-2.9/ChangeLog 2018-03-24 13:18:25.000000000 +0100 +++ new/urlwatch-2.10/ChangeLog 2018-05-17 23:05:34.000000000 +0200 @@ -193,7 +193,7 @@ * Do not copy example files if they do not exist * Handle SIGPIPE (fixes #77) -2016-12-04 Thomas Perl <thp.io/about> +2016-12-04 Thomas Perl <thp.io/about> [2.6] * New filters: sha1sum, hexdump, element-by-class * New reporters: pushbullet (by R0nd); mailgun (by lechuckcaptain) * Improved filters: BeautifulSoup support for html2txt (by lechuckcaptain) @@ -204,7 +204,7 @@ * Issue #118: Fix match filters for missing keys * Small fixes by: Jakub Wilk, Marc Urben, Adam Dobrawy and Louis Sautier -2017-11-08 Thomas Perl <thp.io/about> +2017-11-08 Thomas Perl <thp.io/about> [2.7] * Issue #127: Fix error reporting * ElementsByAttribute: look for matching tag in handle_endtag (by Gaetan Leurent) * Paths: Add XDG_CONFIG_DIR support (by Jelle van der Waa) @@ -214,15 +214,22 @@ * Filtering: style (by gvandenbroucke), tag (by cmichi) * New reporter: Telegram support (by gvandenbroucke) -2018-01-28 Thomas Perl <[email protected]> +2018-01-28 Thomas Perl <[email protected]> [2.8] * Documentation: Mention appdirs (by e-dschungel) * SMTP: Fix handling of missing user field (by e-dschungel) * Manpage: Fix documentation of XDG environment variables (by Jelle van der Waa) * Unit tests: Fix imports for out-of-source-tree tests (by Maxime Werlen) -2018-03-24 Thomas Perl <[email protected]> +2018-03-24 Thomas Perl <[email protected]> [2.9] * Pushover: Device and sound attribute (by Tobias Haupenthal) * XDG: Move cache file to XDG_CACHE_DIR (by Maxime Werlen) * E-Mail: Add support for --smtp-login and document GMail SMTP usage * Cleanups: Fix out-of-date debug message, use https (by Jakub Wilk) * Migration: Unconditionally migrate urlwatch 1.x cache dirs (Fixes #206) + +2018-05-17 Thomas Perl <[email protected]> [2.10] + * File editing: Fix issue when $EDITOR contains spaces (Fixes #220) + * Browser: Add support for browser jobs using requests-html (Fixes #215) + * Retry: Add support for optional retry count in job list (by cmichi, fixes #235) + * HTTP: Add support for specifying optional headers (by Tero 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/urlwatch-2.9/README.md new/urlwatch-2.10/README.md
--- old/urlwatch-2.9/README.md	2018-03-24 13:18:25.000000000 +0100
+++ new/urlwatch-2.10/README.md	2018-05-17 23:05:34.000000000 +0200
@@ -41,6 +41,10 @@
 
 `python3 -m pip install pushbullet.py`
 
+For optional support for the "browser" job kind, Requests-HTML is needed:
+
+`python3 -m pip install requests-html`
+
 For unit tests, you also need to install pycodestyle:
 
 `python3 -m pip install pycodestyle`
@@ -146,6 +150,24 @@
 enabled: true
 ```
 
+BROWSER
+-------
+
+If the webpage you are trying to watch runs client-side JavaScript to
+render the page, [Requests-HTML](http://html.python-requests.org) can
+now be used to render the page in a headless Chromium instance first
+and then use the HTML of the resulting page.
+
+Use the `browser` kind in the configuration and the `navigate` key to set the
+URL to retrieve. Note that the normal `url` job keys are not supported
+for the `browser` job type at the moment, for example:
+
+```yaml
+kind: browser
+name: "A Page With JavaScript"
+navigate: http://example.org/
+```
+
 E-MAIL VIA GMAIL SMTP
 ---------------------

diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/urlwatch-2.9/lib/urlwatch/__init__.py new/urlwatch-2.10/lib/urlwatch/__init__.py
--- old/urlwatch-2.9/lib/urlwatch/__init__.py	2018-03-24 13:18:25.000000000 +0100
+++ new/urlwatch-2.10/lib/urlwatch/__init__.py	2018-05-17 23:05:34.000000000 +0200
@@ -12,5 +12,5 @@
 __author__ = 'Thomas Perl <[email protected]>'
 __license__ = 'BSD'
 __url__ = 'https://thp.io/2008/urlwatch/'
-__version__ = '2.9'
+__version__ = '2.10'
 __user_agent__ = '%s/%s (+https://thp.io/2008/urlwatch/info.html)' % (pkgname, __version__)

diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/urlwatch-2.9/lib/urlwatch/command.py new/urlwatch-2.10/lib/urlwatch/command.py
--- old/urlwatch-2.9/lib/urlwatch/command.py	2018-03-24 13:18:25.000000000 +0100
+++ new/urlwatch-2.10/lib/urlwatch/command.py	2018-05-17 23:05:34.000000000 +0200
@@ -32,13 +32,12 @@
 import logging
 import os
 import shutil
-import subprocess
 import sys
 
 from .filters import FilterBase
 from .jobs import JobBase
 from .reporters import ReporterBase
-from .util import atomic_rename
+from .util import atomic_rename, edit_file
 from .mailer import set_password, have_password
 
 logger = logging.getLogger(__name__)
@@ -51,14 +50,6 @@
         self.urlwatch_config = urlwatcher.urlwatch_config
 
     def edit_hooks(self):
-
-        editor = os.environ.get('EDITOR', None)
-        if editor is None:
-            editor = os.environ.get('VISUAL', None)
-        if editor is None:
-            print('Please set $VISUAL or $EDITOR.')
-            return 1
-
         fn_base, fn_ext = os.path.splitext(self.urlwatch_config.hooks)
         hooks_edit = fn_base + '.edit' + fn_ext
         try:
@@ -67,10 +58,12 @@
             elif self.urlwatch_config.hooks_py_example is not None and os.path.exists(
                     self.urlwatch_config.hooks_py_example):
                 shutil.copy(self.urlwatch_config.hooks_py_example, hooks_edit)
-            subprocess.check_call([editor, hooks_edit])
+            edit_file(hooks_edit)
             imp.load_source('hooks', hooks_edit)
             atomic_rename(hooks_edit, self.urlwatch_config.hooks)
             print('Saving edit changes in', self.urlwatch_config.hooks)
+        except SystemExit:
+            raise
         except Exception as e:
             print('Parsing failed:')
             print('======')
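The README section above explains the new `browser` job kind from the user's side. The sketch below mirrors what `BrowserJob.retrieve()` does in the jobs.py diff further down; it is not urlwatch API, just the underlying Requests-HTML calls with an example URL.

```python
# What a "browser" job does under the hood (mirrors BrowserJob.retrieve()).
from requests_html import HTMLSession  # python3 -m pip install requests-html

session = HTMLSession()
response = session.get('http://example.org/')  # the job's "navigate" URL
print(response.html.html)  # HTML that urlwatch diffs against the cached copy
```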
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/urlwatch-2.9/lib/urlwatch/handler.py new/urlwatch-2.10/lib/urlwatch/handler.py
--- old/urlwatch-2.9/lib/urlwatch/handler.py	2018-03-24 13:18:25.000000000 +0100
+++ new/urlwatch-2.10/lib/urlwatch/handler.py	2018-05-17 23:05:34.000000000 +0200
@@ -34,6 +34,7 @@
 import traceback
 
 from .filters import FilterBase
+from .jobs import NotModifiedError
 from .reporters import ReporterBase
 
 logger = logging.getLogger(__name__)
@@ -49,12 +50,13 @@
         self.timestamp = None
         self.exception = None
         self.traceback = None
+        self.tries = 0
 
     def load(self):
-        self.old_data, self.timestamp = self.cache_storage.load(self.job, self.job.get_guid())
+        self.old_data, self.timestamp, self.tries = self.cache_storage.load(self.job, self.job.get_guid())
 
     def save(self):
-        self.cache_storage.save(self.job, self.job.get_guid(), self.new_data, time.time())
+        self.cache_storage.save(self.job, self.job.get_guid(), self.new_data, time.time(), self.tries)
 
     def process(self):
         logger.info('Processing: %s', self.job)
@@ -82,9 +84,14 @@
                     subfilter = None
                 data = FilterBase.process(filter_kind, subfilter, self, data)
             self.new_data = data
+            self.tries = 0
+
         except Exception as e:
             self.exception = e
             self.traceback = traceback.format_exc()
+            if not isinstance(e, NotModifiedError):
+                self.tries += 1
+                logger.debug('Increasing number of tries to %i for %s', self.tries, self.job)
 
         return self
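The retry bookkeeping added to `JobState.process()` above can be restated as a standalone sketch (simplified, hypothetical names, not urlwatch's actual classes): the counter resets on success, is left untouched for HTTP 304, and grows for real failures.

```python
# Simplified model of the new per-job try counter (hypothetical names).
class NotModifiedError(Exception):
    """Stands in for urlwatch's HTTP-304 exception."""

def updated_tries(fetch, tries):
    """Return the try counter after one fetch attempt."""
    try:
        fetch()
        return 0              # success resets the counter
    except NotModifiedError:
        return tries          # an unchanged page is not a failure
    except Exception:
        return tries + 1      # real errors increase the counter

assert updated_tries(lambda: None, 3) == 0
```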
+ """ + headers_to_remove = [x for x in headers if x.lower() in [y.lower() for y in self.headers]] + for header in headers_to_remove: + headers.pop(header, None) + headers.update(self.headers) + + +class BrowserJob(Job): + """Retrieve an URL, emulating a real web browser""" + + __kind__ = 'browser' + + __required__ = ('navigate',) + + def get_location(self): + return self.navigate + + def retrieve(self, job_state): + from requests_html import HTMLSession + session = HTMLSession() + response = session.get(self.navigate) + return response.html.html diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/urlwatch-2.9/lib/urlwatch/storage.py new/urlwatch-2.10/lib/urlwatch/storage.py --- old/urlwatch-2.9/lib/urlwatch/storage.py 2018-03-24 13:18:25.000000000 +0100 +++ new/urlwatch-2.10/lib/urlwatch/storage.py 2018-05-17 23:05:34.000000000 +0200 @@ -35,15 +35,12 @@ from abc import ABCMeta, abstractmethod import shutil - -import subprocess -import shlex import yaml import json import minidb import logging -from .util import atomic_rename +from .util import atomic_rename, edit_file from .jobs import JobBase, UrlJob, ShellJob logger = logging.getLogger(__name__) @@ -164,14 +161,6 @@ ... def edit(self, example_file=None): - - editor = os.environ.get('EDITOR', None) - if editor is None: - editor = os.environ.get('VISUAL', None) - if editor is None: - print('Please set $VISUAL or $EDITOR.') - return 1 - fn_base, fn_ext = os.path.splitext(self.filename) file_edit = fn_base + '.edit' + fn_ext @@ -182,13 +171,13 @@ while True: try: - editor = shlex.split(editor) - editor.append(file_edit) - subprocess.check_call(editor) + edit_file(file_edit) # Check if we can still parse it if self.parse is not None: self.parse(file_edit) break # stop if no exception on parser + except SystemExit: + raise except Exception as e: print('Parsing failed:') print('======') @@ -370,7 +359,7 @@ ... @abstractmethod - def save(self, job, guid, data, timestamp): + def save(self, job, guid, data, timestamp, tries): ... 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/urlwatch-2.9/lib/urlwatch/storage.py new/urlwatch-2.10/lib/urlwatch/storage.py
--- old/urlwatch-2.9/lib/urlwatch/storage.py	2018-03-24 13:18:25.000000000 +0100
+++ new/urlwatch-2.10/lib/urlwatch/storage.py	2018-05-17 23:05:34.000000000 +0200
@@ -35,15 +35,12 @@
 
 from abc import ABCMeta, abstractmethod
 import shutil
-
-import subprocess
-import shlex
 import yaml
 import json
 import minidb
 import logging
 
-from .util import atomic_rename
+from .util import atomic_rename, edit_file
 from .jobs import JobBase, UrlJob, ShellJob
 
 logger = logging.getLogger(__name__)
@@ -164,14 +161,6 @@
         ...
 
     def edit(self, example_file=None):
-
-        editor = os.environ.get('EDITOR', None)
-        if editor is None:
-            editor = os.environ.get('VISUAL', None)
-        if editor is None:
-            print('Please set $VISUAL or $EDITOR.')
-            return 1
-
         fn_base, fn_ext = os.path.splitext(self.filename)
         file_edit = fn_base + '.edit' + fn_ext
 
@@ -182,13 +171,13 @@
 
         while True:
             try:
-                editor = shlex.split(editor)
-                editor.append(file_edit)
-                subprocess.check_call(editor)
+                edit_file(file_edit)
                 # Check if we can still parse it
                 if self.parse is not None:
                     self.parse(file_edit)
                 break  # stop if no exception on parser
+            except SystemExit:
+                raise
             except Exception as e:
                 print('Parsing failed:')
                 print('======')
@@ -370,7 +359,7 @@
         ...
 
     @abstractmethod
-    def save(self, job, guid, data, timestamp):
+    def save(self, job, guid, data, timestamp, tries):
         ...
 
     @abstractmethod
@@ -383,12 +372,12 @@
 
     def backup(self):
         for guid in self.get_guids():
-            data, timestamp = self.load(None, guid)
-            yield guid, data, timestamp
+            data, timestamp, tries = self.load(None, guid)
+            yield guid, data, timestamp, tries
 
     def restore(self, entries):
-        for guid, data, timestamp in entries:
-            self.save(None, guid, data, timestamp)
+        for guid, data, timestamp, tries in entries:
+            self.save(None, guid, data, timestamp, tries)
 
     def gc(self, known_guids):
         for guid in set(self.get_guids()) - set(known_guids):
@@ -453,6 +442,7 @@
     guid = str
     timestamp = int
     data = str
+    tries = int
 
 
 class CacheMiniDBStorage(CacheStorage):
@@ -474,15 +464,15 @@
         return (guid for guid, in CacheEntry.query(self.db, minidb.Function('distinct', CacheEntry.c.guid)))
 
     def load(self, job, guid):
-        for data, timestamp in CacheEntry.query(self.db, CacheEntry.c.data // CacheEntry.c.timestamp,
-                                                order_by=CacheEntry.c.timestamp.desc,
-                                                where=CacheEntry.c.guid == guid, limit=1):
-            return data, timestamp
+        for data, timestamp, tries in CacheEntry.query(self.db, CacheEntry.c.data // CacheEntry.c.timestamp // CacheEntry.c.tries,
+                                                       order_by=minidb.columns(CacheEntry.c.timestamp.desc, CacheEntry.c.tries.desc),
+                                                       where=CacheEntry.c.guid == guid, limit=1):
+            return data, timestamp, tries
 
-        return None, None
+        return None, None, 0
 
-    def save(self, job, guid, data, timestamp):
-        self.db.save(CacheEntry(guid=guid, timestamp=timestamp, data=data))
+    def save(self, job, guid, data, timestamp, tries):
+        self.db.save(CacheEntry(guid=guid, timestamp=timestamp, data=data, tries=tries))
         self.db.commit()
 
     def delete(self, guid):

diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/urlwatch-2.9/lib/urlwatch/util.py new/urlwatch-2.10/lib/urlwatch/util.py
--- old/urlwatch-2.9/lib/urlwatch/util.py	2018-03-24 13:18:25.000000000 +0100
+++ new/urlwatch-2.10/lib/urlwatch/util.py	2018-05-17 23:05:34.000000000 +0200
@@ -31,6 +31,8 @@
 import logging
 import os
 import platform
+import subprocess
+import shlex
 
 logger = logging.getLogger(__name__)
 
@@ -87,3 +89,13 @@
             os.remove(new_old_filename)
     else:
         os.rename(old_filename, new_filename)
+
+
+def edit_file(filename):
+    editor = os.environ.get('EDITOR', None)
+    if not editor:
+        editor = os.environ.get('VISUAL', None)
+    if not editor:
+        raise SystemExit('Please set $VISUAL or $EDITOR.')
+
+    subprocess.check_call(shlex.split(editor) + [filename])
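The `edit_file()` helper above is where the "$EDITOR contains spaces" fix lives: `shlex.split()` turns an editor setting that includes arguments into a proper argument vector, where the old code treated the whole string as one program name. A quick illustration (the editor value is an example):

```python
import shlex

editor = 'code --wait'                      # an $EDITOR value with arguments
print(shlex.split(editor) + ['urls.yaml'])  # ['code', '--wait', 'urls.yaml']
# The old code ran subprocess.check_call([editor, filename]), i.e. it tried
# to execute a program literally named "code --wait".
```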
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/urlwatch-2.9/lib/urlwatch/worker.py new/urlwatch-2.10/lib/urlwatch/worker.py
--- old/urlwatch-2.9/lib/urlwatch/worker.py	2018-03-24 13:18:25.000000000 +0100
+++ new/urlwatch-2.10/lib/urlwatch/worker.py	2018-05-17 23:05:34.000000000 +0200
@@ -60,16 +60,30 @@
                                       (JobState(cache_storage, job) for job in jobs)):
         logger.debug('Job finished: %s', job_state.job)
 
+        if not job_state.job.max_tries:
+            max_tries = 0
+        else:
+            max_tries = job_state.job.max_tries
+        logger.debug('Using max_tries of %i for %s', max_tries, job_state.job)
+
         if job_state.exception is not None:
             if isinstance(job_state.exception, NotModifiedError):
                 logger.info('Job %s has not changed (HTTP 304)', job_state.job)
                 report.unchanged(job_state)
-            elif isinstance(job_state.exception, requests.exceptions.RequestException):
-                # Instead of a full traceback, just show the HTTP error
-                job_state.traceback = str(job_state.exception)
-                report.error(job_state)
-            else:
-                report.error(job_state)
+            elif job_state.tries < max_tries:
+                logger.debug('This was try %i of %i for job %s', job_state.tries,
+                             max_tries, job_state.job)
+                job_state.save()
+            elif job_state.tries >= max_tries:
+                logger.debug('We are now at %i tries ', job_state.tries)
+                job_state.save()
+                if isinstance(job_state.exception, requests.exceptions.RequestException):
+                    # Instead of a full traceback, just show the HTTP error
+                    job_state.traceback = str(job_state.exception)
+                    report.error(job_state)
+                else:
+                    report.error(job_state)
+
         elif job_state.old_data is not None:
             if job_state.old_data.splitlines() != job_state.new_data.splitlines():
                 report.changed(job_state)

diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/urlwatch-2.9/test/data/invalid-url.yaml new/urlwatch-2.10/test/data/invalid-url.yaml
--- old/urlwatch-2.9/test/data/invalid-url.yaml	1970-01-01 01:00:00.000000000 +0100
+++ new/urlwatch-2.10/test/data/invalid-url.yaml	2018-05-17 23:05:34.000000000 +0200
@@ -0,0 +1,4 @@
+name: "invalid url"
+url: "https://invalid"
+max_tries: 2
+---

diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/urlwatch-2.9/test/test_handler.py new/urlwatch-2.10/test/test_handler.py
--- old/urlwatch-2.9/test/test_handler.py	2018-03-24 13:18:25.000000000 +0100
+++ new/urlwatch-2.10/test/test_handler.py	2018-05-17 23:05:34.000000000 +0200
@@ -139,3 +139,76 @@
         'unknown_key': 123,
         'name': 'hoho',
     })
+
+
+def prepare_retry_test():
+    urls = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'test', 'data', 'invalid-url.yaml')
+    config = os.path.join(os.path.dirname(__file__), 'data', 'urlwatch.yaml')
+    cache = os.path.join(os.path.dirname(__file__), 'data', 'cache.db')
+    hooks = ''
+
+    config_storage = YamlConfigStorage(config)
+    cache_storage = CacheMiniDBStorage(cache)
+    urls_storage = UrlsYaml(urls)
+
+    urlwatch_config = TestConfig(config, urls, cache, hooks, True)
+    urlwatcher = Urlwatch(urlwatch_config, config_storage, cache_storage, urls_storage)
+
+    return urlwatcher, cache_storage
+
+
+@with_setup(teardown=teardown_func)
+def test_number_of_tries_in_cache_is_increased():
+    urlwatcher, cache_storage = prepare_retry_test()
+    job = urlwatcher.jobs[0]
+    old_data, timestamp, tries = cache_storage.load(job, job.get_guid())
+    assert tries == 0
+
+    urlwatcher.run_jobs()
+    urlwatcher.run_jobs()
+
+    job = urlwatcher.jobs[0]
+    old_data, timestamp, tries = cache_storage.load(job, job.get_guid())
+
+    assert tries == 2
+    assert urlwatcher.report.job_states[-1].verb == 'error'
+
+
+@with_setup(teardown=teardown_func)
+def test_report_error_when_out_of_tries():
+    urlwatcher, cache_storage = prepare_retry_test()
+
+    job = urlwatcher.jobs[0]
+    old_data, timestamp, tries = cache_storage.load(job, job.get_guid())
+    assert tries == 0
+
+    urlwatcher.run_jobs()
+    urlwatcher.run_jobs()
+
+    report = urlwatcher.report
+    assert report.job_states[-1].verb == 'error'
+
+
+@with_setup(teardown=teardown_func)
+def test_reset_tries_to_zero_when_successful():
+    urlwatcher, cache_storage = prepare_retry_test()
+
+    job = urlwatcher.jobs[0]
+    old_data, timestamp, tries = cache_storage.load(job, job.get_guid())
+    assert tries == 0
+
+    urlwatcher.run_jobs()
+
+    job = urlwatcher.jobs[0]
+    old_data, timestamp, tries = cache_storage.load(job, job.get_guid())
+    assert tries == 1
+
+    # use an url that definitely exists
+    job = urlwatcher.jobs[0]
+    job.url = 'file://' + os.path.join(os.path.dirname(__file__), 'data', 'urlwatch.yaml')
+
+    urlwatcher.run_jobs()
+
+    job = urlwatcher.jobs[0]
+    old_data, timestamp, tries = cache_storage.load(job, job.get_guid())
+    assert tries == 0
