tags 862004 + moreinfo
thanks
Hello Manolo,
thank you for spending your time helping to make Debian better with
this bug report.
I have checked your issue and have a quick fix for it.
Could you please test it?
To do so, you must first change your setup by adding the Accept and
User-agent entries:
{'shortname': 'WMO Library',
'type': 'html',
'uri':
'https://library.wmo.int/opac/index.php?lvl=infopages=en_UK=1',
'Accept':
'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0',
'contentxpath': '//*[@id="overview"]/tbody/tr[1]/td[1]'
}
Then back up the original with
cp /usr/share/mwc/mwc.py /usr/share/mwc/mwc.py.org
copy the attached mwc.py to /usr/share/mwc,
and test it.
Many thanks!
CU
Jörg
--
New:
GPG Fingerprint: 63E0 075F C8D4 3ABB 35AB 30EE 09F8 9F3C 8CA1 D25D
GPG key (long) : 09F89F3C8CA1D25D
GPG Key: 8CA1D25D
CAcert Key S/N : 0E:D4:56
Old pgp Key: BE581B6E (revoked since 2014-12-31).
Jörg Frings-Fürst
D-54470 Lieser
Threema: SYR8SJXB
Wire: @joergfringsfuerst
IRC: j_...@freenode.net
j_...@oftc.net
My wish list:
- Please send me a picture from the nature at your home.
#!/usr/bin/python3
# Copyright: (2013-2014) Michael Till Beck
# License: GPL-2.0+
import urllib.request, urllib.error, urllib.parse
import urllib.parse
from lxml import etree
from cssselect import GenericTranslator
import re
import io
import smtplib
from email.mime.text import MIMEText
from email.header import Header
from urllib.parse import urljoin
import os
import sys
import getopt
import traceback
import syslog
import subprocess
import time
from time import strftime
import random
import importlib
# Module-level configuration object; starts out unset.
# NOTE(review): presumably filled in at startup from a dynamically imported
# config module (importlib is imported above) -- confirm.
config = None
# Fallback character encoding used when a site entry has no 'encoding' key.
defaultEncoding = 'utf-8'
# NOTE(review): presumably the maximum number of characters kept for a
# generated title; no use is visible in this part of the file -- confirm.
maxTitleLength = 150
# This is what an empty feed looks like.
# NOTE(review): only text content is present in this literal; any surrounding
# feed markup appears to be missing -- confirm against the upstream file.
emptyfeed = """
MailWebsiteChanges Feed
https://github.com/Debianguru/MailWebsiteChanges
MailWebsiteChanges Feed
"""
# [XPath selector, attribute name] pairs for HTML attributes that store URI
# values. toAbsoluteURIs() rewrites these values to absolute URIs.
uriAttributes = [['//img[@src]', 'src'], ['//a[@href]', 'href']]
# URI prefix marking a site whose content is the output of a shell command
# rather than a downloaded web page (checked in parseSite).
cmdscheme = 'cmd://'
# NOTE(review): presumably a cached SMTP session reused between mails; it is
# not used in this part of the file -- confirm.
mailsession = None
# Rewrites, in place, every relative URI found in the given trees so that it
# becomes absolute.
#   trees:   iterable of parsed trees/elements offering an xpath() method
#   baseuri: URI that relative references are resolved against
# Only the attributes listed in uriAttributes are considered; values that
# already carry a scheme are left untouched. Nothing is returned.
def toAbsoluteURIs(trees, baseuri):
    for tree in trees:
        for selector, attrname in uriAttributes:
            for element in tree.xpath(selector):
                value = element.attrib.get(attrname)
                if value is None:
                    continue
                # a non-empty scheme means the URI is already absolute
                if urllib.parse.urlparse(value).scheme != '':
                    continue
                element.attrib[attrname] = urllib.parse.urljoin(baseuri, value)
def parseSite(site):
file, content, titles, warning = None, None, None, None
uri = site['uri']
contenttype = site.get('type', 'html')
contentregex = site.get('contentregex', '')
titleregex = site.get('titleregex', '')
UserAgent = site.get('User-agent', '')
Accept = site.get('Accept', '')
enc = site.get('encoding', defaultEncoding)
contentxpath = site.get('contentxpath', '')
if contentxpath == '' and site.get('contentcss', '') != '':
# CSS
contentxpath = GenericTranslator().css_to_xpath(site.get('contentcss'))
titlexpath = site.get('titlexpath', '')
if titlexpath == '' and site.get('titlecss', '') != '':
titlexpath = GenericTranslator().css_to_xpath(site.get('titlecss'))
try:
if uri.startswith(cmdscheme):
# run command and retrieve output
process = subprocess.Popen(uri[len(cmdscheme):], stdout=subprocess.PIPE, shell=True, close_fds=True)
file = process.stdout
else:
# open website
req = urllib.request.Request(uri)
if UserAgent != '':
req.add_header('User-agent', UserAgent)
if Accept != '':
req.add_header('Accept', Accept)
file = urllib.request.urlopen(req)
if contenttype == 'text' or (contentxpath == '' and titlexpath == ''):
contents = [file.read().decode(enc)]
titles = []
else:
baseuri = uri
if contenttype == 'html':
parser = etree.HTMLParser(encoding=enc)
else:
parser = etree.XMLParser(recover=True, encoding=enc)
tree = etree.parse(file, parser)
# xpath
contentresult = tree.xpath(contentxpath) if