Re: [PATCH 05/10] parser: deal with headers entirely failing to parse
On 28/06/17 17:48, Daniel Axtens wrote: It turns out that the attempts in clean_header() to convert headers to strings are not guaranteed to work: you can end up with, for example, a base64 decoding error which makes it impossible to determine any header content. In this case, sanitise_header() should return None, and thus clean_header() should return None. We then need to plumb that through. Signed-off-by: Daniel AxtensAll looks good Reviewed-by: Andrew Donnellan --- patchwork/parser.py | 70 + 1 file changed, 55 insertions(+), 15 deletions(-) diff --git a/patchwork/parser.py b/patchwork/parser.py index 032af8a7be7c..203e11584504 100644 --- a/patchwork/parser.py +++ b/patchwork/parser.py @@ -23,6 +23,7 @@ from email.header import decode_header from email.header import make_header from email.utils import mktime_tz from email.utils import parsedate_tz +from email.errors import HeaderParseError from fnmatch import fnmatch import logging import re @@ -67,6 +68,13 @@ def sanitise_header(header_contents, header_name=None): then encode the result with make_header() """ +try: +value = decode_header(header_contents) +except HeaderParseError: +# something has gone really wrong with header parsing. +# (e.g. base64 decoding) We probably can't recover, so: +return None + # We have some Py2/Py3 issues here. # # Firstly, the email parser (before we get here) @@ -85,7 +93,6 @@ def sanitise_header(header_contents, header_name=None): # We solve this by catching any Unicode errors, and then manually # handling any interesting headers. -value = decode_header(header_contents) try: header = make_header(value, header_name=header_name, @@ -130,6 +137,9 @@ def clean_header(header): sane_header = sanitise_header(header) +if sane_header is None: +return None + # on Py2, we want to do unicode(), on Py3, str(). # That gets us the decoded, un-wrapped header. if six.PY2: @@ -157,9 +167,12 @@ def find_project_by_header(mail): for header in list_id_headers: if header in mail: +h = clean_header(mail.get(header)) +if not h: +continue for listid_re in listid_res: -match = listid_re.match(clean_header(mail.get(header))) +match = listid_re.match(h) if match: break @@ -205,7 +218,11 @@ def _find_series_by_references(project, mail): Returns: The matching ``Series`` instance, if any """ -for ref in [clean_header(mail.get('Message-Id'))] + find_references(mail): +refs = find_references(mail) +h = clean_header(mail.get('Message-Id')) +if h: +refs = [h] + refs +for ref in refs: try: return SeriesReference.objects.get( msgid=ref, series__project=project).series @@ -274,6 +291,10 @@ def find_series(project, mail): def find_author(mail): from_header = clean_header(mail.get('From')) + +if not from_header: +raise ValueError("Invalid 'From' header") + name, email = (None, None) # tuple of (regex, fn) @@ -320,7 +341,10 @@ def find_author(mail): def find_date(mail): -t = parsedate_tz(clean_header(mail.get('Date', ''))) +h = clean_header(mail.get('Date', '')) +if not h: +return datetime.datetime.utcnow() +t = parsedate_tz(h) if not t: return datetime.datetime.utcnow() return datetime.datetime.utcfromtimestamp(mktime_tz(t)) @@ -331,7 +355,7 @@ def find_headers(mail): for key, value in mail.items()] strings = [('%s: %s' % (key, header.encode())) - for (key, header) in headers] + for (key, header) in headers if header is not None] return '\n'.join(strings) @@ -347,11 +371,16 @@ def find_references(mail): if 'In-Reply-To' in mail: for in_reply_to in mail.get_all('In-Reply-To'): -refs.append(clean_header(in_reply_to).strip()) +r = clean_header(in_reply_to) +if r: +refs.append(r.strip()) if 'References' in mail: for references_header in mail.get_all('References'): -references = clean_header(references_header).split() +h = clean_header(references_header) +if not h: +continue +references = h.split() references.reverse() for ref in references: ref = ref.strip() @@ -573,6 +602,9 @@ def clean_subject(subject, drop_prefixes=None): prefix_re = re.compile(r'^\[([^\]]*)\]\s*(.*)$') subject = clean_header(subject) +if not subject: +raise ValueError("Invalid 'Subject' header") + if drop_prefixes is None: drop_prefixes = []
[PATCH 05/10] parser: deal with headers entirely failing to parse
It turns out that the attempts in clean_header() to convert headers to strings are not guaranteed to work: you can end up with, for example, a base64 decoding error which makes it impossible to determine any header content. In this case, sanitise_header() should return None, and thus clean_header() should return None. We then need to plumb that through. Signed-off-by: Daniel Axtens--- patchwork/parser.py | 70 + 1 file changed, 55 insertions(+), 15 deletions(-) diff --git a/patchwork/parser.py b/patchwork/parser.py index 032af8a7be7c..203e11584504 100644 --- a/patchwork/parser.py +++ b/patchwork/parser.py @@ -23,6 +23,7 @@ from email.header import decode_header from email.header import make_header from email.utils import mktime_tz from email.utils import parsedate_tz +from email.errors import HeaderParseError from fnmatch import fnmatch import logging import re @@ -67,6 +68,13 @@ def sanitise_header(header_contents, header_name=None): then encode the result with make_header() """ +try: +value = decode_header(header_contents) +except HeaderParseError: +# something has gone really wrong with header parsing. +# (e.g. base64 decoding) We probably can't recover, so: +return None + # We have some Py2/Py3 issues here. # # Firstly, the email parser (before we get here) @@ -85,7 +93,6 @@ def sanitise_header(header_contents, header_name=None): # We solve this by catching any Unicode errors, and then manually # handling any interesting headers. -value = decode_header(header_contents) try: header = make_header(value, header_name=header_name, @@ -130,6 +137,9 @@ def clean_header(header): sane_header = sanitise_header(header) +if sane_header is None: +return None + # on Py2, we want to do unicode(), on Py3, str(). # That gets us the decoded, un-wrapped header. if six.PY2: @@ -157,9 +167,12 @@ def find_project_by_header(mail): for header in list_id_headers: if header in mail: +h = clean_header(mail.get(header)) +if not h: +continue for listid_re in listid_res: -match = listid_re.match(clean_header(mail.get(header))) +match = listid_re.match(h) if match: break @@ -205,7 +218,11 @@ def _find_series_by_references(project, mail): Returns: The matching ``Series`` instance, if any """ -for ref in [clean_header(mail.get('Message-Id'))] + find_references(mail): +refs = find_references(mail) +h = clean_header(mail.get('Message-Id')) +if h: +refs = [h] + refs +for ref in refs: try: return SeriesReference.objects.get( msgid=ref, series__project=project).series @@ -274,6 +291,10 @@ def find_series(project, mail): def find_author(mail): from_header = clean_header(mail.get('From')) + +if not from_header: +raise ValueError("Invalid 'From' header") + name, email = (None, None) # tuple of (regex, fn) @@ -320,7 +341,10 @@ def find_author(mail): def find_date(mail): -t = parsedate_tz(clean_header(mail.get('Date', ''))) +h = clean_header(mail.get('Date', '')) +if not h: +return datetime.datetime.utcnow() +t = parsedate_tz(h) if not t: return datetime.datetime.utcnow() return datetime.datetime.utcfromtimestamp(mktime_tz(t)) @@ -331,7 +355,7 @@ def find_headers(mail): for key, value in mail.items()] strings = [('%s: %s' % (key, header.encode())) - for (key, header) in headers] + for (key, header) in headers if header is not None] return '\n'.join(strings) @@ -347,11 +371,16 @@ def find_references(mail): if 'In-Reply-To' in mail: for in_reply_to in mail.get_all('In-Reply-To'): -refs.append(clean_header(in_reply_to).strip()) +r = clean_header(in_reply_to) +if r: +refs.append(r.strip()) if 'References' in mail: for references_header in mail.get_all('References'): -references = clean_header(references_header).split() +h = clean_header(references_header) +if not h: +continue +references = h.split() references.reverse() for ref in references: ref = ref.strip() @@ -573,6 +602,9 @@ def clean_subject(subject, drop_prefixes=None): prefix_re = re.compile(r'^\[([^\]]*)\]\s*(.*)$') subject = clean_header(subject) +if not subject: +raise ValueError("Invalid 'Subject' header") + if drop_prefixes is None: drop_prefixes = [] else: @@ -610,7 +642,11 @@ def subject_check(subject): """Determine if a mail is a reply."""