Re: [PATCH 05/10] parser: deal with headers entirely failing to parse

2017-06-28 Thread Andrew Donnellan



On 28/06/17 17:48, Daniel Axtens wrote:

It turns out that the attempts in clean_header() to convert
headers to strings are not guaranteed to work: you can end up with,
for example, a base64 decoding error which makes it impossible
to determine any header content.

In this case, sanitise_header() should return None, and thus
clean_header() should return None. We then need to plumb that
through.

Signed-off-by: Daniel Axtens 


All looks good

Reviewed-by: Andrew Donnellan 


---
 patchwork/parser.py | 70 +
 1 file changed, 55 insertions(+), 15 deletions(-)

diff --git a/patchwork/parser.py b/patchwork/parser.py
index 032af8a7be7c..203e11584504 100644
--- a/patchwork/parser.py
+++ b/patchwork/parser.py
@@ -23,6 +23,7 @@ from email.header import decode_header
 from email.header import make_header
 from email.utils import mktime_tz
 from email.utils import parsedate_tz
+from email.errors import HeaderParseError
 from fnmatch import fnmatch
 import logging
 import re
@@ -67,6 +68,13 @@ def sanitise_header(header_contents, header_name=None):
 then encode the result with make_header()
 """

+try:
+value = decode_header(header_contents)
+except HeaderParseError:
+# something has gone really wrong with header parsing.
+# (e.g. base64 decoding) We probably can't recover, so:
+return None
+
 # We have some Py2/Py3 issues here.
 #
 # Firstly, the email parser (before we get here)
@@ -85,7 +93,6 @@ def sanitise_header(header_contents, header_name=None):
 # We solve this by catching any Unicode errors, and then manually
 # handling any interesting headers.

-value = decode_header(header_contents)
 try:
 header = make_header(value,
  header_name=header_name,
@@ -130,6 +137,9 @@ def clean_header(header):

 sane_header = sanitise_header(header)

+if sane_header is None:
+return None
+
 # on Py2, we want to do unicode(), on Py3, str().
 # That gets us the decoded, un-wrapped header.
 if six.PY2:
@@ -157,9 +167,12 @@ def find_project_by_header(mail):

 for header in list_id_headers:
 if header in mail:
+h = clean_header(mail.get(header))
+if not h:
+continue

 for listid_re in listid_res:
-match = listid_re.match(clean_header(mail.get(header)))
+match = listid_re.match(h)
 if match:
 break

@@ -205,7 +218,11 @@ def _find_series_by_references(project, mail):
 Returns:
 The matching ``Series`` instance, if any
 """
-for ref in [clean_header(mail.get('Message-Id'))] + find_references(mail):
+refs = find_references(mail)
+h = clean_header(mail.get('Message-Id'))
+if h:
+refs = [h] + refs
+for ref in refs:
 try:
 return SeriesReference.objects.get(
 msgid=ref, series__project=project).series
@@ -274,6 +291,10 @@ def find_series(project, mail):

 def find_author(mail):
 from_header = clean_header(mail.get('From'))
+
+if not from_header:
+raise ValueError("Invalid 'From' header")
+
 name, email = (None, None)

 # tuple of (regex, fn)
@@ -320,7 +341,10 @@ def find_author(mail):


 def find_date(mail):
-t = parsedate_tz(clean_header(mail.get('Date', '')))
+h = clean_header(mail.get('Date', ''))
+if not h:
+return datetime.datetime.utcnow()
+t = parsedate_tz(h)
 if not t:
 return datetime.datetime.utcnow()
 return datetime.datetime.utcfromtimestamp(mktime_tz(t))
@@ -331,7 +355,7 @@ def find_headers(mail):
for key, value in mail.items()]

 strings = [('%s: %s' % (key, header.encode()))
-   for (key, header) in headers]
+   for (key, header) in headers if header is not None]

 return '\n'.join(strings)

@@ -347,11 +371,16 @@ def find_references(mail):

 if 'In-Reply-To' in mail:
 for in_reply_to in mail.get_all('In-Reply-To'):
-refs.append(clean_header(in_reply_to).strip())
+r = clean_header(in_reply_to)
+if r:
+refs.append(r.strip())

 if 'References' in mail:
 for references_header in mail.get_all('References'):
-references = clean_header(references_header).split()
+h = clean_header(references_header)
+if not h:
+continue
+references = h.split()
 references.reverse()
 for ref in references:
 ref = ref.strip()
@@ -573,6 +602,9 @@ def clean_subject(subject, drop_prefixes=None):
 prefix_re = re.compile(r'^\[([^\]]*)\]\s*(.*)$')
 subject = clean_header(subject)

+if not subject:
+raise ValueError("Invalid 'Subject' header")
+
 if drop_prefixes is None:
 drop_prefixes = []
  

[PATCH 05/10] parser: deal with headers entirely failing to parse

2017-06-28 Thread Daniel Axtens
It turns out that the attempts in clean_header() to convert
headers to strings are not guaranteed to work: you can end up with,
for example, a base64 decoding error which makes it impossible
to determine any header content.

In this case, sanitise_header() should return None, and thus
clean_header() should return None. We then need to plumb that
through.

Signed-off-by: Daniel Axtens 
---
 patchwork/parser.py | 70 +
 1 file changed, 55 insertions(+), 15 deletions(-)

diff --git a/patchwork/parser.py b/patchwork/parser.py
index 032af8a7be7c..203e11584504 100644
--- a/patchwork/parser.py
+++ b/patchwork/parser.py
@@ -23,6 +23,7 @@ from email.header import decode_header
 from email.header import make_header
 from email.utils import mktime_tz
 from email.utils import parsedate_tz
+from email.errors import HeaderParseError
 from fnmatch import fnmatch
 import logging
 import re
@@ -67,6 +68,13 @@ def sanitise_header(header_contents, header_name=None):
 then encode the result with make_header()
 """
 
+try:
+value = decode_header(header_contents)
+except HeaderParseError:
+# something has gone really wrong with header parsing.
+# (e.g. base64 decoding) We probably can't recover, so:
+return None
+
 # We have some Py2/Py3 issues here.
 #
 # Firstly, the email parser (before we get here)
@@ -85,7 +93,6 @@ def sanitise_header(header_contents, header_name=None):
 # We solve this by catching any Unicode errors, and then manually
 # handling any interesting headers.
 
-value = decode_header(header_contents)
 try:
 header = make_header(value,
  header_name=header_name,
@@ -130,6 +137,9 @@ def clean_header(header):
 
 sane_header = sanitise_header(header)
 
+if sane_header is None:
+return None
+
 # on Py2, we want to do unicode(), on Py3, str().
 # That gets us the decoded, un-wrapped header.
 if six.PY2:
@@ -157,9 +167,12 @@ def find_project_by_header(mail):
 
 for header in list_id_headers:
 if header in mail:
+h = clean_header(mail.get(header))
+if not h:
+continue
 
 for listid_re in listid_res:
-match = listid_re.match(clean_header(mail.get(header)))
+match = listid_re.match(h)
 if match:
 break
 
@@ -205,7 +218,11 @@ def _find_series_by_references(project, mail):
 Returns:
 The matching ``Series`` instance, if any
 """
-for ref in [clean_header(mail.get('Message-Id'))] + find_references(mail):
+refs = find_references(mail)
+h = clean_header(mail.get('Message-Id'))
+if h:
+refs = [h] + refs
+for ref in refs:
 try:
 return SeriesReference.objects.get(
 msgid=ref, series__project=project).series
@@ -274,6 +291,10 @@ def find_series(project, mail):
 
 def find_author(mail):
 from_header = clean_header(mail.get('From'))
+
+if not from_header:
+raise ValueError("Invalid 'From' header")
+
 name, email = (None, None)
 
 # tuple of (regex, fn)
@@ -320,7 +341,10 @@ def find_author(mail):
 
 
 def find_date(mail):
-t = parsedate_tz(clean_header(mail.get('Date', '')))
+h = clean_header(mail.get('Date', ''))
+if not h:
+return datetime.datetime.utcnow()
+t = parsedate_tz(h)
 if not t:
 return datetime.datetime.utcnow()
 return datetime.datetime.utcfromtimestamp(mktime_tz(t))
@@ -331,7 +355,7 @@ def find_headers(mail):
for key, value in mail.items()]
 
 strings = [('%s: %s' % (key, header.encode()))
-   for (key, header) in headers]
+   for (key, header) in headers if header is not None]
 
 return '\n'.join(strings)
 
@@ -347,11 +371,16 @@ def find_references(mail):
 
 if 'In-Reply-To' in mail:
 for in_reply_to in mail.get_all('In-Reply-To'):
-refs.append(clean_header(in_reply_to).strip())
+r = clean_header(in_reply_to)
+if r:
+refs.append(r.strip())
 
 if 'References' in mail:
 for references_header in mail.get_all('References'):
-references = clean_header(references_header).split()
+h = clean_header(references_header)
+if not h:
+continue
+references = h.split()
 references.reverse()
 for ref in references:
 ref = ref.strip()
@@ -573,6 +602,9 @@ def clean_subject(subject, drop_prefixes=None):
 prefix_re = re.compile(r'^\[([^\]]*)\]\s*(.*)$')
 subject = clean_header(subject)
 
+if not subject:
+raise ValueError("Invalid 'Subject' header")
+
 if drop_prefixes is None:
 drop_prefixes = []
 else:
@@ -610,7 +642,11 @@ def subject_check(subject):
 """Determine if a mail is a reply."""