On Fri, 2022-05-06 at 18:49 +0100, Stephen Finucane wrote: > We recently started stripping comments and folding white space from the > In-Reply-To and References headers. Do so also for the Message-ID > header. > > Signed-off-by: Stephen Finucane <step...@that.guru> > Related: #399 > --- > patchwork/parser.py | 43 ++++++++++++++++++++++++++-------- > patchwork/tests/test_parser.py | 32 +++++++++++++++++++++++++ > 2 files changed, 65 insertions(+), 10 deletions(-) > > diff --git patchwork/parser.py patchwork/parser.py > index 17cc2325..f219f466 100644 > --- patchwork/parser.py > +++ patchwork/parser.py > @@ -236,15 +236,14 @@ def _find_series_by_references(project, mail): > name, prefixes = clean_subject(subject, [project.linkname]) > version = parse_version(name, prefixes) > > - refs = find_references(mail) > - h = clean_header(mail.get('Message-Id')) > - if h: > - refs = [h] + refs > + msg_id = find_message_id(mail) > + refs = [msg_id] + find_references(mail) > > for ref in refs: > try: > series = SeriesReference.objects.get( > - msgid=ref[:255], project=project).series > + msgid=ref[:255], project=project, > + ).series > > if series.version != version: > # if the versions don't match, at least make sure these were > @@ -473,6 +472,34 @@ def find_headers(mail): > return '\n'.join(strings) > > > +def find_message_id(mail): > + """Extract the 'message-id' headers from a given mail and validate it. > + > + The validation here is simply checking that the Message-ID is correctly > + formatted per RFC-2822. However, even if it's not we'll attempt to use > what > + we're given because a patch tracked in Patchwork with janky threading is > + better than no patch whatsoever. > + """ > + header = clean_header(mail.get('Message-Id')) > + if not header: > + raise ValueError("Broken 'Message-Id' header") > + > + msgid = _msgid_re.search(header) > + if msgid: > + msgid = msgid.group(0) > + else: > + # This is only info level since the admin likely can't do anything > + # about this > + logger.info( > + "Malformed 'Message-Id' header. The 'msg-id' component should be > " > + "surrounded by angle brackets. Saving raw header. This may " > + "include comments and extra comments." > + )
I wonder if I should do this also for the 'In-Reply-To' field? I can't do it easily for the 'References' field since there's zero way to delineate things if I do, but that shouldn't matter since as long as we don't drop patches and things appear roughly in order, we don't really need the 'References' field: 'In-Reply-To' is enough to find *a* parent. Stephen > + msgid = header > + > + return msgid[:255] > + > + > def find_references(mail): > """Construct a list of possible reply message ids. > > @@ -1062,11 +1089,7 @@ def parse_mail(mail, list_id=None): > > # parse metadata > > - msgid = clean_header(mail.get('Message-Id')) > - if not msgid: > - raise ValueError("Broken 'Message-Id' header") > - msgid = msgid[:255] > - > + msgid = find_message_id(mail) > subject = mail.get('Subject') > name, prefixes = clean_subject(subject, [project.linkname]) > is_comment = subject_check(subject) > diff --git patchwork/tests/test_parser.py patchwork/tests/test_parser.py > index f65ad4b1..980a8afb 100644 > --- patchwork/tests/test_parser.py > +++ patchwork/tests/test_parser.py > @@ -1265,6 +1265,38 @@ class DuplicateMailTest(TestCase): > self.assertEqual(Cover.objects.count(), 1) > > > +class TestFindMessageID(TestCase): > + > + def test_find_message_id__missing_header(self): > + email = create_email('test') > + del email['Message-Id'] > + email['Message-Id'] = '' > + > + with self.assertRaises(ValueError) as cm: > + parser.find_message_id(email) > + self.assertIn("Broken 'Message-Id' header", str(cm.exeception)) > + > + def test_find_message_id__header_with_comments(self): > + """Test that we strip comments from the Message-ID field.""" > + message_id = '<xnzgy1de8d....@rhel8.vm> (message ID with a comment)' > + email = create_email('test', msgid=message_id) > + > + expected = '<xnzgy1de8d....@rhel8.vm>' > + actual = parser.find_message_id(email) > + > + self.assertEqual(expected, actual) > + > + def test_find_message_id__invalid_header_fallback(self): > + """Test that we accept badly formatted Message-ID fields.""" > + message_id = '5899d592-8c87-47d9-92b6-d34260ce1...@radware.com>' > + email = create_email('test', msgid=message_id) > + > + expected = '5899d592-8c87-47d9-92b6-d34260ce1...@radware.com>' > + actual = parser.find_message_id(email) > + > + self.assertEqual(expected, actual) > + > + > class TestFindReferences(TestCase): > > def test_find_references__header_with_comments(self): _______________________________________________ Patchwork mailing list Patchwork@lists.ozlabs.org https://lists.ozlabs.org/listinfo/patchwork