On 06/05/2017 11:01 AM, sebb wrote: > On 5 June 2017 at 09:17, Daniel Gruno <[email protected]> wrote: >> I missed a git add in the last commit, sorry. Will add and recommit now. > > Have you tested that the change is complete? > > I'm still getting an error.
Tested it with a bunch of mbox files, some with, some without headers, subjects, senders etc. All seemed to work. What is the specific error you are getting, and which generator are you using? > >> On 06/05/2017 01:57 AM, sebb wrote: >>> On 4 June 2017 at 15:02, <[email protected]> wrote: >>>> split generators into a file of its own >>>> >>>> Also fix up generators: >>>> - medium goes back to the way it was >>>> - a new 'redundant' generator for cluster setups >>>> >>>> >>>> Project: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/repo >>>> Commit: >>>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/commit/e2d81036 >>>> Tree: >>>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/tree/e2d81036 >>>> Diff: >>>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/diff/e2d81036 >>>> >>>> Branch: refs/heads/master >>>> Commit: e2d8103635db012d13fc6af46d336c96be31d4c1 >>>> Parents: 8b7ede8 >>>> Author: Daniel Gruno <[email protected]> >>>> Authored: Sun Jun 4 15:45:18 2017 +0200 >>>> Committer: Daniel Gruno <[email protected]> >>>> Committed: Sun Jun 4 15:45:18 2017 +0200 >>>> >>>> ---------------------------------------------------------------------- >>>> tools/archiver.py | 17 ++++------- >>>> tools/generators.py | 74 ++++++++++++++++++++++++++++++++++++++++++++++++ >>>> 2 files changed, 80 insertions(+), 11 deletions(-) >>>> ---------------------------------------------------------------------- >>>> >>>> >>>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/e2d81036/tools/archiver.py >>>> ---------------------------------------------------------------------- >>>> diff --git a/tools/archiver.py b/tools/archiver.py >>>> index 41933f7..0966b13 100755 >>>> --- a/tools/archiver.py >>>> +++ b/tools/archiver.py >>>> @@ -58,6 +58,7 @@ import io >>>> import logging >>>> import traceback >>>> import sys >>>> +import generators >>>> >>>> # Fetch config >>>> path = os.path.dirname(os.path.realpath(__file__)) >>>> @@ -316,20 +317,14 
@@ class Archiver(object): >>>> if body is not None or attachments: >>>> pmid = mid >>>> try: >>>> - # Use full message as bytes for mid? >>>> if archiver_generator == "full": >>>> - mid = "%s@%s" % >>>> (hashlib.sha224(msg.as_bytes()).hexdigest(), lid) >>>> + mid = generators.full(msg, body, lid, attachments) >>>> elif archiver_generator == "medium": >>>> - xbody = body if type(body) is bytes else >>>> body.encode('ascii', 'ignore') >>>> - xbody += bytes(lid, encoding='ascii') >>>> - xbody += bytes(mdatestring, encoding='ascii') >>>> - mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), >>>> lid) >>>> - if attachments: >>>> - for a in attachments: >>>> - xbody += bytes(a['hash'], encoding = 'ascii') >>>> + mid = generators.medium(msg, body, lid, attachments) >>>> + elif archiver_generator == "redundant": >>>> + mid = generators.redundant(msg, body, lid, >>>> attachments) >>>> else: >>>> - # Or revert to the old way? >>>> - mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) >>>> is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid) >>>> + mid = generators.legacy(msg, body, lid, attachments) >>>> except Exception as err: >>>> if logger: >>>> logger.warn("Could not generate MID: %s" % err) >>>> >>>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/e2d81036/tools/generators.py >>>> ---------------------------------------------------------------------- >>>> diff --git a/tools/generators.py b/tools/generators.py >>>> new file mode 100644 >>>> index 0000000..af566fc >>>> --- /dev/null >>>> +++ b/tools/generators.py >>>> @@ -0,0 +1,74 @@ >>>> +#!/usr/bin/env/python3 >>>> +# -*- coding: utf-8 -*- >>>> +# Licensed to the Apache Software Foundation (ASF) under one or more >>>> +# contributor license agreements. See the NOTICE file distributed with >>>> +# this work for additional information regarding copyright ownership. 
>>>> +# The ASF licenses this file to You under the Apache License, Version 2.0 >>>> +# (the "License"); you may not use this file except in compliance with >>>> +# the License. You may obtain a copy of the License at >>>> +# >>>> +# http://www.apache.org/licenses/LICENSE-2.0 >>>> +# >>>> +# Unless required by applicable law or agreed to in writing, software >>>> +# distributed under the License is distributed on an "AS IS" BASIS, >>>> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. >>>> +# See the License for the specific language governing permissions and >>>> +# limitations under the License. >>>> + >>>> +""" >>>> +This file contains the various ID generators for Pony Mail's archivers. >>>> +""" >>>> + >>>> +import hashlib >>>> +import email.utils >>>> + >>>> +# Full generator: uses the entire email (including server-dependent data) >>>> +# This is the recommended generator for single-node setups. >>>> +def full(msg, body, lid, attachments): >>>> + mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid) >>>> + return mid >>>> + >>>> +# Medium: Standard generator >>>> +def medium(msg, body, lid, attachments): >>>> + # Use text body >>>> + xbody = body if type(body) is bytes else body.encode('ascii', >>>> 'ignore') >>>> + # Use List ID >>>> + xbody += bytes(lid, encoding='ascii') >>>> + # Use Date header >>>> + xbody += bytes(mdatestring, encoding='ascii') >>> >>> mdatestring is not defined >>> >>>> + mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid) >>>> + return mid >>>> + >>>> +# Redundant: Use data that is guaranteed to be the same across redundant >>>> setups >>>> +# This is the recommended generator for redundant cluster setups >>>> +def redundant(msg, body, lid, attachments): >>>> + # Use text body >>>> + xbody = body if type(body) is bytes else body.encode('ascii', >>>> 'ignore') >>>> + # Use List ID >>>> + xbody += bytes(lid, encoding='ascii') >>>> + # Use Date header >>>> + xbody += bytes(mdatestring, 
encoding='ascii') >>> >>> mdatestring is not defined >>> >>>> + # Use sender >>>> + sender = msg.get('from', None) >>>> + if sender: >>>> + xbody += bytes(sender, encoding = 'ascii') >>>> + # Use subject >>>> + if subject: >>>> + xbody += bytes(subject, encoding = 'ascii') >>>> + # Use attachment hashes if present >>>> + if attachments: >>>> + for a in attachments: >>>> + xbody += bytes(a['hash'], encoding = 'ascii') >>>> + mid = "r%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid) >>>> + return mid >>>> + >>>> + >>>> +# Old school way of making IDs >>>> +def legacy(msg, body, lid, attachments): >>>> + mdate = email.utils.parsedate_tz(msg.get('date')) >>>> + uid_mdate = email.utils.mktime_tz(mdate) # Only set if Date header is >>>> valid >>>> + mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else >>>> body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid) >>>> + return mid >>>> + >>>> + >>>> + >>> >>> Have the generators been tested? >>> >>
