On 5 June 2017 at 10:04, Daniel Gruno <[email protected]> wrote:
> On 06/05/2017 11:01 AM, sebb wrote:
>> On 5 June 2017 at 09:17, Daniel Gruno <[email protected]> wrote:
>>> I missed a git add in the last commit, sorry. Will add and recommit now.
>>
>> Have you tested that the change is complete?
>>
>> I'm still getting an error.
>
> Tested it with a bunch of mbox files, some with, some without headers,
> subjects, senders etc. All seemed to work.
>
> What is the specific error you are getting, and which generator are you
> using?
I am using the medium generator. NameError: name 'msg_metadata' is not defined >> >>> On 06/05/2017 01:57 AM, sebb wrote: >>>> On 4 June 2017 at 15:02, <[email protected]> wrote: >>>>> split generators into a file of its own >>>>> >>>>> Also fix up generators: >>>>> - medium goes back to the way it was >>>>> - a new 'redundant' generator for cluster setups >>>>> >>>>> >>>>> Project: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/repo >>>>> Commit: >>>>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/commit/e2d81036 >>>>> Tree: >>>>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/tree/e2d81036 >>>>> Diff: >>>>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/diff/e2d81036 >>>>> >>>>> Branch: refs/heads/master >>>>> Commit: e2d8103635db012d13fc6af46d336c96be31d4c1 >>>>> Parents: 8b7ede8 >>>>> Author: Daniel Gruno <[email protected]> >>>>> Authored: Sun Jun 4 15:45:18 2017 +0200 >>>>> Committer: Daniel Gruno <[email protected]> >>>>> Committed: Sun Jun 4 15:45:18 2017 +0200 >>>>> >>>>> ---------------------------------------------------------------------- >>>>> tools/archiver.py | 17 ++++------- >>>>> tools/generators.py | 74 ++++++++++++++++++++++++++++++++++++++++++++++++ >>>>> 2 files changed, 80 insertions(+), 11 deletions(-) >>>>> ---------------------------------------------------------------------- >>>>> >>>>> >>>>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/e2d81036/tools/archiver.py >>>>> ---------------------------------------------------------------------- >>>>> diff --git a/tools/archiver.py b/tools/archiver.py >>>>> index 41933f7..0966b13 100755 >>>>> --- a/tools/archiver.py >>>>> +++ b/tools/archiver.py >>>>> @@ -58,6 +58,7 @@ import io >>>>> import logging >>>>> import traceback >>>>> import sys >>>>> +import generators >>>>> >>>>> # Fetch config >>>>> path = os.path.dirname(os.path.realpath(__file__)) >>>>> @@ -316,20 +317,14 @@ class Archiver(object): >>>>> if body is not None or 
attachments: >>>>> pmid = mid >>>>> try: >>>>> - # Use full message as bytes for mid? >>>>> if archiver_generator == "full": >>>>> - mid = "%s@%s" % >>>>> (hashlib.sha224(msg.as_bytes()).hexdigest(), lid) >>>>> + mid = generators.full(msg, body, lid, attachments) >>>>> elif archiver_generator == "medium": >>>>> - xbody = body if type(body) is bytes else >>>>> body.encode('ascii', 'ignore') >>>>> - xbody += bytes(lid, encoding='ascii') >>>>> - xbody += bytes(mdatestring, encoding='ascii') >>>>> - mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), >>>>> lid) >>>>> - if attachments: >>>>> - for a in attachments: >>>>> - xbody += bytes(a['hash'], encoding = 'ascii') >>>>> + mid = generators.medium(msg, body, lid, attachments) >>>>> + elif archiver_generator == "redundant": >>>>> + mid = generators.redundant(msg, body, lid, >>>>> attachments) >>>>> else: >>>>> - # Or revert to the old way? >>>>> - mid = "%s@%s@%s" % (hashlib.sha224(body if >>>>> type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), >>>>> uid_mdate, lid) >>>>> + mid = generators.legacy(msg, body, lid, attachments) >>>>> except Exception as err: >>>>> if logger: >>>>> logger.warn("Could not generate MID: %s" % err) >>>>> >>>>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/e2d81036/tools/generators.py >>>>> ---------------------------------------------------------------------- >>>>> diff --git a/tools/generators.py b/tools/generators.py >>>>> new file mode 100644 >>>>> index 0000000..af566fc >>>>> --- /dev/null >>>>> +++ b/tools/generators.py >>>>> @@ -0,0 +1,74 @@ >>>>> +#!/usr/bin/env/python3 >>>>> +# -*- coding: utf-8 -*- >>>>> +# Licensed to the Apache Software Foundation (ASF) under one or more >>>>> +# contributor license agreements. See the NOTICE file distributed with >>>>> +# this work for additional information regarding copyright ownership. 
>>>>> +# The ASF licenses this file to You under the Apache License, Version 2.0 >>>>> +# (the "License"); you may not use this file except in compliance with >>>>> +# the License. You may obtain a copy of the License at >>>>> +# >>>>> +# http://www.apache.org/licenses/LICENSE-2.0 >>>>> +# >>>>> +# Unless required by applicable law or agreed to in writing, software >>>>> +# distributed under the License is distributed on an "AS IS" BASIS, >>>>> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or >>>>> implied. >>>>> +# See the License for the specific language governing permissions and >>>>> +# limitations under the License. >>>>> + >>>>> +""" >>>>> +This file contains the various ID generators for Pony Mail's archivers. >>>>> +""" >>>>> + >>>>> +import hashlib >>>>> +import email.utils >>>>> + >>>>> +# Full generator: uses the entire email (including sever-depenent data) >>>>> +# This is the recommended generator for single-node setups. >>>>> +def full(msg, body, lid, attachments): >>>>> + mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid) >>>>> + return mid >>>>> + >>>>> +# Medium: Standard generator >>>>> +def medium(msg, body, lid, attachments): >>>>> + # Use text body >>>>> + xbody = body if type(body) is bytes else body.encode('ascii', >>>>> 'ignore') >>>>> + # Use List ID >>>>> + xbody += bytes(lid, encoding='ascii') >>>>> + # Use Date header >>>>> + xbody += bytes(mdatestring, encoding='ascii') >>>> >>>> mdatestring is not defined >>>> >>>>> + mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid) >>>>> + return mid >>>>> + >>>>> +# Redundant: Use data that is guaranteed to be the same across redundant >>>>> setups >>>>> +# This is the recommended generator for redundant cluster setups >>>>> +def redundant(msg, body, lid, attachments): >>>>> + # Use text body >>>>> + xbody = body if type(body) is bytes else body.encode('ascii', >>>>> 'ignore') >>>>> + # Use List ID >>>>> + xbody += bytes(lid, encoding='ascii') >>>>> + # Use 
Date header >>>>> + xbody += bytes(mdatestring, encoding='ascii') >>>> >>>> mdatestring is not defined >>>> >>>>> + # Use sender >>>>> + sender = msg.get('from', None) >>>>> + if sender: >>>>> + xbody += bytes(sender, encoding = 'ascii') >>>>> + # Use subject >>>>> + if subject: >>>>> + xbody += bytes(subject, encoding = 'ascii') >>>>> + # Use attachment hashes if present >>>>> + if attachments: >>>>> + for a in attachments: >>>>> + xbody += bytes(a['hash'], encoding = 'ascii') >>>>> + mid = "r%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid) >>>>> + return mid >>>>> + >>>>> + >>>>> +# Old school way of making IDs >>>>> +def legacy(msg, body, lid, attachments): >>>>> + mdate = email.utils.parsedate_tz(msg.get('date')) >>>>> + uid_mdate = email.utils.mktime_tz(mdate) # Only set if Date header >>>>> is valid >>>>> + mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else >>>>> body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid) >>>>> + return mid >>>>> + >>>>> + >>>>> + >>>> >>>> Have the generators been tested? >>>> >>> >
