On 06/05/2017 11:01 AM, sebb wrote:
> On 5 June 2017 at 09:17, Daniel Gruno <[email protected]> wrote:
>> I missed a git add in the last commit, sorry. Will add and recommit now.
> 
> Have you tested that the change is complete?
> 
> I'm still getting an error.

Tested it with a bunch of mbox files, some with, some without headers,
subjects, senders etc. All seemed to work.

What is the specific error you are getting, and which generator are you
using?

> 
>> On 06/05/2017 01:57 AM, sebb wrote:
>>> On 4 June 2017 at 15:02,  <[email protected]> wrote:
>>>> split generators into a file of its own
>>>>
>>>> Also fix up generators:
>>>> - medium goes back to the way it was
>>>> - a new 'redundant' generator for cluster setups
>>>>
>>>>
>>>> Project: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/repo
>>>> Commit: 
>>>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/commit/e2d81036
>>>> Tree: 
>>>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/tree/e2d81036
>>>> Diff: 
>>>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/diff/e2d81036
>>>>
>>>> Branch: refs/heads/master
>>>> Commit: e2d8103635db012d13fc6af46d336c96be31d4c1
>>>> Parents: 8b7ede8
>>>> Author: Daniel Gruno <[email protected]>
>>>> Authored: Sun Jun 4 15:45:18 2017 +0200
>>>> Committer: Daniel Gruno <[email protected]>
>>>> Committed: Sun Jun 4 15:45:18 2017 +0200
>>>>
>>>> ----------------------------------------------------------------------
>>>>  tools/archiver.py   | 17 ++++-------
>>>>  tools/generators.py | 74 ++++++++++++++++++++++++++++++++++++++++++++++++
>>>>  2 files changed, 80 insertions(+), 11 deletions(-)
>>>> ----------------------------------------------------------------------
>>>>
>>>>
>>>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/e2d81036/tools/archiver.py
>>>> ----------------------------------------------------------------------
>>>> diff --git a/tools/archiver.py b/tools/archiver.py
>>>> index 41933f7..0966b13 100755
>>>> --- a/tools/archiver.py
>>>> +++ b/tools/archiver.py
>>>> @@ -58,6 +58,7 @@ import io
>>>>  import logging
>>>>  import traceback
>>>>  import sys
>>>> +import generators
>>>>
>>>>  # Fetch config
>>>>  path = os.path.dirname(os.path.realpath(__file__))
>>>> @@ -316,20 +317,14 @@ class Archiver(object):
>>>>          if body is not None or attachments:
>>>>              pmid = mid
>>>>              try:
>>>> -                # Use full message as bytes for mid?
>>>>                  if archiver_generator == "full":
>>>> -                    mid = "%s@%s" % 
>>>> (hashlib.sha224(msg.as_bytes()).hexdigest(), lid)
>>>> +                    mid = generators.full(msg, body, lid, attachments)
>>>>                  elif archiver_generator == "medium":
>>>> -                    xbody = body if type(body) is bytes else 
>>>> body.encode('ascii', 'ignore')
>>>> -                    xbody += bytes(lid, encoding='ascii')
>>>> -                    xbody += bytes(mdatestring, encoding='ascii')
>>>> -                    mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), 
>>>> lid)
>>>> -                    if attachments:
>>>> -                        for a in attachments:
>>>> -                            xbody += bytes(a['hash'], encoding = 'ascii')
>>>> +                    mid = generators.medium(msg, body, lid, attachments)
>>>> +                elif archiver_generator == "redundant":
>>>> +                    mid = generators.redundant(msg, body, lid, 
>>>> attachments)
>>>>                  else:
>>>> -                    # Or revert to the old way?
>>>> -                    mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) 
>>>> is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
>>>> +                    mid = generators.legacy(msg, body, lid, attachments)
>>>>              except Exception as err:
>>>>                  if logger:
>>>>                      logger.warn("Could not generate MID: %s" % err)
>>>>
>>>> http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/e2d81036/tools/generators.py
>>>> ----------------------------------------------------------------------
>>>> diff --git a/tools/generators.py b/tools/generators.py
>>>> new file mode 100644
>>>> index 0000000..af566fc
>>>> --- /dev/null
>>>> +++ b/tools/generators.py
>>>> @@ -0,0 +1,74 @@
>>>> +#!/usr/bin/env/python3
>>>> +# -*- coding: utf-8 -*-
>>>> +# Licensed to the Apache Software Foundation (ASF) under one or more
>>>> +# contributor license agreements.  See the NOTICE file distributed with
>>>> +# this work for additional information regarding copyright ownership.
>>>> +# The ASF licenses this file to You under the Apache License, Version 2.0
>>>> +# (the "License"); you may not use this file except in compliance with
>>>> +# the License.  You may obtain a copy of the License at
>>>> +#
>>>> +#     http://www.apache.org/licenses/LICENSE-2.0
>>>> +#
>>>> +# Unless required by applicable law or agreed to in writing, software
>>>> +# distributed under the License is distributed on an "AS IS" BASIS,
>>>> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>>>> +# See the License for the specific language governing permissions and
>>>> +# limitations under the License.
>>>> +
>>>> +"""
>>>> +This file contains the various ID generators for Pony Mail's archivers.
>>>> +"""
>>>> +
>>>> +import hashlib
>>>> +import email.utils
>>>> +
>>>> +# Full generator: uses the entire email (including server-dependent data)
>>>> +# This is the recommended generator for single-node setups.
>>>> +def full(msg, body, lid, attachments):
>>>> +    mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid)
>>>> +    return mid
>>>> +
>>>> +# Medium: Standard generator
>>>> +def medium(msg, body, lid, attachments):
>>>> +    # Use text body
>>>> +    xbody = body if type(body) is bytes else body.encode('ascii', 
>>>> 'ignore')
>>>> +    # Use List ID
>>>> +    xbody += bytes(lid, encoding='ascii')
>>>> +    # Use Date header
>>>> +    xbody += bytes(mdatestring, encoding='ascii')
>>>
>>> mdatestring is not defined
>>>
>>>> +    mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
>>>> +    return mid
>>>> +
>>>> +# Redundant: Use data that is guaranteed to be the same across redundant 
>>>> setups
>>>> +# This is the recommended generator for redundant cluster setups
>>>> +def redundant(msg, body, lid, attachments):
>>>> +    # Use text body
>>>> +    xbody = body if type(body) is bytes else body.encode('ascii', 
>>>> 'ignore')
>>>> +    # Use List ID
>>>> +    xbody += bytes(lid, encoding='ascii')
>>>> +    # Use Date header
>>>> +    xbody += bytes(mdatestring, encoding='ascii')
>>>
>>> mdatestring is not defined
>>>
>>>> +    # Use sender
>>>> +    sender = msg.get('from', None)
>>>> +    if sender:
>>>> +        xbody += bytes(sender, encoding = 'ascii')
>>>> +    # Use subject
>>>> +    if subject:
>>>> +        xbody += bytes(subject, encoding = 'ascii')
>>>> +    # Use attachment hashes if present
>>>> +    if attachments:
>>>> +        for a in attachments:
>>>> +            xbody += bytes(a['hash'], encoding = 'ascii')
>>>> +    mid = "r%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
>>>> +    return mid
>>>> +
>>>> +
>>>> +# Old school way of making IDs
>>>> +def legacy(msg, body, lid, attachments):
>>>> +    mdate = email.utils.parsedate_tz(msg.get('date'))
>>>> +    uid_mdate = email.utils.mktime_tz(mdate) # Only set if Date header is 
>>>> valid
>>>> +    mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else 
>>>> body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
>>>> +    return mid
>>>> +
>>>> +
>>>> +
>>>
>>> Have the generators been tested?
>>>
>>

Reply via email to