On 15/08/2020 14.50, sebb wrote:
There was another incompatible change:

The Archiver __init__ method keyword parameter parse_html was renamed
to parseHtml.
This breaks compatibility with previous versions, making testing of
older versions much harder.
It's probably OK to add new keyword parameters, but please don't make
arbitrary changes to existing parameters.

I think it's the other way around - parseHTML was renamed to parse_html.
It wasn't completely arbitrary, it was to conform with PEP8 guidelines and generally harmonizing the way we type arguments in PM.

I can get the test suite to work around this, not a problem.
I'd much rather have a version-aware unit tester than having to stick to the same parameters for all eternity, as that may hinder future development. One example being the DKIM PR where the raw message body now needs to get passed through (which arguably is much better than relying on .as_bytes()).


On Fri, 14 Aug 2020 at 18:47, Daniel Gruno <humbed...@apache.org> wrote:

Apologies, should be in working condition again now.

On 14/08/2020 16.55, sebb wrote:
This commit broke the functioning of the --generator option for the archiver.

Please fix.

On Mon, 10 Aug 2020 at 17:13, <humbed...@apache.org> wrote:

This is an automated email from the ASF dual-hosted git repository.

humbedooh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-ponymail.git


The following commit(s) were added to refs/heads/master by this push:
       new 95beb51  Pull main operation into main(), linting/PEP conformity 
fixes
95beb51 is described below

commit 95beb51158b58bcb9fdb1371af7699b72598ac34
Author: Daniel Gruno <humbed...@apache.org>
AuthorDate: Mon Aug 10 18:13:05 2020 +0200

      Pull main operation into main(), linting/PEP conformity fixes

      Also prepping for adding ES 7.x support.
      import-mbox will need to be updated shortly
---
   tools/archiver.py | 123 
+++++++++++++++++++++++++-----------------------------
   1 file changed, 58 insertions(+), 65 deletions(-)

diff --git a/tools/archiver.py b/tools/archiver.py
index b97db76..2c50f19 100755
--- a/tools/archiver.py
+++ b/tools/archiver.py
@@ -62,23 +62,19 @@ import uuid
   import json
   import certifi
   import urllib.parse
+import argparse
+import netaddr

   # Fetch config
   config = PonymailConfig()
   auth = None
-parseHTML = False
-iBody = None  # N.B. Also used by import-mbox.py
-args=None
-dumpDir = None
-aURL = None

   if config.has_section('mailman') and config.has_option('mailman', 'plugin'):
       from zope.interface import implementer
       from mailman.interfaces.archiver import IArchiver
       from mailman.interfaces.archiver import ArchivePolicy
       logger = logging.getLogger("mailman.archiver")
-elif __name__ == '__main__':
-    import argparse
+

   if config.has_option('archiver', 'baseurl'):
       aURL = config.get('archiver', 'baseurl')
@@ -102,8 +98,7 @@ def parse_attachment(part):
           if cdtype == "attachment" or cdtype == 'inline':
               fd = part.get_payload(decode=True)
               # Allow for empty string
-            if fd is None:
-                return None, None
+            if fd is None: return None, None
               filename = part.get_filename()
               if filename:
                   print("Found attachment: %s" % filename)
@@ -160,7 +155,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py

       """ Intercept index calls and fix up consistency argument """
       def index(self, **kwargs):
-        if ES_MAJOR in [5,6]:
+        if ES_MAJOR in [5,6,7]:
               if kwargs.pop('consistency', None): # drop the key if present
                   if self.wait_for_active_shards: # replace with wait if 
defined
                       kwargs['wait_for_active_shards'] = 
self.wait_for_active_shards
@@ -168,10 +163,11 @@ class Archiver(object): # N.B. Also used by import-mbox.py
               **kwargs
           )

-    def __init__(self, parseHTML=False):
+    def __init__(self, generator='full', parse_html=False, dump_dir=None):
           """ Just initialize ES. """
-        self.html = parseHTML
-        if parseHTML:
+        self.html = parse_html
+        self.generator = generator
+        if parse_html:
               import html2text
               self.html2text = html2text.html2text
           self.dbname = config.get("elasticsearch", "dbname")
@@ -180,7 +176,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
           self.consistency = config.get('elasticsearch', 'write', 
fallback='quorum')
           if ES_MAJOR == 2:
               pass
-        elif ES_MAJOR in [5,6]:
+        elif ES_MAJOR in [5,6,7]:
               self.wait_for_active_shards = config.get('elasticsearch', 
'wait', fallback=1)
           else:
               raise Exception("Unexpected elasticsearch version ", ES_VERSION)
@@ -209,14 +205,14 @@ class Archiver(object): # N.B. Also used by import-mbox.py
               }
               )
           # If we have a dump dir, we can risk failing the connection.
-        if dumpDir:
+        if dump_dir:
               try:
                   self.es = Elasticsearch(dbs,
                       max_retries=5,
                       retry_on_timeout=True
                       )
               except:
-                print("ES connection failed, but dumponfail specified, dumping to 
%s" % dumpDir)
+                print("ES connection failed, but dumponfail specified, dumping to 
%s" % dump_dir)
           else:
               self.es = Elasticsearch(dbs,
                   max_retries=5,
@@ -233,13 +229,12 @@ class Archiver(object): # N.B. Also used by import-mbox.py
                   contents[part_meta['hash']] = part_file
           return attachments, contents

-
-    def msgbody(self, msg):
+    def msgbody(self, msg, verbose=False, ignore_body=None):
           body = None
           firstHTML = None
           for part in msg.walk():
               # can be called from importer
-            if args and args.verbose:
+            if verbose:
                   print("Content-Type: %s" % part.get_content_type())
               """
                   Find the first body part and the first HTML part
@@ -251,12 +246,12 @@ class Archiver(object): # N.B. Also used by import-mbox.py
                   if not body and part.get_content_type() == 'text/enriched':
                       body = part.get_payload(decode=True)
                   elif self.html and not firstHTML and part.get_content_type() 
== 'text/html':
-                    firstHTML = part.get_payload(decode=True)
+                    first_html = part.get_payload(decode=True)
               except Exception as err:
                   print(err)

           # this requires a GPL lib, user will have to install it themselves
-        if firstHTML and (not body or len(body) <= 1 or (iBody and 
str(body).find(str(iBody)) != -1)):
+        if firstHTML and (not body or len(body) <= 1 or (ignore_body and 
str(body).find(str(ignore_body)) != -1)):
               body = self.html2text(firstHTML.decode("utf-8", 'ignore') if 
type(firstHTML) is bytes else firstHTML)

           # See issue#463
@@ -273,7 +268,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
           return body

       # N.B. this is also called by import-mbox.py
-    def compute_updates(self, lid, private, msg):
+    def compute_updates(self, args, lid, private, msg):
           """Determine what needs to be sent to the archiver.

           :param lid: The list id
@@ -324,7 +319,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
           # mdate calculations are all done, prepare the index entry
           epoch = email.utils.mktime_tz(mdate)
           mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(epoch))
-        body = self.msgbody(msg)
+        body = self.msgbody(msg, verbose=args.verbose, ignore_body=args.ibody)
           try:
               if 'content-type' in msg_metadata and 
msg_metadata['content-type'].find("flowed") != -1:
                   body = convertToWrapped(body, character_set="utf-8")
@@ -352,8 +347,9 @@ class Archiver(object): # N.B. Also used by import-mbox.py
               except Exception as err:
                   if logger:
                       # N.B. use .get just in case there is no message-id
-                    logger.warning("Could not generate MID: %s. MSGID: %s", 
err, msg_metadata.get('message-id','?'))
+                    logger.warning("Could not generate MID: %s. MSGID: %s", 
err, msg_metadata.get('message-id', '?'))
                   mid = pmid
+
               if 'in-reply-to' in msg_metadata:
                   try:
                       try:
@@ -383,7 +379,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py

           return  ojson, contents, msg_metadata, irt

-    def archive_message(self, mlist, msg, raw_msg):
+    def archive_message(self, args, mlist, msg, raw_message):
           """Send the message to the archiver.

           :param mlist: The IMailingList object.
@@ -402,7 +398,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
           elif hasattr(mlist, 'archive_policy') and mlist.archive_policy is 
not ArchivePolicy.public:
               private = True

-        ojson, contents, msg_metadata, irt = self.compute_updates(lid, 
private, msg)
+        ojson, contents, msg_metadata, irt = self.compute_updates(args, lid, 
private, msg)
           if not ojson:
               _id = msg.get('message-id') or msg.get('Subject') or 
msg.get("Date")
               raise Exception("Could not parse message %s for %s" % (_id,lid))
@@ -438,23 +434,23 @@ class Archiver(object): # N.B. Also used by import-mbox.py
                   consistency = self.consistency,
                   body = {
                       "message-id": msg_metadata['message-id'],
-                    "source": self.mbox_source(raw_msg)
+                    "source": self.mbox_source(raw_message)
                   }
               )
           # If we have a dump dir and ES failed, push to dump dir instead as a 
JSON object
           # We'll leave it to another process to pick up the slack.
           except Exception as err:
-            if dumpDir:
+            if args.dump:
                   print("Pushing to ES failed, but dumponfail specified, dumping 
JSON docs")
                   uid = uuid.uuid4()
-                mboxPath = os.path.join(dumpDir, "%s.json" % uid)
+                mboxPath = os.path.join(args.dump, "%s.json" % uid)
                   with open(mboxPath, "w") as f:
                       json.dump({
                           'id': ojson['mid'],
                           'mbox': ojson,
                           'mbox_source': {
                               "message-id": msg_metadata['message-id'],
-                            "source": self.mbox_source(raw_msg)
+                            "source": self.mbox_source(raw_message)
                           },
                           'attachments': contents
                       },f , indent = 2)
@@ -488,8 +484,8 @@ class Archiver(object): # N.B. Also used by import-mbox.py
               if dm:
                   cid = dm.group(1)
                   mid = dm.group(2)
-                if self.es.exists(index = self.dbname, doc_type = 'account', 
id = cid):
-                    doc = self.es.get(index = self.dbname, doc_type = 
'account', id = cid)
+                if self.es.exists(index=self.dbname, doc_type='account', 
id=cid):
+                    doc = self.es.get(index=self.dbname, doc_type='account', 
id=cid)
                       if doc:
                           oldrefs.append(cid)
                           # N.B. no index is supplied, so ES will generate one
@@ -571,7 +567,8 @@ class Archiver(object): # N.B. Also used by import-mbox.py
           """
           return None

-if __name__ == '__main__':
+
+def main():
       parser = argparse.ArgumentParser(description='Command line options.')
       parser.add_argument('--lid', dest='lid', type=str, nargs=1,
                          help='Alternate specific list ID')
@@ -593,16 +590,15 @@ if __name__ == '__main__':
                          help='Try to convert HTML to text if no text/plain 
message is found')
       parser.add_argument('--dry', dest='dry', action='store_true',
                          help='Do not save emails to elasticsearch, only test 
parsing')
+    parser.add_argument('--ignorebody', dest='ibody', type=str, nargs=1,
+                        help='Optional email bodies to treat as empty (in 
conjunction with --html2text)')
       parser.add_argument('--dumponfail', dest='dump',
-                       help='If pushing to ElasticSearch fails, dump documents 
in JSON format to this directory and fail silently.')
+                       help='If pushing to ElasticSearch fails, dump documents 
in JSON format to this directory and '
+                            'fail silently.')
       parser.add_argument('--generator', dest='generator',
                          help='Override the generator.')
       args = parser.parse_args()

-    if args.html2text:
-        parseHTML = True
-    if args.dump:
-        dumpDir = args.dump
       if args.verbose:
           logging.basicConfig(stream=sys.stdout, level=logging.INFO)
       else:
@@ -610,39 +606,34 @@ if __name__ == '__main__':
           # Also eliminates: 'Undecodable raw error response from server:' 
warning message
           logging.getLogger("elasticsearch").setLevel(logging.ERROR)

-    if args.generator:
-        archiver_generator = args.generator
-
-    archie = Archiver(parseHTML = parseHTML)
+    archie = Archiver(generator=args.generator or archiver_generator, 
parse_html=args.html2text)
       # use binary input so parser can use appropriate charset
       input_stream = sys.stdin.buffer

       try:
-        msgstring = input_stream.read()
+        raw_message = input_stream.read()
           try:
-            msg = email.message_from_bytes(msgstring)
+            msg = email.message_from_bytes(raw_message)
           except Exception as err:
               print("STDIN parser exception: %s" % err)

           # We're reading from STDIN, so let's fake an MM3 call
           ispublic = True
-        ignorefrom = None
-        allowfrom = None

           if args.altheader:
-            altheader = args.altheader[0]
-            if altheader in msg:
+            alt_header = args.altheader[0]
+            if alt_header in msg:
                   try:
-                    msg.replace_header('List-ID', msg.get(altheader))
+                    msg.replace_header('List-ID', msg.get(alt_header))
                   except:
-                    msg.add_header('list-id', msg.get(altheader))
+                    msg.add_header('list-id', msg.get(alt_header))
           elif 'altheader' in sys.argv:
-            altheader = sys.argv[len(sys.argv)-1]
-            if altheader in msg:
+            alt_header = sys.argv[len(sys.argv)-1]
+            if alt_header in msg:
                   try:
-                    msg.replace_header('List-ID', msg.get(altheader))
+                    msg.replace_header('List-ID', msg.get(alt_header))
                   except:
-                    msg.add_header('list-id', msg.get(altheader))
+                    msg.add_header('list-id', msg.get(alt_header))

           # Set specific LID?
           if args.lid and len(args.lid[0]) > 3:
@@ -654,21 +645,21 @@ if __name__ == '__main__':

           #Ignore based on --ignore flag?
           if args.ignorefrom:
-            ignorefrom = args.ignorefrom[0]
-            if fnmatch.fnmatch(msg.get("from"), ignorefrom) or (msg.get("list-id") and 
fnmatch.fnmatch(msg.get("list-id"), ignorefrom)):
+            ignore_from = args.ignorefrom[0]
+            if fnmatch.fnmatch(msg.get("from"), ignore_from) or (msg.get("list-id") and 
fnmatch.fnmatch(msg.get("list-id"), ignore_from)):
                   print("Ignoring message as instructed by --ignore flag")
                   sys.exit(0)

           # Check CIDR if need be
           if args.allowfrom:
-            from netaddr import IPNetwork, IPAddress
-            c = IPNetwork(args.allowfrom[0])
+
+            c = netaddr.IPNetwork(args.allowfrom[0])
               good = False
               for line in msg.get_all('received') or []:
                   m = re.search(r"from .+\[(.+)\]", line)
                   if m:
                       try:
-                        ip = IPAddress(m.group(1))
+                        ip = netaddr.IPAddress(m.group(1))
                           if ip in c:
                               good = True
                               msg.add_header("ip-whitelisted", "yes")
@@ -681,19 +672,18 @@ if __name__ == '__main__':
           # Replace date header with $now?
           if args.makedate:
               msg.replace_header('date', email.utils.formatdate())
-
+        is_public = True
           if args.private:
-            ispublic = False
+            is_public = False
           if 'list-id' in msg:
               if not msg.get('archived-at'):
                   msg.add_header('archived-at', email.utils.formatdate())
-            list_data = namedtuple('importmsg', ['list_id', 
'archive_public'])(list_id = msg.get('list-id'), archive_public=ispublic)
+            list_data = namedtuple('importmsg', ['list_id', 
'archive_public'])(list_id=msg.get('list-id'),
+                                                                               
archive_public=is_public)

               try:
-                lid, mid = archie.archive_message(list_data, msg, msgstring)
+                lid, mid = archie.archive_message(args, list_data, msg, 
raw_message)
                   print("%s: Done archiving to %s as %s!" % 
(email.utils.formatdate(), lid, mid))
-                if aURL:
-                    print("Published as: %s/thread.html/%s" % (aURL, 
urllib.parse.quote(mid)))
               except Exception as err:
                   if args.verbose:
                       traceback.print_exc()
@@ -711,3 +701,6 @@ if __name__ == '__main__':
               print("Could not parse email: %s (@ %s)" % (err, line))
               sys.exit(-1)

+
+if __name__ == '__main__':
+    main()



Reply via email to