This commit broke the --generator option for the archiver.

Two concrete problems are visible in the diff below:

1. The old guarded assignment `if args.generator: archiver_generator = args.generator`
   was removed, and main() now evaluates `args.generator or archiver_generator`
   unconditionally. The definition of `archiver_generator` is not shown in this
   diff, so when --generator is not supplied (or the module-level default is no
   longer set where main() can see it) this presumably raises a NameError — and
   even when it succeeds, the resulting `self.generator` set in __init__ does not
   appear to be consulted where the generator is actually applied. Please verify
   against the full file.

2. A second regression in the same commit: in msgbody(), the assignment
   `firstHTML = part.get_payload(decode=True)` was renamed to `first_html`, but
   every other reference still reads `firstHTML`. As a result firstHTML is always
   None and HTML-only messages silently lose their body.

Please fix.

On Mon, 10 Aug 2020 at 17:13, <humbed...@apache.org> wrote:
>
> This is an automated email from the ASF dual-hosted git repository.
>
> humbedooh pushed a commit to branch master
> in repository https://gitbox.apache.org/repos/asf/incubator-ponymail.git
>
>
> The following commit(s) were added to refs/heads/master by this push:
>      new 95beb51  Pull main operation into main(), linting/PEP conformity 
> fixes
> 95beb51 is described below
>
> commit 95beb51158b58bcb9fdb1371af7699b72598ac34
> Author: Daniel Gruno <humbed...@apache.org>
> AuthorDate: Mon Aug 10 18:13:05 2020 +0200
>
>     Pull main operation into main(), linting/PEP conformity fixes
>
>     Also prepping for adding ES 7.x support.
>     import-mbox will need to be updated shortly
> ---
>  tools/archiver.py | 123 
> +++++++++++++++++++++++++-----------------------------
>  1 file changed, 58 insertions(+), 65 deletions(-)
>
> diff --git a/tools/archiver.py b/tools/archiver.py
> index b97db76..2c50f19 100755
> --- a/tools/archiver.py
> +++ b/tools/archiver.py
> @@ -62,23 +62,19 @@ import uuid
>  import json
>  import certifi
>  import urllib.parse
> +import argparse
> +import netaddr
>
>  # Fetch config
>  config = PonymailConfig()
>  auth = None
> -parseHTML = False
> -iBody = None  # N.B. Also used by import-mbox.py
> -args=None
> -dumpDir = None
> -aURL = None
>
>  if config.has_section('mailman') and config.has_option('mailman', 'plugin'):
>      from zope.interface import implementer
>      from mailman.interfaces.archiver import IArchiver
>      from mailman.interfaces.archiver import ArchivePolicy
>      logger = logging.getLogger("mailman.archiver")
> -elif __name__ == '__main__':
> -    import argparse
> +
>
>  if config.has_option('archiver', 'baseurl'):
>      aURL = config.get('archiver', 'baseurl')
> @@ -102,8 +98,7 @@ def parse_attachment(part):
>          if cdtype == "attachment" or cdtype == 'inline':
>              fd = part.get_payload(decode=True)
>              # Allow for empty string
> -            if fd is None:
> -                return None, None
> +            if fd is None: return None, None
>              filename = part.get_filename()
>              if filename:
>                  print("Found attachment: %s" % filename)
> @@ -160,7 +155,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>
>      """ Intercept index calls and fix up consistency argument """
>      def index(self, **kwargs):
> -        if ES_MAJOR in [5,6]:
> +        if ES_MAJOR in [5,6,7]:
>              if kwargs.pop('consistency', None): # drop the key if present
>                  if self.wait_for_active_shards: # replace with wait if 
> defined
>                      kwargs['wait_for_active_shards'] = 
> self.wait_for_active_shards
> @@ -168,10 +163,11 @@ class Archiver(object): # N.B. Also used by 
> import-mbox.py
>              **kwargs
>          )
>
> -    def __init__(self, parseHTML=False):
> +    def __init__(self, generator='full', parse_html=False, dump_dir=None):
>          """ Just initialize ES. """
> -        self.html = parseHTML
> -        if parseHTML:
> +        self.html = parse_html
> +        self.generator = generator
> +        if parse_html:
>              import html2text
>              self.html2text = html2text.html2text
>          self.dbname = config.get("elasticsearch", "dbname")
> @@ -180,7 +176,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>          self.consistency = config.get('elasticsearch', 'write', 
> fallback='quorum')
>          if ES_MAJOR == 2:
>              pass
> -        elif ES_MAJOR in [5,6]:
> +        elif ES_MAJOR in [5,6,7]:
>              self.wait_for_active_shards = config.get('elasticsearch', 
> 'wait', fallback=1)
>          else:
>              raise Exception("Unexpected elasticsearch version ", ES_VERSION)
> @@ -209,14 +205,14 @@ class Archiver(object): # N.B. Also used by 
> import-mbox.py
>              }
>              )
>          # If we have a dump dir, we can risk failing the connection.
> -        if dumpDir:
> +        if dump_dir:
>              try:
>                  self.es = Elasticsearch(dbs,
>                      max_retries=5,
>                      retry_on_timeout=True
>                      )
>              except:
> -                print("ES connection failed, but dumponfail specified, 
> dumping to %s" % dumpDir)
> +                print("ES connection failed, but dumponfail specified, 
> dumping to %s" % dump_dir)
>          else:
>              self.es = Elasticsearch(dbs,
>                  max_retries=5,
> @@ -233,13 +229,12 @@ class Archiver(object): # N.B. Also used by 
> import-mbox.py
>                  contents[part_meta['hash']] = part_file
>          return attachments, contents
>
> -
> -    def msgbody(self, msg):
> +    def msgbody(self, msg, verbose=False, ignore_body=None):
>          body = None
>          firstHTML = None
>          for part in msg.walk():
>              # can be called from importer
> -            if args and args.verbose:
> +            if verbose:
>                  print("Content-Type: %s" % part.get_content_type())
>              """
>                  Find the first body part and the first HTML part
> @@ -251,12 +246,12 @@ class Archiver(object): # N.B. Also used by 
> import-mbox.py
>                  if not body and part.get_content_type() == 'text/enriched':
>                      body = part.get_payload(decode=True)
>                  elif self.html and not firstHTML and part.get_content_type() 
> == 'text/html':
> -                    firstHTML = part.get_payload(decode=True)
> +                    first_html = part.get_payload(decode=True)
>              except Exception as err:
>                  print(err)
>
>          # this requires a GPL lib, user will have to install it themselves
> -        if firstHTML and (not body or len(body) <= 1 or (iBody and 
> str(body).find(str(iBody)) != -1)):
> +        if firstHTML and (not body or len(body) <= 1 or (ignore_body and 
> str(body).find(str(ignore_body)) != -1)):
>              body = self.html2text(firstHTML.decode("utf-8", 'ignore') if 
> type(firstHTML) is bytes else firstHTML)
>
>          # See issue#463
> @@ -273,7 +268,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>          return body
>
>      # N.B. this is also called by import-mbox.py
> -    def compute_updates(self, lid, private, msg):
> +    def compute_updates(self, args, lid, private, msg):
>          """Determine what needs to be sent to the archiver.
>
>          :param lid: The list id
> @@ -324,7 +319,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>          # mdate calculations are all done, prepare the index entry
>          epoch = email.utils.mktime_tz(mdate)
>          mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(epoch))
> -        body = self.msgbody(msg)
> +        body = self.msgbody(msg, verbose=args.verbose, 
> ignore_body=args.ibody)
>          try:
>              if 'content-type' in msg_metadata and 
> msg_metadata['content-type'].find("flowed") != -1:
>                  body = convertToWrapped(body, character_set="utf-8")
> @@ -352,8 +347,9 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>              except Exception as err:
>                  if logger:
>                      # N.B. use .get just in case there is no message-id
> -                    logger.warning("Could not generate MID: %s. MSGID: %s", 
> err, msg_metadata.get('message-id','?'))
> +                    logger.warning("Could not generate MID: %s. MSGID: %s", 
> err, msg_metadata.get('message-id', '?'))
>                  mid = pmid
> +
>              if 'in-reply-to' in msg_metadata:
>                  try:
>                      try:
> @@ -383,7 +379,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>
>          return  ojson, contents, msg_metadata, irt
>
> -    def archive_message(self, mlist, msg, raw_msg):
> +    def archive_message(self, args, mlist, msg, raw_message):
>          """Send the message to the archiver.
>
>          :param mlist: The IMailingList object.
> @@ -402,7 +398,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>          elif hasattr(mlist, 'archive_policy') and mlist.archive_policy is 
> not ArchivePolicy.public:
>              private = True
>
> -        ojson, contents, msg_metadata, irt = self.compute_updates(lid, 
> private, msg)
> +        ojson, contents, msg_metadata, irt = self.compute_updates(args, lid, 
> private, msg)
>          if not ojson:
>              _id = msg.get('message-id') or msg.get('Subject') or 
> msg.get("Date")
>              raise Exception("Could not parse message %s for %s" % (_id,lid))
> @@ -438,23 +434,23 @@ class Archiver(object): # N.B. Also used by 
> import-mbox.py
>                  consistency = self.consistency,
>                  body = {
>                      "message-id": msg_metadata['message-id'],
> -                    "source": self.mbox_source(raw_msg)
> +                    "source": self.mbox_source(raw_message)
>                  }
>              )
>          # If we have a dump dir and ES failed, push to dump dir instead as a 
> JSON object
>          # We'll leave it to another process to pick up the slack.
>          except Exception as err:
> -            if dumpDir:
> +            if args.dump:
>                  print("Pushing to ES failed, but dumponfail specified, 
> dumping JSON docs")
>                  uid = uuid.uuid4()
> -                mboxPath = os.path.join(dumpDir, "%s.json" % uid)
> +                mboxPath = os.path.join(args.dump, "%s.json" % uid)
>                  with open(mboxPath, "w") as f:
>                      json.dump({
>                          'id': ojson['mid'],
>                          'mbox': ojson,
>                          'mbox_source': {
>                              "message-id": msg_metadata['message-id'],
> -                            "source": self.mbox_source(raw_msg)
> +                            "source": self.mbox_source(raw_message)
>                          },
>                          'attachments': contents
>                      },f , indent = 2)
> @@ -488,8 +484,8 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>              if dm:
>                  cid = dm.group(1)
>                  mid = dm.group(2)
> -                if self.es.exists(index = self.dbname, doc_type = 'account', 
> id = cid):
> -                    doc = self.es.get(index = self.dbname, doc_type = 
> 'account', id = cid)
> +                if self.es.exists(index=self.dbname, doc_type='account', 
> id=cid):
> +                    doc = self.es.get(index=self.dbname, doc_type='account', 
> id=cid)
>                      if doc:
>                          oldrefs.append(cid)
>                          # N.B. no index is supplied, so ES will generate one
> @@ -571,7 +567,8 @@ class Archiver(object): # N.B. Also used by import-mbox.py
>          """
>          return None
>
> -if __name__ == '__main__':
> +
> +def main():
>      parser = argparse.ArgumentParser(description='Command line options.')
>      parser.add_argument('--lid', dest='lid', type=str, nargs=1,
>                         help='Alternate specific list ID')
> @@ -593,16 +590,15 @@ if __name__ == '__main__':
>                         help='Try to convert HTML to text if no text/plain 
> message is found')
>      parser.add_argument('--dry', dest='dry', action='store_true',
>                         help='Do not save emails to elasticsearch, only test 
> parsing')
> +    parser.add_argument('--ignorebody', dest='ibody', type=str, nargs=1,
> +                        help='Optional email bodies to treat as empty (in 
> conjunction with --html2text)')
>      parser.add_argument('--dumponfail', dest='dump',
> -                       help='If pushing to ElasticSearch fails, dump 
> documents in JSON format to this directory and fail silently.')
> +                       help='If pushing to ElasticSearch fails, dump 
> documents in JSON format to this directory and '
> +                            'fail silently.')
>      parser.add_argument('--generator', dest='generator',
>                         help='Override the generator.')
>      args = parser.parse_args()
>
> -    if args.html2text:
> -        parseHTML = True
> -    if args.dump:
> -        dumpDir = args.dump
>      if args.verbose:
>          logging.basicConfig(stream=sys.stdout, level=logging.INFO)
>      else:
> @@ -610,39 +606,34 @@ if __name__ == '__main__':
>          # Also eliminates: 'Undecodable raw error response from server:' 
> warning message
>          logging.getLogger("elasticsearch").setLevel(logging.ERROR)
>
> -    if args.generator:
> -        archiver_generator = args.generator
> -
> -    archie = Archiver(parseHTML = parseHTML)
> +    archie = Archiver(generator=args.generator or archiver_generator, 
> parse_html=args.html2text)
>      # use binary input so parser can use appropriate charset
>      input_stream = sys.stdin.buffer
>
>      try:
> -        msgstring = input_stream.read()
> +        raw_message = input_stream.read()
>          try:
> -            msg = email.message_from_bytes(msgstring)
> +            msg = email.message_from_bytes(raw_message)
>          except Exception as err:
>              print("STDIN parser exception: %s" % err)
>
>          # We're reading from STDIN, so let's fake an MM3 call
>          ispublic = True
> -        ignorefrom = None
> -        allowfrom = None
>
>          if args.altheader:
> -            altheader = args.altheader[0]
> -            if altheader in msg:
> +            alt_header = args.altheader[0]
> +            if alt_header in msg:
>                  try:
> -                    msg.replace_header('List-ID', msg.get(altheader))
> +                    msg.replace_header('List-ID', msg.get(alt_header))
>                  except:
> -                    msg.add_header('list-id', msg.get(altheader))
> +                    msg.add_header('list-id', msg.get(alt_header))
>          elif 'altheader' in sys.argv:
> -            altheader = sys.argv[len(sys.argv)-1]
> -            if altheader in msg:
> +            alt_header = sys.argv[len(sys.argv)-1]
> +            if alt_header in msg:
>                  try:
> -                    msg.replace_header('List-ID', msg.get(altheader))
> +                    msg.replace_header('List-ID', msg.get(alt_header))
>                  except:
> -                    msg.add_header('list-id', msg.get(altheader))
> +                    msg.add_header('list-id', msg.get(alt_header))
>
>          # Set specific LID?
>          if args.lid and len(args.lid[0]) > 3:
> @@ -654,21 +645,21 @@ if __name__ == '__main__':
>
>          #Ignore based on --ignore flag?
>          if args.ignorefrom:
> -            ignorefrom = args.ignorefrom[0]
> -            if fnmatch.fnmatch(msg.get("from"), ignorefrom) or 
> (msg.get("list-id") and fnmatch.fnmatch(msg.get("list-id"), ignorefrom)):
> +            ignore_from = args.ignorefrom[0]
> +            if fnmatch.fnmatch(msg.get("from"), ignore_from) or 
> (msg.get("list-id") and fnmatch.fnmatch(msg.get("list-id"), ignore_from)):
>                  print("Ignoring message as instructed by --ignore flag")
>                  sys.exit(0)
>
>          # Check CIDR if need be
>          if args.allowfrom:
> -            from netaddr import IPNetwork, IPAddress
> -            c = IPNetwork(args.allowfrom[0])
> +
> +            c = netaddr.IPNetwork(args.allowfrom[0])
>              good = False
>              for line in msg.get_all('received') or []:
>                  m = re.search(r"from .+\[(.+)\]", line)
>                  if m:
>                      try:
> -                        ip = IPAddress(m.group(1))
> +                        ip = netaddr.IPAddress(m.group(1))
>                          if ip in c:
>                              good = True
>                              msg.add_header("ip-whitelisted", "yes")
> @@ -681,19 +672,18 @@ if __name__ == '__main__':
>          # Replace date header with $now?
>          if args.makedate:
>              msg.replace_header('date', email.utils.formatdate())
> -
> +        is_public = True
>          if args.private:
> -            ispublic = False
> +            is_public = False
>          if 'list-id' in msg:
>              if not msg.get('archived-at'):
>                  msg.add_header('archived-at', email.utils.formatdate())
> -            list_data = namedtuple('importmsg', ['list_id', 
> 'archive_public'])(list_id = msg.get('list-id'), archive_public=ispublic)
> +            list_data = namedtuple('importmsg', ['list_id', 
> 'archive_public'])(list_id=msg.get('list-id'),
> +                                                                             
>   archive_public=is_public)
>
>              try:
> -                lid, mid = archie.archive_message(list_data, msg, msgstring)
> +                lid, mid = archie.archive_message(args, list_data, msg, 
> raw_message)
>                  print("%s: Done archiving to %s as %s!" % 
> (email.utils.formatdate(), lid, mid))
> -                if aURL:
> -                    print("Published as: %s/thread.html/%s" % (aURL, 
> urllib.parse.quote(mid)))
>              except Exception as err:
>                  if args.verbose:
>                      traceback.print_exc()
> @@ -711,3 +701,6 @@ if __name__ == '__main__':
>              print("Could not parse email: %s (@ %s)" % (err, line))
>              sys.exit(-1)
>
> +
> +if __name__ == '__main__':
> +    main()
>

Reply via email to