On Sat, 5 Sep 2020 at 00:44, <[email protected]> wrote:
>
> This is an automated email from the ASF dual-hosted git repository.
>
> humbedooh pushed a commit to branch master
> in repository https://gitbox.apache.org/repos/asf/incubator-ponymail-foal.git
>
> commit f314d5250999e2afb2ab5063d35afe7d1c1114fa
> Author: Daniel Gruno <[email protected]>
> AuthorDate: Sat Sep 5 01:41:57 2020 +0200
>
>     re-align with old pony for cluster generator and unit tests
>
>     The general idea here is, if we find an email without a charset at all,
>     and we detect non-ascii characters in it, we assume it must be UTF-8 and
>     grab the raw bytes. We also convert it internally to a string for the
>     Body class, but we don't set the Body class' character set to anything.
>     This way, we keep the cluster generator happy by passing it bytes, while
>     keeping the rest happy by having a string representation that can be
>     unflowed. As DKIM does not use the msgbody itself, it won't be affected
>     by this change.

This explanation belongs in the code itself (as a comment), not only in the commit message.

> ---
>  tools/archiver.py           | 53 ++++++++++++++++++++++++++++++---------------
>  tools/plugins/generators.py |  7 +++---
>  2 files changed, 40 insertions(+), 20 deletions(-)
>
> diff --git a/tools/archiver.py b/tools/archiver.py
> index cfa3c3a..82ad32c 100755
> --- a/tools/archiver.py
> +++ b/tools/archiver.py
> @@ -143,9 +143,7 @@ def normalize_lid(lid: str) -> str:  # N.B. Also used by import-mbox.py
>      # Belt-and-braces: remove possible extraneous chars
>      lid = "<%s>" % lid.strip(" <>").replace("@", ".")
>      # Replace invalid characters with underscores so as to not invalidate doc IDs.
> -    lid = re.sub(
> -        r"[^-+~_<>.a-zA-Z0-9@]", "_", lid
> -    )
> +    lid = re.sub(r"[^-+~_<>.a-zA-Z0-9@]", "_", lid)
>      # Finally, ensure we have a loosely valid list ID value
>      if not re.match(r"^<.+\..+>$", lid):
>          print("Invalid list-id %s" % lid)
> @@ -172,24 +170,39 @@ def message_attachments(msg: email.message.Message) -> typing.Tuple[list, dict]:
>  class Body:
>      def __init__(self, part: email.message.Message):
>          self.content_type = part.get_content_type()
> -        self.charsets = set([part.get_content_charset()])  # Part's charset
> -        self.charsets.update(
> -            [part.get_charsets()[0]]
> -        )  # Parent charset as fallback if any/different
> -        self.character_set = "us-ascii"
> +        self.charsets = [part.get_content_charset()]  # Part's charset
> +        parent_charset = part.get_charsets()[0]
> +        if parent_charset and parent_charset != self.charsets[0]:
> +            self.charsets.append(
> +                parent_charset
> +            )  # Parent charset as fallback if any/different
> +        self.character_set = None
> +        self.has_charset = False
>          self.string: typing.Optional[str] = None
>          self.flowed = "format=flowed" in part.get("content-type", "")
> -        contents = part.get_payload(decode=True)
> -        if contents is not None:
> -            for cs in self.charsets:
> -                if cs:
> +        self.bytes = part.get_payload(decode=True)
> +        if self.bytes is not None:
> +            valid_encodings = [x for x in self.charsets if x]
> +            if valid_encodings:
> +                for cs in valid_encodings:
>                      try:
> -                        self.string = contents.decode(cs)
> +                        self.string = self.bytes.decode(cs)
>                          self.character_set = str(cs)
> +                        self.has_charset = True
> +                        break
>                      except UnicodeDecodeError:
>                          pass
>              if not self.string:
> -                self.string = contents.decode("us-ascii", errors="replace")
> +                self.string = self.bytes.decode("us-ascii", errors="replace")
> +                if valid_encodings:
> +                    self.character_set = "us-ascii"
> +                # If no character encoding, but we find non-ASCII chars, assume bytes were UTF-8
> +                elif len(self.bytes) != len(self.bytes.decode("us-ascii", "ignore")):
> +                    part.set_charset("utf-8")
> +                    self.bytes = part.get_payload(decode=True)
> +                    # Set the .string, but not a character set, as we don't know it for sure.
> +                    # This is mainly so the older generators won't barf.
> +                    self.string = self.bytes.decode("utf-8", "replace")
>
>      def __repr__(self):
>          return self.string
> @@ -200,8 +213,8 @@ class Body:
>      def assign(self, new_string):
>          self.string = new_string
>
> -    def encode(self, charset="utf-8", errors="strict"):
> -        return self.string.encode(charset, errors=errors)
> +    def encode(self, encoding="utf-8", errors="strict"):
> +        return self.string.encode(encoding=encoding, errors=errors)
>
>      def unflow(self, convert_lf=False):
>          """Unflows text of type format=flowed.
> @@ -405,7 +418,12 @@ class Archiver(object):  # N.B. Also used by import-mbox.py
>                  if generator:
>                      try:
>                          mid = plugins.generators.generate(
> -                            generator, msg, body, lid, attachments, raw_msg
> +                            generator,
> +                            msg,
> +                            body if body.character_set else body.bytes,
> +                            lid,
> +                            attachments,
> +                            raw_msg,
>                          )
>                      except Exception as err:
>                          if logger:
> @@ -431,6 +449,7 @@ class Archiver(object):  # N.B. Also used by import-mbox.py
>                      irt = ""
>              all_mids = list(id_set)  # Convert to list
>              document_id = all_mids[0]
> +
>              output_json = {
>                  "from_raw": msg_metadata["from"],
>                  "from": msg_metadata["from"],
> diff --git a/tools/plugins/generators.py b/tools/plugins/generators.py
> index 122633d..79ae9c9 100644
> --- a/tools/plugins/generators.py
> +++ b/tools/plugins/generators.py
> @@ -234,6 +234,8 @@ def medium(msg, body, lid, _attachments, _raw_msg):
>  # as the archived-at may change from node to node (and will change if not in the raw mbox file)
>  # Also the lid is not included in the hash, so the hash does not change if the lid is overridden
>  #
> +
> +
>  def cluster(msg, body, lid, attachments, _raw_msg):
>      """
>      Use data that is guaranteed to be the same across cluster setups
> @@ -268,16 +270,15 @@ def cluster(msg, body, lid, attachments, _raw_msg):
>      # Use text body
>      if not body:  # Make sure body is not None, which will fail.
>          body = ""
> -    xbody = body.encode('utf-8', 'ignore')
> +    xbody = body if type(body) is bytes else body.encode('utf-8', errors='ignore')
>
>      # Crop out any trailing whitespace in body
>      xbody = re.sub(b"\s+$", b"", xbody)
>
>      # Use Message-Id (or '' if missing)
> -    xbody += bytes(msg.get('Message-Id', ''), encoding='ascii')
> +    xbody += bytes(msg.get('message-id', ''), encoding='ascii')
>
>      # Use Date header. Don't use archived-at, as the archiver sets this if not present.
> -    mdate = None
>      mdatestring = "(null)"  # Default to null, ONLY changed if replicable across imports
>      try:
>          mdate = email.utils.parsedate_tz(msg.get('date'))
>

Reply via email to