---
 doc/Makefile.local          |   2 +-
 doc/examples/filter.py      | 385 ++++++++++++++++++++++++++++++++++++
 doc/man1/notmuch-config.rst |   7 +
 3 files changed, 393 insertions(+), 1 deletion(-)
 create mode 100755 doc/examples/filter.py

diff --git a/doc/Makefile.local b/doc/Makefile.local
index 64e51475..e3c9c879 100644
--- a/doc/Makefile.local
+++ b/doc/Makefile.local
@@ -39,7 +39,7 @@ COPY_INFO1 := $(patsubst 
$(DOCBUILDDIR)/man/man1/%.1,$(DOCBUILDDIR)/texinfo/%.in
 INFO_INFO_FILES := $(INFO_TEXI_FILES:.texi=.info) $(COPY_INFO1)
 
 EXAMPLES_FILES := doc/examples/firejail.profile
-EXAMPLES_FILES_EXEC := doc/examples/bwrap.sh
+EXAMPLES_FILES_EXEC := doc/examples/bwrap.sh doc/examples/filter.py
 
 .PHONY: sphinx-html sphinx-texinfo sphinx-info
 
diff --git a/doc/examples/filter.py b/doc/examples/filter.py
new file mode 100755
index 00000000..52923e84
--- /dev/null
+++ b/doc/examples/filter.py
@@ -0,0 +1,385 @@
+#!/usr/bin/python3
+
+import argparse
+from collections import namedtuple
+from contextlib import ExitStack, closing
+import email
+import email.policy as email_policy
+import logging
+import mimetypes
+import os
+import re
+import shutil
+import subprocess
+import sys
+from tempfile import NamedTemporaryFile, TemporaryDirectory
+
+try:
+    import libarchive
+except ImportError:
+    libarchive = None
+
+try:
+    import magic
+except ImportError:
+    magic = None
+
+MAX_RECURSE_DEFAULT = 8
+
+# archives, extracted with libarchive
+mimetypes_archive = {
+    'application/zip',
+    'application/x-zip',
+    'application/x-zip-compressed',
+    'application/rar',
+    'application/vnd.rar',
+    'application/x-rar',
+    'application/x-rar-compressed',
+    'application/x-7z-compressed',
+    'application/x-tar',
+    'application/x-tar-gz',
+    'application/x-compressed-tar',
+    'application/x-compressed',
+    'application/x-bzip-compressed-tar',
+    'application/x-gtar',
+    'application/x-gtar-compressed',
+    'application/x-tbz',
+    'application/vnd.ms-cab-compressed',
+    'application/x-ace-compressed',
+}
+
+# these may be compressed individual files rather than archives (handled
+# via libarchive's 'raw' format), but sometimes they are mistyped compressed
+# tar archives
+mimetypes_compressed = {
+    'application/gzip',
+    'application/x-gzip',
+    'application/x-gunzip',
+    'application/x-xz',
+    'application/x-bzip',
+    'application/x-bzip2',
+    'application/bzip2',
+    'application/x-bzip',
+}
+
+# types that can be indexed directly as text
+mimetypes_passthrough = {
+    'application/x-patch',
+    'application/x-diff',
+    'application/x-sh',
+    'application/x-shellscript',
+    'application/x-csh',
+    'application/x-ruby',
+    'application/x-tex',
+    'application/x-perl',
+    'application/x-httpd-php',
+    'application/javascript',
+    'application/x-javascript',
+    'application/emacs-lisp',
+    'application/json',
+    'application/x-subrip',
+    'application/x-config',
+    'application/xml',
+    'application/ics',
+    'application/x-po',
+    'application/x-info',
+    'application/x-texinfo',
+    'application/vnd.lotus-organizer',
+}
+
+# generic or unreliable mime types;
+# guess the real type from file contents or filename extension
+mimetypes_guess = {
+    'application/octet-stream',
+    'application/octetstream',
+    'octet/stream',
+    'application/mac-binary',
+    'application/macbinary',
+    'application/text',
+    'application/text-plain',
+    'application/x-download',
+    'application/force-download',
+}
+
+class InputData:
+    """
+    A piece of input data together with its MIME type and a logger.
+
+    The payload is given either as a bytes object or as a readable binary
+    stream (exactly one of the two); a stream is consumed on the first call
+    to bytes(). get_filename() additionally makes the payload available as a
+    named temporary file, which is removed again by close().
+    """
+    mime_type = None
+
+    logger    = None
+
+    # original filename associated with the data
+    # NOT what is returned by get_filename()
+    _orig_filename = None
+
+    _bytes      = None
+    _stream     = None
+    _tempfile   = None
+
+    def __init__(self, log_parent,
+                 bytes = None, stream = None,
+                 filename = None, mime_type = None):
+        """
+        log_parent -- logger whose child is used for messages about this data
+        bytes      -- payload as a bytes object
+        stream     -- payload as a binary stream providing read()
+        filename   -- original file name of the payload, if known
+        mime_type  -- declared MIME type; when missing or one of the generic
+                      types in mimetypes_guess, the type is guessed instead
+
+        Raises ValueError unless exactly one of bytes/stream is provided.
+        """
+        if int(bytes is not None) + int(stream is not None) != 1:
+            raise ValueError('Exactly one of bytes/stream must be provided')
+
+        if bytes is not None:
+            self._bytes = bytes
+        elif stream is not None:
+            self._stream = stream
+
+        self._orig_filename = filename
+
+        if mime_type:
+            mime_type = mime_type.lower()
+
+        if mime_type is None or mime_type in mimetypes_guess:
+            mime_type = self._guess_mimetype(filename)
+
+        self.mime_type = mime_type
+
+        self.logger = log_parent.getChild(
+            '|' + (filename or self.mime_type) + '|')
+
+    def _guess_mimetype(self, filename = None):
+        """
+        Guess the MIME type from the payload (when the magic module is
+        available) or else from the extension of filename.
+
+        Returns 'application/octet-stream' when nothing better is found.
+        """
+        mime_type = None
+
+        if magic:
+            t = magic.from_buffer(self.bytes(), mime = True)
+
+            if t != 'application/octet-stream':
+                mime_type = t
+
+        if not mime_type and filename:
+            # guess_file_type() was only added in Python 3.13; on older
+            # versions fall back to guess_type(), which accepts file names too
+            guess = getattr(mimetypes, 'guess_file_type', mimetypes.guess_type)
+            t = guess(filename, strict = False)
+            if t:
+                mime_type = t[0]
+
+        return mime_type or 'application/octet-stream'
+
+    def get_filename(self):
+        """
+        Return the path of a temporary file holding the payload.
+
+        The file is created on first use and keeps the extension of the
+        original filename (if any); it lives until close() is called.
+        """
+        if not self._tempfile:
+            ext = None
+            if self._orig_filename:
+                ext = os.path.splitext(self._orig_filename)[1]
+
+            self._tempfile = NamedTemporaryFile(suffix = ext)
+            self._tempfile.write(self.bytes())
+            # make sure the data is on disk before handing the path to
+            # an external program
+            self._tempfile.flush()
+            os.fsync(self._tempfile.fileno())
+
+        return self._tempfile.name
+
+    def bytes(self):
+        """
+        Return the payload as a bytes object, reading the stream to its end
+        first if the data was supplied as a stream.
+        """
+        if self._stream:
+            b = self._stream.read()
+            self._stream = None
+            self._bytes = (self._bytes or b'') + b
+        return self._bytes
+
+    def close(self):
+        """Close (and thereby remove) the temporary file, if one was made."""
+        if self._tempfile:
+            self._tempfile.close()
+            self._tempfile = None
+
+
+# Description of an external conversion program:
+#   cmdline      -- argument vector; each item is passed through str.format()
+#                   with the 'file_in' and 'tempdir' fields before execution
+#   need_tempdir -- run the program inside a freshly created temporary
+#                   directory (also available as {tempdir})
+#   need_file_in -- pass the input as a file path ({file_in}) rather than
+#                   on standard input
+Handler = namedtuple('Handler', ('cmdline', 'need_tempdir', 'need_file_in'),
+                     defaults = (False, False))
+# Maps a regular expression, which must match the whole MIME type, to the
+# candidate handlers for that type; candidates are tried in order and the
+# first one whose executable is found in PATH is used (see get_handler()).
+handler_map = {
+    'application/(x-)?pdf' : (
+        Handler(cmdline = ('pdftotext', '-', '-')),
+    ),
+    '(application|text)/html' : (
+        Handler(cmdline = ('elinks', '-force-html', '-dump')),
+        Handler(cmdline = ('w3m', '-T', 'text/html', '-dump')),
+    ),
+    r'application/(vnd\.openxmlformats-officedocument\..*|'
+                 r'vnd\.oasis\.opendocument\..*|'
+                 r'vnd\.ms-powerpoint|vnd\.ms-excel|'
+                  '(ms)?word|docx?|rtf)': (
+        Handler(cmdline = ('soffice', 
'-env:UserInstallation=file://{tempdir}', '--cat', '{file_in}'),
+                           need_tempdir = True, need_file_in = True),
+    ),
+}
+
+def get_handler(mime_type):
+    """
+    Find an external handler for mime_type.
+
+    Returns the first Handler from handler_map whose pattern matches the
+    whole MIME type and whose executable is found in PATH, or None when
+    there is no such handler.
+    """
+    for pattern, candidates in handler_map.items():
+        if re.fullmatch(pattern, mime_type):
+            for candidate in candidates:
+                executable = candidate.cmdline[0]
+                if shutil.which(executable):
+                    return candidate
+
+    return None
+
+def handle_default(data, outfile):
+    """
+    Convert data to text with an external handler and write it to outfile.
+
+    When no handler is available, text/* and the types listed in
+    mimetypes_passthrough are copied to outfile unchanged; anything else is
+    skipped. A failing external handler is logged, not treated as fatal.
+
+    data    -- InputData to process
+    outfile -- binary output file; also used as the handler's stdout
+    """
+    handler = get_handler(data.mime_type)
+
+    if not handler:
+        if (data.mime_type.startswith('text/') or
+            data.mime_type in mimetypes_passthrough):
+            data.logger.debug('passthrough: %s', data.mime_type)
+            outfile.write(data.bytes())
+            outfile.flush()
+        else:
+            data.logger.debug('no handler, skipping: %s', data.mime_type)
+
+        return
+
+    with ExitStack() as stack:
+        tempdir = None
+        if handler.need_tempdir:
+            tempdir = stack.enter_context(TemporaryDirectory())
+
+        args = { 'cwd' : tempdir }
+
+        # the handler gets its input either as a file path or on stdin
+        file_in = None
+        if handler.need_file_in:
+            file_in = data.get_filename()
+        else:
+            args['input'] = data.bytes()
+
+        cmdline = [arg.format(file_in = file_in, tempdir = tempdir)
+                   for arg in handler.cmdline]
+
+        data.logger.debug('executing external handler: %s', ' '.join(cmdline))
+
+        ret = subprocess.run(cmdline, stdout = outfile, **args)
+        if ret.returncode != 0:
+            data.logger.warning('external handler %s exited with status %d',
+                                cmdline[0], ret.returncode)
+
+def handle_archive(data, outfile, depth, max_depth):
+    """
+    Write the name of each archive member to outfile and process the
+    contents of every regular-file member recursively with handle_data().
+
+    data      -- InputData holding the archive (or compressed file)
+    outfile   -- binary output file
+    depth     -- nesting depth of data; members are handled at depth + 1
+    max_depth -- nesting limit, passed on to handle_data()
+
+    Errors raised by libarchive while reading are logged and end processing
+    of this archive.
+    """
+    # mime types in mimetypes_compressed may be either an archive (a compressed
+    # tarball) or a compressed individual file;
+    # the following heuristic first tries the former, falling back on the
+    # latter on error
+    format_name = 'all'
+    if data.mime_type in mimetypes_compressed:
+        try:
+            with libarchive.memory_reader(data.bytes(),
+                                          format_name = format_name):
+                pass
+        except libarchive.exception.ArchiveError:
+            format_name = 'raw'
+
+    try:
+        with libarchive.memory_reader(data.bytes(),
+                                      format_name = format_name) as archive:
+            for entry in archive:
+                name = None
+                # the raw format is only ever used when requested above,
+                # so test the format we asked for
+                if format_name == 'raw':
+                    # for raw formats the filename is typically
+                    # something like foo.pdf.gz; derive the member name from
+                    # the name of the data being unpacked (not from the
+                    # top-level attachment, which differs for nested archives)
+                    if data._orig_filename:
+                        name = os.path.splitext(data._orig_filename)[0]
+                elif entry.name:
+                    name = entry.name
+                    # this should not happen, but apparently sometimes does
+                    if isinstance(name, bytes):
+                        name = name.decode(errors = 'replace')
+
+                if name:
+                    outfile.write(name.encode(errors = 'replace') + b'\n')
+                    outfile.flush()
+
+                if not entry.isfile:
+                    continue
+
+                blocks = []
+                for b in entry.get_blocks():
+                    # force data copy, as the buffer underlying b may be reused
+                    blocks.append(b[:])
+
+                with closing(InputData(data.logger,
+                                       bytes = b''.join(blocks),
+                                       filename = name)) as data_e:
+                    handle_data(data_e, outfile, depth + 1, max_depth)
+                    outfile.write(b'\n')
+                    outfile.flush()
+    except libarchive.exception.ArchiveError as e:
+        data.logger.error('Error reading archive: %s', str(e))
+
+def handle_email(msg, outfile, depth, max_depth, logger):
+    """
+    Write the main headers of an e-mail message to outfile, then process
+    each of its non-multipart parts with handle_data().
+
+    msg       -- parsed email message
+    outfile   -- binary output file
+    depth     -- nesting depth of the message; parts are handled at depth + 1
+    max_depth -- nesting limit, passed on to handle_data()
+    logger    -- parent logger for the per-part loggers
+    """
+    headers = ('From', 'To', 'Subject', 'Date')
+    for header in headers:
+        val = msg.get(header)
+        if val:
+            line = '%s: %s\n' % (header, val)
+            outfile.write(line.encode(errors = 'replace'))
+            outfile.flush()
+
+    outfile.write(b'\n')
+    outfile.flush()
+
+    for part in msg.walk():
+        if part.is_multipart():
+            continue
+
+        payload = part.get_content()
+        if isinstance(payload, str):
+            payload = payload.encode(errors = 'replace')
+
+        # close the part's data when done, so that a temporary file created
+        # for an external handler does not outlive the part
+        with closing(InputData(logger,
+                               bytes = payload,
+                               filename = part.get_filename(),
+                               mime_type = part.get_content_type())) as data:
+            handle_data(data, outfile, depth + 1, max_depth)
+
+def handle_mbox(data, outfile, depth, max_depth):
+    """
+    Split mbox data into individual messages and process each of them with
+    handle_email().
+
+    Messages are delimited by lines starting with 'From '; each such line is
+    copied to outfile as-is. Lines preceding the first delimiter, if any,
+    are treated as a message of their own.
+
+    data      -- InputData holding the mbox
+    outfile   -- binary output file
+    depth     -- nesting depth, passed on to handle_email()
+    max_depth -- nesting limit, passed on to handle_email()
+    """
+    # unfortunately we cannot use python's mailbox handling, because it is
+    # file-based; so instead we roll a very simple mbox parser below
+    lines = data.bytes().decode(errors = 'replace').splitlines()
+
+    def flush_msg(msg_lines):
+        if not msg_lines:
+            return
+
+        msg = email.message_from_string('\r\n'.join(msg_lines),
+                                        policy = email_policy.SMTP)
+        handle_email(msg, outfile, depth, max_depth, data.logger)
+
+    buffered_lines = []
+    for line in lines:
+        if line.startswith('From '):
+            # a new message starts here: emit the previous one and start over
+            # with an empty buffer, so that its lines do not end up in the
+            # following messages as well
+            flush_msg(buffered_lines)
+            buffered_lines = []
+
+            outfile.write(line.encode() + b'\n')
+            outfile.flush()
+            continue
+
+        # add current line to current-message buffer
+        buffered_lines.append(line)
+
+    flush_msg(buffered_lines)
+
+def handle_data(data, outfile, depth, max_depth):
+    """
+    Dispatch data to the handler appropriate for its MIME type.
+
+    Archives and compressed files go to handle_archive(), mboxes to
+    handle_mbox(), single e-mail messages to handle_email() and everything
+    else to handle_default(). Returns False when the data is discarded
+    because the nesting limit is exceeded or libarchive is missing;
+    otherwise returns whatever the selected handler returns.
+    """
+    mime_type = data.mime_type
+    log = data.logger
+
+    if depth > max_depth:
+        log.error('Maximum recursion depth %d reached, discarding %s',
+                  max_depth, mime_type)
+        return False
+
+    if mime_type in mimetypes_compressed or mime_type in mimetypes_archive:
+        if libarchive:
+            return handle_archive(data, outfile, depth, max_depth)
+
+        log.error('libarchive not available, cannot handle %s', mime_type)
+        return False
+
+    if mime_type == 'application/mbox':
+        return handle_mbox(data, outfile, depth, max_depth)
+
+    if mime_type == 'message/rfc822':
+        message = email.message_from_bytes(data.bytes(),
+                                           policy = email_policy.SMTP)
+        return handle_email(message, outfile, depth, max_depth, log)
+
+    return handle_default(data, outfile)
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument('-v', '--verbose', action = 'count', default = 0)
+parser.add_argument('-q', '--quiet',   action = 'count', default = 0)
+# parse the limit as an integer: it is compared with the integer nesting
+# depth in handle_data(), which raises TypeError for a string value
+parser.add_argument('-m', '--max-recurse', type = int,
+                    default = MAX_RECURSE_DEFAULT)
+
+args = parser.parse_args()
+
+# metadata about the part being filtered, taken from the environment
+msgid     = os.environ.get('NOTMUCH_FILTER_MESSAGE_ID', '<unknown>')
+mime_type = os.environ.get('NOTMUCH_FILTER_MIME_TYPE')
+filename  = os.environ.get('NOTMUCH_FILTER_FILENAME')
+
+# the default level is INFO (20); every -v lowers it and every -q raises it
+# by one step of 10
+logging.basicConfig(level = max(2 - args.verbose + args.quiet, 0) * 10)
+
+# data to be filtered is read from stdin, the result is written to stdout
+outfile = sys.stdout.buffer
+
+with closing(InputData(logging.getLogger(msgid),
+                       stream = sys.stdin.buffer,
+                       filename = filename, mime_type = mime_type)) as data:
+    handle_data(data, outfile, 0, args.max_recurse)
diff --git a/doc/man1/notmuch-config.rst b/doc/man1/notmuch-config.rst
index 3afea860..61ab4280 100644
--- a/doc/man1/notmuch-config.rst
+++ b/doc/man1/notmuch-config.rst
@@ -190,6 +190,13 @@ paths are presumed relative to `$HOME` for items in section
    * NOTMUCH_FILTER_MESSAGE_ID - the message ID, without enclosing angle
      brackets <>
 
+   A sample filtering script is shipped with notmuch documentation (typically
+   in ``/usr/share/doc/notmuch``) as ``examples/filter.py``. You may use it
+   e.g. as follows:
+
+   * ``notmuch config set index.as_text text/html application/pdf``
+   * ``notmuch config set index.filter '<your sandbox>
+     <.../examples/filter.py>'``
+
    History: This configuration value was introduced in notmuch 0.41.
 
 .. nmconfig:: index.decrypt
-- 
2.47.3

_______________________________________________
notmuch mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to