---
 doc/Makefile.local          |   2 +-
 doc/examples/filter.py      | 273 ++++++++++++++++++++++++++++++++++++
 doc/man1/notmuch-config.rst |   7 +
 3 files changed, 281 insertions(+), 1 deletion(-)
 create mode 100755 doc/examples/filter.py

diff --git a/doc/Makefile.local b/doc/Makefile.local
index 64e51475..e3c9c879 100644
--- a/doc/Makefile.local
+++ b/doc/Makefile.local
@@ -39,7 +39,7 @@ COPY_INFO1 := $(patsubst 
$(DOCBUILDDIR)/man/man1/%.1,$(DOCBUILDDIR)/texinfo/%.in
 INFO_INFO_FILES := $(INFO_TEXI_FILES:.texi=.info) $(COPY_INFO1)
 
 EXAMPLES_FILES := doc/examples/firejail.profile
-EXAMPLES_FILES_EXEC := doc/examples/bwrap.sh
+EXAMPLES_FILES_EXEC := doc/examples/bwrap.sh doc/examples/filter.py
 
 .PHONY: sphinx-html sphinx-texinfo sphinx-info
 
diff --git a/doc/examples/filter.py b/doc/examples/filter.py
new file mode 100755
index 00000000..17f80f8e
--- /dev/null
+++ b/doc/examples/filter.py
@@ -0,0 +1,273 @@
+#!/usr/bin/python3
+
+import argparse
+from collections import namedtuple
+from contextlib import ExitStack, closing
+import mimetypes
+import os
+import re
+import shutil
+import subprocess
+import sys
+from tempfile import NamedTemporaryFile, TemporaryDirectory
+
+try:
+    import libarchive
+except ImportError:
+    libarchive = None
+
+try:
+    import magic
+except ImportError:
+    magic = None
+
+# MIME types of multi-file archives; input of these types is opened with
+# libarchive and every regular file found inside is filtered on its own
+mimetypes_archive = {
+    'application/zip',
+    'application/x-zip',
+    'application/x-zip-compressed',
+    'application/rar',
+    'application/x-rar',
+    'application/x-rar-compressed',
+    'application/x-7z-compressed',
+    'application/x-tar',
+    'application/x-compressed-tar',
+    'application/x-compressed',
+    'application/x-bzip-compressed-tar',
+    'application/x-gtar',
+    'application/x-gtar-compressed',
+    'application/x-tbz',
+    'application/vnd.ms-cab-compressed',
+    'application/x-ace-compressed',
+}
+
+# these may be compressed individual files rather than archives (handled
+# via libarchive's 'raw' format), but sometimes they are mistyped compressed
+# tar archives, so both interpretations are tried
+mimetypes_compressed = {
+    'application/gzip',
+    'application/x-gzip',
+    'application/x-gunzip',
+    'application/x-xz',
+    'application/x-bzip',
+    'application/x-bzip2',
+    'application/bzip2',
+}
+
+# types that can be indexed directly as text; when no converter is
+# available for them, their content is copied to the output unchanged
+# (text/* types are treated the same way without being listed here)
+mimetypes_passthrough = {
+    'application/x-patch',
+    'application/x-diff',
+    'application/x-sh',
+    'application/x-csh',
+    'application/x-ruby',
+    'application/x-tex',
+    'application/x-perl',
+    'application/x-httpd-php',
+    'application/x-javascript',
+    'application/emacs-lisp',
+    'application/json',
+    'application/x-subrip',
+    'application/x-config',
+    'application/xml',
+    'application/ics',
+    'application/x-po',
+    'application/x-info',
+    'application/vnd.lotus-organizer',
+}
+
+# generic or unreliable mime types;
+# when the declared type is one of these (or is missing altogether), it is
+# ignored and the real type is guessed from file contents or filename
+# extension
+mimetypes_guess = {
+    'application/octet-stream',
+    'application/octetstream',
+    'octet/stream',
+    'application/mac-binary',
+    'application/macbinary',
+    'application/text',
+    'application/text-plain',
+    'application/x-download',
+    'application/force-download',
+}
+
+class InputData:
+    """
+    A piece of data to be filtered, together with its MIME type.
+
+    The data is supplied either as a bytes object or as a readable binary
+    stream; a stream is only read when its contents are first needed.
+    Call close() when done (e.g. through contextlib.closing) to remove the
+    temporary file that get_filename() may have created.
+    """
+
+    # effective MIME type: the declared one, or a guess when the declared
+    # type is missing or listed in mimetypes_guess
+    mime_type = None
+
+    _bytes      = None
+    _stream     = None
+    _tempfile   = None
+
+    def __init__(self, bytes = None, stream = None,
+                 filename = None, mime_type = None):
+        """
+        bytes:     the data, as a bytes object
+        stream:    binary stream to read the data from
+        filename:  name of the data, if known; only used for type guessing
+        mime_type: declared MIME type, if known
+
+        Exactly one of bytes/stream must be given, otherwise ValueError is
+        raised.
+        """
+        if (bytes is None) == (stream is None):
+            raise ValueError('Exactly one of bytes/stream must be provided')
+
+        if bytes is not None:
+            self._bytes = bytes
+        else:
+            self._stream = stream
+
+        if mime_type is None or mime_type in mimetypes_guess:
+            mime_type = self._guess_mimetype(filename)
+
+        self.mime_type = mime_type
+
+    def _guess_mimetype(self, filename = None):
+        """
+        Guess the MIME type, preferring the data contents (when the magic
+        module is available) over the filename extension.
+
+        Returns 'application/octet-stream' when nothing better is found.
+        """
+        mime_type = None
+
+        if magic:
+            t = magic.from_buffer(self.bytes(), mime = True)
+            if t != 'application/octet-stream':
+                mime_type = t
+
+        if not mime_type and filename:
+            # mimetypes.guess_file_type() only exists in Python >= 3.13;
+            # on older versions guess_type() accepts a file path as well
+            guess = getattr(mimetypes, 'guess_file_type',
+                            mimetypes.guess_type)
+            t = guess(filename, strict = False)
+            if t:
+                mime_type = t[0]
+
+        return mime_type or 'application/octet-stream'
+
+    def get_filename(self):
+        """
+        Return the path of a temporary file holding the data.
+
+        The file is created on first use and exists until close() is called.
+        """
+        if not self._tempfile:
+            self._tempfile = NamedTemporaryFile()
+            self._tempfile.write(self.bytes())
+            # make sure the contents are on disk before another program
+            # opens the file by name
+            self._tempfile.flush()
+            os.fsync(self._tempfile.fileno())
+
+        return self._tempfile.name
+
+    def bytes(self):
+        """
+        Return the data as a bytes object, reading the stream if necessary.
+        """
+        if self._stream:
+            b = self._stream.read()
+            # the stream is consumed now, later calls return the cached data
+            self._stream = None
+            self._bytes = (self._bytes or b'') + b
+        return self._bytes
+
+    def close(self):
+        """
+        Remove the temporary file, if one was created.
+        """
+        if self._tempfile:
+            self._tempfile.close()
+            self._tempfile = None
+
+
+# Description of an external converter program:
+#   cmdline      - argv of the program; its items may contain the
+#                  str.format() placeholders {file_in} and {tempdir}
+#   need_tempdir - run the program inside a fresh temporary directory,
+#                  whose path is substituted for {tempdir}
+#   need_file_in - hand the input over as a named temporary file, whose
+#                  path is substituted for {file_in}, instead of on stdin
+Handler = namedtuple('Handler', ('cmdline', 'need_tempdir', 'need_file_in'),
+                     defaults = (False, False))
+
+# Maps a regular expression, which has to match the whole MIME type, to the
+# converters able to handle such data, in order of preference. The converted
+# text is taken from the converter's standard output.
+handler_map = {
+    r'application/(x-)?pdf' : (
+        Handler(cmdline = ('pdftotext', '-', '-')),
+    ),
+    r'(application|text)/html' : (
+        Handler(cmdline = ('elinks', '-force-html', '-dump')),
+        Handler(cmdline = ('w3m', '-T', 'text/html', '-dump')),
+    ),
+    r'application/(vnd\.openxmlformats-officedocument\..*|'
+                 r'vnd\.oasis\.opendocument\..*|'
+                 r'vnd\.ms-powerpoint|vnd\.ms-excel|'
+                 r'(ms)?word|docx?|rtf)': (
+        Handler(cmdline = ('soffice',
+                           '-env:UserInstallation=file://{tempdir}',
+                           '--cat', '{file_in}'),
+                need_tempdir = True, need_file_in = True),
+    ),
+}
+
+def get_handler(mime_type):
+    """
+    Return the first usable Handler for mime_type, or None.
+
+    A handler is usable when its pattern in handler_map matches the whole
+    MIME type and its program is found in PATH.
+    """
+    for pattern, candidates in handler_map.items():
+        if re.fullmatch(pattern, mime_type):
+            for candidate in candidates:
+                if shutil.which(candidate.cmdline[0]):
+                    return candidate
+
+    return None
+
+def handle_file(data, outfile):
+    """
+    Convert data to text and write the result to outfile.
+
+    data:    InputData instance to be converted
+    outfile: binary file object receiving the text
+
+    When a converter program is available for the type of the data, it is
+    run with its standard output connected to outfile. Otherwise textual
+    types are copied through unchanged and everything else produces no
+    output.
+    """
+    handler = get_handler(data.mime_type)
+
+    if not handler:
+        if (data.mime_type.startswith('text/') or
+            data.mime_type in mimetypes_passthrough):
+            outfile.write(data.bytes())
+            outfile.flush()
+
+        return
+
+    with ExitStack() as stack:
+        # the temporary directory, if any, is removed on leaving the block
+        tempdir = None
+        if handler.need_tempdir:
+            tempdir = stack.enter_context(TemporaryDirectory())
+
+        args = { 'cwd' : tempdir }
+
+        # the input goes to the converter either as a named file or on stdin
+        file_in = None
+        if handler.need_file_in:
+            file_in = data.get_filename()
+        else:
+            args['input'] = data.bytes()
+
+        cmdline = [arg.format(file_in = file_in, tempdir = tempdir)
+                   for arg in handler.cmdline]
+
+        # the exit status of the converter is deliberately not checked
+        subprocess.run(cmdline, stdout = outfile, **args)
+
+
+# no options are defined; the parser only provides --help and rejects
+# unexpected arguments
+parser = argparse.ArgumentParser()
+
+args = parser.parse_args()
+
+# declared type and filename of the data are read from the environment,
+# the data itself from stdin; the text to be indexed goes to stdout
+mime_type = os.environ.get('NOTMUCH_FILTER_MIME_TYPE')
+filename  = os.environ.get('NOTMUCH_FILTER_FILENAME')
+
+outfile = sys.stdout.buffer
+
+with closing(InputData(stream = sys.stdin.buffer,
+                       filename = filename, mime_type = mime_type)) as data:
+    if data.mime_type in mimetypes_compressed | mimetypes_archive:
+        if not libarchive:
+            print('libarchive not available, cannot handle %s' %
+                  data.mime_type, file = sys.stderr)
+            sys.exit(1)
+
+        # mime types in mimetypes_compressed may be either an archive
+        # (compressed tar) or a compressed individual file;
+        # the following heuristic first tries the former, falling back on
+        # the latter on error
+        format_name = 'all'
+        if data.mime_type in mimetypes_compressed:
+            try:
+                with libarchive.memory_reader(data.bytes(),
+                                              format_name = format_name):
+                    pass
+            except libarchive.ArchiveError:
+                format_name = 'raw'
+
+        with libarchive.memory_reader(data.bytes(),
+                                      format_name = format_name) as archive:
+            for entry in archive:
+                if not entry.isfile:
+                    continue
+
+                name = None
+                # test the format that was requested above; this does not
+                # depend on whether the bindings report
+                # archive.format_name as bytes or as str
+                if format_name == 'raw':
+                    # for raw formats the filename is typically
+                    # something like foo.pdf.gz
+                    if filename:
+                        name = os.path.splitext(filename)[0]
+                elif entry.name:
+                    name = entry.name
+                    if isinstance(name, bytes):
+                        name = name.decode(errors = 'replace')
+
+                # index the name of the archive member as well
+                if name:
+                    outfile.write(name.encode(errors = 'replace') + b'\n')
+                    outfile.flush()
+
+                # force data copy, as the buffer underlying each block may
+                # be reused
+                blocks = [b[:] for b in entry.get_blocks()]
+
+                # the type of each member is guessed from its contents/name
+                with closing(InputData(bytes = b''.join(blocks),
+                                       filename = name)) as data_e:
+                    handle_file(data_e, outfile)
+                    outfile.write(b'\n')
+                    outfile.flush()
+    else:
+        handle_file(data, outfile)
diff --git a/doc/man1/notmuch-config.rst b/doc/man1/notmuch-config.rst
index 935956fd..29a2fd1d 100644
--- a/doc/man1/notmuch-config.rst
+++ b/doc/man1/notmuch-config.rst
@@ -190,6 +190,13 @@ paths are presumed relative to `$HOME` for items in section
    * NOTMUCH_FILTER_MESSAGE_ID - the message ID, without enclosing angle
      brackets <>
 
+   A sample filtering script is shipped with the notmuch documentation
+   (typically in ``/usr/share/doc/notmuch``) as ``examples/filter.py``. For example:
+
+   * ``notmuch config set index.as_text text/html application/pdf``
+   * ``notmuch config set index.filter '<your sandbox>
+     <.../examples/filter.py>'``
+
    History: This configuration value was introduced in notmuch 0.40.
 
 .. nmconfig:: index.decrypt
-- 
2.47.3

_______________________________________________
notmuch mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to