---
doc/Makefile.local | 2 +-
doc/examples/filter.py | 273 ++++++++++++++++++++++++++++++++++++
doc/man1/notmuch-config.rst | 7 +
3 files changed, 281 insertions(+), 1 deletion(-)
create mode 100755 doc/examples/filter.py
diff --git a/doc/Makefile.local b/doc/Makefile.local
index 64e51475..e3c9c879 100644
--- a/doc/Makefile.local
+++ b/doc/Makefile.local
@@ -39,7 +39,7 @@ COPY_INFO1 := $(patsubst
$(DOCBUILDDIR)/man/man1/%.1,$(DOCBUILDDIR)/texinfo/%.in
INFO_INFO_FILES := $(INFO_TEXI_FILES:.texi=.info) $(COPY_INFO1)
EXAMPLES_FILES := doc/examples/firejail.profile
-EXAMPLES_FILES_EXEC := doc/examples/bwrap.sh
+EXAMPLES_FILES_EXEC := doc/examples/bwrap.sh doc/examples/filter.py
.PHONY: sphinx-html sphinx-texinfo sphinx-info
diff --git a/doc/examples/filter.py b/doc/examples/filter.py
new file mode 100755
index 00000000..17f80f8e
--- /dev/null
+++ b/doc/examples/filter.py
@@ -0,0 +1,273 @@
+#!/usr/bin/python3
+
+import argparse
+from collections import namedtuple
+from contextlib import ExitStack, closing
+import mimetypes
+import os
+import re
+import shutil
+import subprocess
+import sys
+from tempfile import NamedTemporaryFile, TemporaryDirectory
+
+try:
+ import libarchive
+except ImportError:
+ libarchive = None
+
+try:
+ import magic
+except ImportError:
+ magic = None
+
+# archives, extracted with libarchive
+# (input with one of these types is opened with libarchive.memory_reader()
+# by the main script below; each entry for which entry.isfile is true is
+# then filtered on its own through handle_file())
+mimetypes_archive = {
+ 'application/zip',
+ 'application/x-zip',
+ 'application/x-zip-compressed',
+ 'application/rar',
+ 'application/x-rar',
+ 'application/x-rar-compressed',
+ 'application/x-7z-compressed',
+ 'application/x-tar',
+ 'application/x-compressed-tar',
+ 'application/x-compressed',
+ 'application/x-bzip-compressed-tar',
+ 'application/x-gtar',
+ 'application/x-gtar-compressed',
+ 'application/x-tbz',
+ 'application/vnd.ms-cab-compressed',
+ 'application/x-ace-compressed',
+}
+
+# these may be compressed individual files rather than archives (handled
+# via libarchive's 'raw' format), but sometimes they are mistyped compressed
+# tar archives; the main script below therefore first tries to open them
+# as an archive and only falls back to 'raw' when that fails
+mimetypes_compressed = {
+    'application/gzip',
+    'application/x-gzip',
+    'application/x-gunzip',
+    'application/x-xz',
+    'application/x-bzip',
+    'application/x-bzip2',
+    'application/bzip2',
+}
+
+# types that can be indexed directly as text
+# (handle_file() copies such input to the output unchanged, but only when
+# no entry of handler_map provides a usable handler for the type;
+# any 'text/*' type is treated the same way without being listed here)
+mimetypes_passthrough = {
+ 'application/x-patch',
+ 'application/x-diff',
+ 'application/x-sh',
+ 'application/x-csh',
+ 'application/x-ruby',
+ 'application/x-tex',
+ 'application/x-perl',
+ 'application/x-httpd-php',
+ 'application/x-javascript',
+ 'application/emacs-lisp',
+ 'application/json',
+ 'application/x-subrip',
+ 'application/x-config',
+ 'application/xml',
+ 'application/ics',
+ 'application/x-po',
+ 'application/x-info',
+ 'application/vnd.lotus-organizer',
+}
+
+# generic or unreliable mime types;
+# guess the real type from file contents or filename extension
+# (InputData.__init__() discards a declared type found in this set and uses
+# the result of InputData._guess_mimetype() instead)
+mimetypes_guess = {
+ 'application/octet-stream',
+ 'application/octetstream',
+ 'octet/stream',
+ 'application/mac-binary',
+ 'application/macbinary',
+ 'application/text',
+ 'application/text-plain',
+ 'application/x-download',
+ 'application/force-download',
+}
+
+class InputData:
+    """
+    A piece of data to be filtered, together with its MIME type.
+
+    The data is supplied either as a bytes object or as a readable binary
+    stream; a stream is read in full the first time the contents are
+    needed.  close() should be called when done (e.g. through
+    contextlib.closing()) to release the temporary file that
+    get_filename() may have created.
+    """
+
+    # MIME type of the data: the one declared by the caller or, when that
+    # is missing or listed in mimetypes_guess, the guessed one
+    mime_type = None
+
+    _bytes = None       # data contents, once known
+    _stream = None      # stream the contents have yet to be read from
+    _tempfile = None    # NamedTemporaryFile created by get_filename()
+
+    def __init__(self, bytes = None, stream = None,
+                 filename = None, mime_type = None):
+        """
+        bytes, stream: the data itself; exactly one of the two must be
+            given, otherwise ValueError is raised
+        filename: name associated with the data, if any; only used to
+            guess the MIME type from its extension
+        mime_type: declared MIME type; when it is None or listed in
+            mimetypes_guess the type is guessed instead
+        """
+        if (bytes is None) == (stream is None):
+            raise ValueError('Exactly one of bytes/stream must be provided')
+
+        if bytes is not None:
+            self._bytes = bytes
+        else:
+            self._stream = stream
+
+        if mime_type is None or mime_type in mimetypes_guess:
+            mime_type = self._guess_mimetype(filename)
+
+        self.mime_type = mime_type
+
+    def _guess_mimetype(self, filename = None):
+        """
+        Guess the MIME type of the data.
+
+        The contents are examined first (only when the magic module could
+        be imported), then the extension of filename.  Returns
+        'application/octet-stream' when neither gives a result.
+        """
+        mime_type = None
+
+        if magic:
+            t = magic.from_buffer(self.bytes(), mime = True)
+            # a generic answer is no better than no answer; let the
+            # filename have a say in that case
+            if t != 'application/octet-stream':
+                mime_type = t
+
+        if not mime_type and filename:
+            # mimetypes.guess_file_type() only exists in Python >= 3.13;
+            # older versions provide guess_type(), which takes the same
+            # arguments
+            guess = getattr(mimetypes, 'guess_file_type',
+                            mimetypes.guess_type)
+            # the result is a (type, encoding) tuple, whose type is None
+            # for an unknown extension
+            mime_type = guess(filename, strict = False)[0]
+
+        return mime_type or 'application/octet-stream'
+
+    def get_filename(self):
+        """
+        Return the name of a temporary file holding the data.
+
+        The file is written on the first call and reused afterwards; it
+        exists until close() is called.
+        """
+        if not self._tempfile:
+            self._tempfile = NamedTemporaryFile()
+            self._tempfile.write(self.bytes())
+            # make sure the contents are on disk before handing out the name
+            self._tempfile.flush()
+            os.fsync(self._tempfile.fileno())
+
+        return self._tempfile.name
+
+    def bytes(self):
+        """
+        Return the data as a bytes object, reading the whole stream on
+        first use when the data was supplied as a stream.
+        """
+        if self._stream:
+            b = self._stream.read()
+            # the stream is exhausted now, never read from it again
+            self._stream = None
+            self._bytes = (self._bytes or b'') + b
+        return self._bytes
+
+    def close(self):
+        """
+        Close (and thereby remove) the temporary file, if one was created.
+        """
+        if self._tempfile:
+            self._tempfile.close()
+            self._tempfile = None
+
+
+# Description of an external program converting one input to text on its
+# standard output:
+# cmdline      - program and its arguments; every element is expanded with
+#                str.format(), '{tempdir}' and '{file_in}' being available
+# need_tempdir - run the program in a freshly created temporary directory,
+#                whose path is substituted for '{tempdir}'
+# need_file_in - store the input in a temporary file, whose name is
+#                substituted for '{file_in}', rather than sending it to the
+#                program's standard input
+# need_tempdir and need_file_in default to False.
+Handler = namedtuple('Handler', ('cmdline', 'need_tempdir', 'need_file_in'),
+ defaults = (False, False))
+# Maps a regular expression, which has to match a MIME type as a whole, to
+# the handlers usable for that type in order of preference; get_handler()
+# picks the first one whose program is found by shutil.which().
+handler_map = {
+ 'application/(x-)?pdf' : (
+ Handler(cmdline = ('pdftotext', '-', '-')),
+ ),
+ '(application|text)/html' : (
+ Handler(cmdline = ('elinks', '-force-html', '-dump')),
+ Handler(cmdline = ('w3m', '-T', 'text/html', '-dump')),
+ ),
+ r'application/(vnd\.openxmlformats-officedocument\..*|'
+ r'vnd\.oasis\.opendocument\..*|'
+ r'vnd\.ms-powerpoint|vnd\.ms-excel|'
+ '(ms)?word|docx?|rtf)': (
+ Handler(cmdline = ('soffice',
'-env:UserInstallation=file://{tempdir}', '--cat', '{file_in}'),
+ need_tempdir = True, need_file_in = True),
+ ),
+}
+
+def get_handler(mime_type):
+    """
+    Return the handler to use for mime_type, or None if there is none.
+
+    The patterns of handler_map are tried in order; for each one matching
+    the whole of mime_type, the first handler whose program is found by
+    shutil.which() is returned.
+    """
+    for pattern, candidates in handler_map.items():
+        if re.fullmatch(pattern, mime_type) is None:
+            continue
+
+        usable = next((candidate for candidate in candidates
+                       if shutil.which(candidate.cmdline[0])), None)
+        if usable is not None:
+            return usable
+
+    return None
+
+def handle_file(data, outfile):
+    """
+    Write the text form of data (an InputData) to the binary file outfile.
+
+    When get_handler() provides a handler for the data's MIME type, its
+    program is run with standard output redirected to outfile; the exit
+    status of the program is not checked.  Without a handler the data is
+    copied verbatim if its type is 'text/*' or listed in
+    mimetypes_passthrough, and nothing is written otherwise.
+    """
+    handler = get_handler(data.mime_type)
+
+    if handler is None:
+        is_text = (data.mime_type in mimetypes_passthrough or
+                   data.mime_type.startswith('text/'))
+        if is_text:
+            outfile.write(data.bytes())
+            outfile.flush()
+        return
+
+    with ExitStack() as stack:
+        # keyword arguments for subprocess.run()
+        run_args = { 'cwd' : None }
+        if handler.need_tempdir:
+            # removed again when the ExitStack is left
+            run_args['cwd'] = stack.enter_context(TemporaryDirectory())
+
+        # values available to the placeholders of the command line
+        substitutions = { 'tempdir' : run_args['cwd'], 'file_in' : None }
+        if handler.need_file_in:
+            substitutions['file_in'] = data.get_filename()
+        else:
+            run_args['input'] = data.bytes()
+
+        cmdline = [arg.format(**substitutions) for arg in handler.cmdline]
+
+        subprocess.run(cmdline, stdout = outfile, **run_args)
+
+
+# Main script: read one input from standard input and write its text form
+# to standard output.
+
+# no options are defined; parsing merely provides --help and rejects
+# unexpected arguments
+parser = argparse.ArgumentParser()
+
+args = parser.parse_args()
+
+# declared MIME type and filename of the input, taken from the environment
+# (presumably set by notmuch when it runs the index.filter command - confirm
+# against the notmuch-config documentation); either may be missing
+mime_type = os.environ.get('NOTMUCH_FILTER_MIME_TYPE')
+filename = os.environ.get('NOTMUCH_FILTER_FILENAME')
+
+# output is written as bytes
+outfile = sys.stdout.buffer
+
+with closing(InputData(stream = sys.stdin.buffer,
+ filename = filename, mime_type = mime_type)) as data:
+ if data.mime_type in mimetypes_compressed | mimetypes_archive:
+ # archives and compressed files cannot be handled without libarchive
+ if not libarchive:
+ print('libarchive not available, cannot handle %s' %
data.mime_type,
+ file = sys.stderr)
+ sys.exit(1)
+
+ # mime types in mimetypes_compressed may be either an archive
+ # (compressed tar) or a compressed individual file;
+ # the following heuristic first tries the former, falling back on the
+ # latter on error
+ format_name = 'all'
+ if data.mime_type in mimetypes_compressed:
+ try:
+ with libarchive.memory_reader(data.bytes(), format_name =
format_name) as archive:
+ pass
+ except libarchive.ArchiveError:
+ format_name = 'raw'
+
+ with libarchive.memory_reader(data.bytes(), format_name = format_name)
as archive:
+ for entry in archive:
+ # only entries reported as files are processed
+ if not entry.isfile:
+ continue
+
+ # name written in front of the entry's text and used for guessing
+ # the entry's MIME type
+ name = None
+ if str(archive.format_name) == 'raw':
+ # for raw formats the filename is typically
+ # something like foo.pdf.gz
+ if filename:
+ name = os.path.splitext(filename)[0]
+ elif entry.name:
+ name = entry.name
+ if isinstance(name, bytes):
+ name = name.decode(errors = 'replace')
+
+ if name:
+ outfile.write(name.encode(errors = 'replace') + b'\n')
+ outfile.flush()
+
+ blocks = []
+ for b in entry.get_blocks():
+ # force data copy, as the buffer underlying b may be reused
+ blocks.append(b[:])
+
+ # the entry is filtered like a standalone input; no MIME type is
+ # passed, so InputData guesses it
+ with closing(InputData(bytes = b''.join(blocks),
+ filename = name)) as data_e:
+ handle_file(data_e, outfile)
+ outfile.write(b'\n')
+ outfile.flush()
+ else:
+ handle_file(data, outfile)
diff --git a/doc/man1/notmuch-config.rst b/doc/man1/notmuch-config.rst
index 935956fd..29a2fd1d 100644
--- a/doc/man1/notmuch-config.rst
+++ b/doc/man1/notmuch-config.rst
@@ -190,6 +190,13 @@ paths are presumed relative to `$HOME` for items in section
* NOTMUCH_FILTER_MESSAGE_ID - the message ID, without enclosing angle
brackets <>
+ A sample filtering script is shipped with notmuch documentation (typically in
+ ``/usr/share/doc/notmuch``) as ``examples/filter.py``. You may use it e.g. as follows:
+
+ * ``notmuch config set index.as_text text/html application/pdf``
+ * ``notmuch config set index.filter '<your sandbox>
+ <.../examples/filter.py>'``
+
History: This configuration value was introduced in notmuch 0.40.
.. nmconfig:: index.decrypt
--
2.47.3
_______________________________________________
notmuch mailing list -- [email protected]
To unsubscribe send an email to [email protected]