Hi Alex, and the mupdf maintainers,

The fact that pymupdf is not able to use OCR seems to be related to the fact that mupdf is compiled without support for it, and so I'm reassigning this issue to src:mupdf.

I'm attaching a patch to enable OCR using Tesseract and Leptonica.
This would enable its use via pymupdf, amongst other things.

It would mean all mupdf users would fetch tesseract and leptonica, but the data files aren't actually required.

Would you be keen to add these dependencies in order to support OCR? It would enable a set of use cases, especially automatically doing OCR and adding a layer of invisible text on top of PDFs built by mupdf.

All the best,
Alexis

PS: This is my first time with the BTS, and so I hope I did things right :-)

------ BEGIN PATCH ------

diff --git a/debian/control b/debian/control
index dff28c5..6c30d6c 100644
--- a/debian/control
+++ b/debian/control
@@ -19,9 +19,11 @@ Build-Depends: afdko-bin:native,
                libjbig2dec0-dev (>= 0.20),
                libjpeg-dev,
                liblcms2-dev,
+               libleptonica-dev,
                libmujs-dev (>= 1.3.8),
                libopenjp2-7-dev,
                libssl-dev,
+               libtesseract-dev,
                libx11-dev,
                libxext-dev,
                libxrandr-dev,
diff --git a/debian/rules b/debian/rules
index a496f69..dfc9c0d 100755
--- a/debian/rules
+++ b/debian/rules
@@ -43,6 +43,10 @@ BUILD_FLAGS += USE_SYSTEM_MUJS=yes
 BUILD_FLAGS += USE_SYSTEM_LIBS=yes
 # Force using system lcms2 (was not included in default system library set)
 BUILD_FLAGS += USE_SYSTEM_LCMS2=yes
+# Enable Tesseract OCR support, linking against the system tesseract and
+# leptonica libraries. This appends "-tesseract" to the build suffix,
+# so keep LIB_DIR below in sync.
+BUILD_FLAGS += tesseract=yes USE_SYSTEM_TESSERACT=yes USE_SYSTEM_LEPTONICA=yes
 BUILD_FLAGS += LD=$(DEB_HOST_GNU_TYPE)-ld

 ifeq (,$(filter terse,$(DEB_BUILD_OPTIONS)))
@@ -73,7 +77,7 @@ override_dh_auto_install:
        install -m744 -T $(CURDIR)/debian/mupdf.sh $(CURDIR)/debian/tmp/usr/bin/mupdf
        sh debian/install_icons.sh
 ifeq (,$(filter nocheck,$(DEB_BUILD_OPTIONS)))
- $(MAKE) -C $(CURDIR)/debian/tests test LIB_DIR=$(CURDIR)/build/shared-debug-Py_LIMITED_API_0x030d0000
+ $(MAKE) -C $(CURDIR)/debian/tests test LIB_DIR=$(CURDIR)/build/shared-debug-Py_LIMITED_API_0x030d0000-tesseract
 endif

 override_dh_gencontrol:

------ END PATCH ------


-- System Information:
Versions of packages mupdf depends on:
ii  libc6            2.36-9+deb12u14
ii  libfreetype6     2.12.1+dfsg-5+deb12u4
ii  libgl1           1.6.0-1
ii  libglut3.12      3.4.0-1
ii  libgumbo1        0.10.1+dfsg-5
ii  libharfbuzz0b    6.0.0+dfsg-3
ii  libjbig2dec0     0.19-3
ii  libjpeg62-turbo  1:2.1.5-2
ii  libmujs2         1.3.2-1
ii  libopenjp2-7     2.5.0-2+deb12u3
ii  libssl3          3.0.20-1~deb12u2
ii  libx11-6         2:1.8.4-2+deb12u2
ii  libxext6         2:1.3.4-1+b1
ii  zlib1g           1:1.2.13.dfsg-1

mupdf recommends no packages.

Versions of packages mupdf suggests:
pn  mupdf-tools  <none>

-- no debconf information

Reply via email to