84: Implement language customization for OCR

ELPA Syncer Mon, 26 Sep 2022 11:11:53 -0700

branch: externals/doc-toc
commit ae455b486327205e5e6df786d42af188bddd8fa5
Author: Daniel Nicolai <dalanico...@gmail.com>
Commit: Daniel Nicolai <dalanico...@gmail.com>


    Implement language customization for OCR
---
 README.org  |  3 ++-
 toc-mode.el | 37 ++++++++++++++++++++++++++++++-------
 2 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/README.org b/README.org
index 8e268b752e..86a38c2d93 100644
--- a/README.org
+++ b/README.org
@@ -45,7 +45,8 @@ toc-extract-pages-ocr= if doc has no text layer or text layer 
is bad, and answer
 the subsequent prompts by entering the pagenumbers for the first and the last
 page each followed by =RET=. *For PDF extraction with OCR, currently it is 
required*
 *to view all contents pages once before extraction* (toc-mode uses the cached 
file
-data).
+data). Also the languages used for tesseract OCR can be customized via the
+`toc-ocr-languages' variable.
 
 [[toc-mode-extract.gif]]
 
diff --git a/toc-mode.el b/toc-mode.el
index 3ce807deb5..d2dbfa279a 100644
--- a/toc-mode.el
+++ b/toc-mode.el
@@ -48,11 +48,12 @@
 ;; text layer is bad, and answer the subsequent prompts by entering the
 ;; pagenumbers for the first and the last page each followed by RET. For PDF
 ;; extraction with OCR, currently it is required to view all contents pages 
once
-;; before extraction (toc-mode uses the cached file data). A buffer with the,
-;; somewhat cleaned up, extracted text will open in TOC-cleanup mode. Prefix
-;; command with the universal argument (C-u) to omit clean and get the raw 
text.
-;; 2. TOC-Cleanup In this mode you can further cleanup the contents to create a
-;; list where each line has the structure:
+;; before extraction (toc-mode uses the cached file data). Also the languages
+;; used for tesseract OCR can be customized via the `toc-ocr-languages'
+;; variable. A buffer with the, somewhat cleaned up, extracted text will open 
in
+;; TOC-cleanup mode. Prefix command with the universal argument (C-u) to omit
+;; clean and get the raw text. 2. TOC-Cleanup In this mode you can further
+;; cleanup the contents to create a list where each line has the structure:
 
 ;; TITLE (SOME) PAGENUMBER
 
@@ -167,6 +168,14 @@ For DJVU the old DJVU file is replaced by default"
   :type 'file
   :group 'toc)
 
+(defcustom toc-ocr-languages nil
+  "Languages used for extraction with ocr.
+Should be one or multiple language codes as recognized
+by tesseract -l flag, e.g. eng or eng+nld. Use
+\\[execute-extended-command] `toc-list-languages' to list the
+available languages."
+  :type 'string
+  :group 'toc)
 ;;;; toc-extract and cleanup
 
 ;;; toc-cleanup
@@ -311,6 +320,15 @@ text."
             (toc--cleanup startpage)))
       (message "Buffer not in pdf-view- or djvu-read-mode"))))
 
+(defun toc-list-languages ()
+  "List languages available for ocr.
+For use in `toc-ocr-languages'."
+  (interactive)
+  (let ((print-length nil))
+    (message (format "%s" (seq-subseq
+                         (split-string
+                          (shell-command-to-string "tesseract --list-langs"))
+                         5)))))
 
 ;;;###autoload
 (defun toc-extract-pages-ocr (arg)
@@ -327,7 +345,10 @@ unprocessed text."
                          (read-string "Enter end-pagenumber for extraction: 
")))
                (source-buffer (current-buffer))
                (ext (url-file-extension (buffer-file-name (current-buffer))))
-               (buffer (file-name-sans-extension (buffer-name))))
+               (buffer (file-name-sans-extension (buffer-name)))
+               (args (list "stdout" "--psm" "6")))
+          (when toc-ocr-languages
+            (setq args (append args (list "-l" toc-ocr-languages))))
           (while (<= page (+ endpage))
             (let ((file (cond ((string= ".pdf" ext)
                                (make-temp-file "pageimage"
@@ -340,7 +361,9 @@ unprocessed text."
                                                nil
                                                (number-to-string page)
                                                (image-property djvu-doc-image 
:data))))))
-              (call-process "tesseract" nil (list buffer nil) nil file 
"stdout" "--psm" "6")
+              (apply 'call-process
+                     (append (list "tesseract" nil (list buffer nil) nil file)
+                             args))
               (setq page (1+ page))))
           (switch-to-buffer buffer)
           (toc-cleanup-mode) ;; required before setting local variable

[elpa] externals/doc-toc ae455b4863 52/84: Implement language customization for OCR

Reply via email to