This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_3x by this push:
     new a2b5ca8f1 TIKA-4439 -- this is actually a slight regression. :(
     new 66824b3fc Merge remote-tracking branch 'origin/branch_3x' into branch_3x
a2b5ca8f1 is described below

commit a2b5ca8f111f66f7f9d13c661eec9f41d8fb52d2
Author: tallison <[email protected]>
AuthorDate: Thu Jun 19 09:28:06 2025 -0400

    TIKA-4439 -- this is actually a slight regression. :(
---
 .../apache/tika/parser/microsoft/EMFParser.java    | 91 ++++++++++++++++------
 1 file changed, 69 insertions(+), 22 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java
index 7317b6fba..72754382f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java
@@ -27,6 +27,7 @@ import java.util.Set;
 import java.util.function.Supplier;
 
 import org.apache.poi.hemf.record.emf.HemfComment;
+import org.apache.poi.hemf.record.emf.HemfMisc;
 import org.apache.poi.hemf.record.emf.HemfRecord;
 import org.apache.poi.hemf.record.emf.HemfRecordType;
 import org.apache.poi.hemf.record.emf.HemfText;
@@ -112,16 +113,24 @@ public class EMFParser implements Parser {
 
             //NOTE that we're just scraping the text out in storage order. The 
proper way to do this
             //is to sort the text records by x,y like we do for PDFs and xps
+
+            HemfMisc.EmfModifyWorldTransform lastModifyWorldTransform = null;
             for (HemfRecord record : ex) {
                 parseState.isIconOnly = false;
                 if (record.getEmfRecordType() == HemfRecordType.comment) {
                     handleCommentData(
                             ((HemfComment.EmfComment) 
record).getCommentData(), parseState, xhtml, context);
                 } else if 
(record.getEmfRecordType().equals(HemfRecordType.extTextOutW)) {
-                    handleExtTextOut((HemfText.EmfExtTextOutW) record, 
parseState, buffer, xhtml, fudgeFactorX, StandardCharsets.UTF_16LE);
+                    handleExtTextOut((HemfText.EmfExtTextOutW) record, 
lastModifyWorldTransform,
+                            parseState, buffer, xhtml, fudgeFactorX, 
StandardCharsets.UTF_16LE);
+                    lastModifyWorldTransform = null;
                 } else if 
(record.getEmfRecordType().equals(HemfRecordType.extTextOutA)) {
                     //do something better than assigning utf8.
-                    handleExtTextOut((HemfText.EmfExtTextOutA) record, 
parseState, buffer, xhtml, fudgeFactorX, StandardCharsets.UTF_8);
+                    handleExtTextOut((HemfText.EmfExtTextOutA) record, 
lastModifyWorldTransform,
+                            parseState, buffer, xhtml, fudgeFactorX, 
StandardCharsets.UTF_8);
+                    lastModifyWorldTransform = null;
+                } else if 
(record.getEmfRecordType().equals(HemfRecordType.modifyWorldTransform)) {
+                    lastModifyWorldTransform = 
(HemfMisc.EmfModifyWorldTransform) record;
                 }
 
                 if (parseState.isIconOnly) {
@@ -149,45 +158,55 @@ public class EMFParser implements Parser {
         xhtml.endDocument();
     }
 
-    private void handleExtTextOut(HemfText.EmfExtTextOutA record, ParseState 
parseState,
+    private void handleExtTextOut(HemfText.EmfExtTextOutA textRecord, 
HemfMisc.EmfModifyWorldTransform lastModifyWorldTransform,
+                                  ParseState parseState,
                                   StringBuilder buffer, XHTMLContentHandler 
xhtml, double fudgeFactorX,
                                   Charset charset) throws IOException, 
SAXException {
-        Rectangle2D currRectangle = getCurrentRectangle(record);
-        if (parseState.lastRectangle.getY() > -1 &&
-                deltaGreaterThan(parseState.lastRectangle.getMinY(), 
currRectangle.getMinY(), 0.0001)) {
-            xhtml.startElement("p");
-            xhtml.characters(buffer.toString());
-            xhtml.endElement("p");
-            buffer.setLength(0);
-        } else if (parseState.lastRectangle.getX() > -1 &&
-                deltaGreaterThan(currRectangle.getMinX(),
-                        parseState.lastRectangle.getMaxX(), fudgeFactorX)) {
+        Rectangle2D currRectangle = getCurrentRectangle(textRecord, 
lastModifyWorldTransform);
+        double yFudge = getYFudge(parseState.lastRectangle, currRectangle);
+        //if the currRectangle is vaguely reasonable, do the math
+        if (gteZero(currRectangle) && notZero(currRectangle, 0.00001)) {
+            if (parseState.lastRectangle.getY() > -1 && 
deltaGreaterThan(parseState.lastRectangle.getMinY(), currRectangle.getMinY(), 
yFudge)) {
+                xhtml.startElement("p");
+                xhtml.characters(buffer.toString());
+                xhtml.endElement("p");
+                buffer.setLength(0);
+            } else if (parseState.lastRectangle.getX() > -1 && 
deltaGreaterThan(currRectangle.getMinX(), parseState.lastRectangle.getMaxX(), 
fudgeFactorX)) {
+                buffer.append(" ");
+            }
+        } else {
+            //currRectangle was not vaguely reasonable, interpolate a space 
and hope for the best
             buffer.append(" ");
         }
         //do something better than this
-        String txt = record.getText(charset);
+        String txt = textRecord.getText(charset);
         buffer.append(txt);
         parseState.lastRectangle = currRectangle;
 
     }
 
+    private double getYFudge(Rectangle2D lastRectangle, Rectangle2D 
currRectangle) {
+        if (lastRectangle.getHeight() >= 1 && currRectangle.getHeight() >= 
1.0) {
+            return 0.1 * Math.max(lastRectangle.getHeight(), 
currRectangle.getHeight());
+        }
+        return 0.1;
+    }
+
     private boolean deltaGreaterThan(double a, double b, double delta) {
         return (Math.abs(a - b) > delta);
     }
 
-    private Rectangle2D getCurrentRectangle(HemfText.EmfExtTextOutA 
extTextOutA) {
+    private Rectangle2D getCurrentRectangle(HemfText.EmfExtTextOutA 
extTextOutA,
+                                            HemfMisc.EmfModifyWorldTransform 
lastModifyWorldTransform) {
         //This gets the current rectangle out of the emfextTextOutA record.
         //via TIKA-4432, if the rectangle is 0,0,0,0 then back-off to the 
bounds ignored, if those exist
 
-        //TODO: maybe use modifyWorldTransform and calculate font width etc...
         Rectangle2D bounds = extTextOutA.getBounds();
-        double smidge = 0.000000001;
-        if (deltaGreaterThan(bounds.getX(), 0.0d, smidge) ||
-                deltaGreaterThan(bounds.getY(), 0.0d, smidge) ||
-                deltaGreaterThan(bounds.getWidth(), 0.0d, smidge) ||
-                deltaGreaterThan(bounds.getHeight(), 0.0d, smidge)) {
+        double smidge = 0.00001;
+        if (notZero(bounds, smidge) && gteZero(bounds)) {
             return bounds;
         }
+        //if that didn't work, fall back to boundsIgnored
         Supplier<?> boundsIgnored = 
extTextOutA.getGenericProperties().get("boundsIgnored");
         if (boundsIgnored == null) {
             return bounds;
@@ -199,7 +218,35 @@ public class EMFParser implements Parser {
         if (! (maybeBounds instanceof Rectangle2D)) {
             return bounds;
         }
-        return (Rectangle2D) maybeBounds;
+        Rectangle2D ret = (Rectangle2D) maybeBounds;
+        if (notZero(ret, smidge) && gteZero(ret)) {
+            return ret;
+        }
+        //if that didn't work fall back to the lastModifyWorldTransform if it 
is not null
+        if (lastModifyWorldTransform == null) {
+            return bounds;
+        }
+
+        if (lastModifyWorldTransform.getXForm().getTranslateX() > 0.0 &&
+                lastModifyWorldTransform.getXForm().getTranslateY() > 0.0) {
+            return new 
Rectangle2D.Double(lastModifyWorldTransform.getXForm().getTranslateX(),
+                    lastModifyWorldTransform.getXForm().getTranslateY(), 10, 
10);
+        }
+        return bounds;
+    }
+
+    private boolean notZero(Rectangle2D bounds, double smidge) {
+        //require that at least one value is > 0
+        return deltaGreaterThan(bounds.getMinX(), 0.0d, smidge)
+                || deltaGreaterThan(bounds.getMaxX(), 0.0, smidge)
+                || deltaGreaterThan(bounds.getMinY(), 0.0, smidge)
+                || deltaGreaterThan(bounds.getMaxY(), 0.0, smidge);
+    }
+
+    private boolean gteZero(Rectangle2D bounds) {
+        //require that there be no negative coordinates
+        return bounds.getMinX() >= 0.0 && bounds.getMaxX() >= 0.0 &&
+                bounds.getMinY() >= 0.0 && bounds.getMaxY() >= 0.0;
     }
 
     private void handleCommentData(

Reply via email to