This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_3x by this push:
new bfb0d9d42 Revert "TIKA-4439 -- this is actually a slight regression. :("
bfb0d9d42 is described below
commit bfb0d9d42359c9a7bf4de7787be8102d78e1c9d3
Author: tallison <[email protected]>
AuthorDate: Thu Jun 19 09:32:30 2025 -0400
Revert "TIKA-4439 -- this is actually a slight regression. :("
This reverts commit a2b5ca8f111f66f7f9d13c661eec9f41d8fb52d2.
---
.../apache/tika/parser/microsoft/EMFParser.java | 91 ++++++----------------
1 file changed, 22 insertions(+), 69 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java
index 72754382f..7317b6fba 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java
@@ -27,7 +27,6 @@ import java.util.Set;
import java.util.function.Supplier;
import org.apache.poi.hemf.record.emf.HemfComment;
-import org.apache.poi.hemf.record.emf.HemfMisc;
import org.apache.poi.hemf.record.emf.HemfRecord;
import org.apache.poi.hemf.record.emf.HemfRecordType;
import org.apache.poi.hemf.record.emf.HemfText;
@@ -113,24 +112,16 @@ public class EMFParser implements Parser {
//NOTE that we're just scraping the text out in storage order. The
proper way to do this
//is to sort the text records by x,y like we do for PDFs and xps
-
- HemfMisc.EmfModifyWorldTransform lastModifyWorldTransform = null;
for (HemfRecord record : ex) {
parseState.isIconOnly = false;
if (record.getEmfRecordType() == HemfRecordType.comment) {
handleCommentData(
((HemfComment.EmfComment)
record).getCommentData(), parseState, xhtml, context);
} else if
(record.getEmfRecordType().equals(HemfRecordType.extTextOutW)) {
- handleExtTextOut((HemfText.EmfExtTextOutW) record,
lastModifyWorldTransform,
- parseState, buffer, xhtml, fudgeFactorX,
StandardCharsets.UTF_16LE);
- lastModifyWorldTransform = null;
+ handleExtTextOut((HemfText.EmfExtTextOutW) record,
parseState, buffer, xhtml, fudgeFactorX, StandardCharsets.UTF_16LE);
} else if
(record.getEmfRecordType().equals(HemfRecordType.extTextOutA)) {
//do something better than assigning utf8.
- handleExtTextOut((HemfText.EmfExtTextOutA) record,
lastModifyWorldTransform,
- parseState, buffer, xhtml, fudgeFactorX,
StandardCharsets.UTF_8);
- lastModifyWorldTransform = null;
- } else if
(record.getEmfRecordType().equals(HemfRecordType.modifyWorldTransform)) {
- lastModifyWorldTransform =
(HemfMisc.EmfModifyWorldTransform) record;
+ handleExtTextOut((HemfText.EmfExtTextOutA) record,
parseState, buffer, xhtml, fudgeFactorX, StandardCharsets.UTF_8);
}
if (parseState.isIconOnly) {
@@ -158,55 +149,45 @@ public class EMFParser implements Parser {
xhtml.endDocument();
}
- private void handleExtTextOut(HemfText.EmfExtTextOutA textRecord,
HemfMisc.EmfModifyWorldTransform lastModifyWorldTransform,
- ParseState parseState,
+ private void handleExtTextOut(HemfText.EmfExtTextOutA record, ParseState
parseState,
StringBuilder buffer, XHTMLContentHandler
xhtml, double fudgeFactorX,
Charset charset) throws IOException,
SAXException {
- Rectangle2D currRectangle = getCurrentRectangle(textRecord,
lastModifyWorldTransform);
- double yFudge = getYFudge(parseState.lastRectangle, currRectangle);
- //if the currRectangle is vaguely reasonable, do the math
- if (gteZero(currRectangle) && notZero(currRectangle, 0.00001)) {
- if (parseState.lastRectangle.getY() > -1 &&
deltaGreaterThan(parseState.lastRectangle.getMinY(), currRectangle.getMinY(),
yFudge)) {
- xhtml.startElement("p");
- xhtml.characters(buffer.toString());
- xhtml.endElement("p");
- buffer.setLength(0);
- } else if (parseState.lastRectangle.getX() > -1 &&
deltaGreaterThan(currRectangle.getMinX(), parseState.lastRectangle.getMaxX(),
fudgeFactorX)) {
- buffer.append(" ");
- }
- } else {
- //currRectangle was not vaguely reasonable, interpolate a space
and hope for the best
+ Rectangle2D currRectangle = getCurrentRectangle(record);
+ if (parseState.lastRectangle.getY() > -1 &&
+ deltaGreaterThan(parseState.lastRectangle.getMinY(),
currRectangle.getMinY(), 0.0001)) {
+ xhtml.startElement("p");
+ xhtml.characters(buffer.toString());
+ xhtml.endElement("p");
+ buffer.setLength(0);
+ } else if (parseState.lastRectangle.getX() > -1 &&
+ deltaGreaterThan(currRectangle.getMinX(),
+ parseState.lastRectangle.getMaxX(), fudgeFactorX)) {
buffer.append(" ");
}
//do something better than this
- String txt = textRecord.getText(charset);
+ String txt = record.getText(charset);
buffer.append(txt);
parseState.lastRectangle = currRectangle;
}
- private double getYFudge(Rectangle2D lastRectangle, Rectangle2D
currRectangle) {
- if (lastRectangle.getHeight() >= 1 && currRectangle.getHeight() >=
1.0) {
- return 0.1 * Math.max(lastRectangle.getHeight(),
currRectangle.getHeight());
- }
- return 0.1;
- }
-
private boolean deltaGreaterThan(double a, double b, double delta) {
return (Math.abs(a - b) > delta);
}
- private Rectangle2D getCurrentRectangle(HemfText.EmfExtTextOutA
extTextOutA,
- HemfMisc.EmfModifyWorldTransform
lastModifyWorldTransform) {
+ private Rectangle2D getCurrentRectangle(HemfText.EmfExtTextOutA
extTextOutA) {
//This gets the current rectangle out of the emfextTextOutA record.
//via TIKA-4432, if the rectangle is 0,0,0,0 then back-off to the
bounds ignored, if those exist
+ //TODO: maybe use modifyWorldTransform and calculate font width etc...
Rectangle2D bounds = extTextOutA.getBounds();
- double smidge = 0.00001;
- if (notZero(bounds, smidge) && gteZero(bounds)) {
+ double smidge = 0.000000001;
+ if (deltaGreaterThan(bounds.getX(), 0.0d, smidge) ||
+ deltaGreaterThan(bounds.getY(), 0.0d, smidge) ||
+ deltaGreaterThan(bounds.getWidth(), 0.0d, smidge) ||
+ deltaGreaterThan(bounds.getHeight(), 0.0d, smidge)) {
return bounds;
}
- //if that didn't work, fall back to boundsIgnored
Supplier<?> boundsIgnored =
extTextOutA.getGenericProperties().get("boundsIgnored");
if (boundsIgnored == null) {
return bounds;
@@ -218,35 +199,7 @@ public class EMFParser implements Parser {
if (! (maybeBounds instanceof Rectangle2D)) {
return bounds;
}
- Rectangle2D ret = (Rectangle2D) maybeBounds;
- if (notZero(ret, smidge) && gteZero(ret)) {
- return ret;
- }
- //if that didn't work fall back to the lastModifyWorldTransform if it
is not null
- if (lastModifyWorldTransform == null) {
- return bounds;
- }
-
- if (lastModifyWorldTransform.getXForm().getTranslateX() > 0.0 &&
- lastModifyWorldTransform.getXForm().getTranslateY() > 0.0) {
- return new
Rectangle2D.Double(lastModifyWorldTransform.getXForm().getTranslateX(),
- lastModifyWorldTransform.getXForm().getTranslateY(), 10,
10);
- }
- return bounds;
- }
-
- private boolean notZero(Rectangle2D bounds, double smidge) {
- //require that at least one value is > 0
- return deltaGreaterThan(bounds.getMinX(), 0.0d, smidge)
- || deltaGreaterThan(bounds.getMaxX(), 0.0, smidge)
- || deltaGreaterThan(bounds.getMinY(), 0.0, smidge)
- || deltaGreaterThan(bounds.getMaxY(), 0.0, smidge);
- }
-
- private boolean gteZero(Rectangle2D bounds) {
- //require that there be no negative coordinates
- return bounds.getMinX() >= 0.0 && bounds.getMaxX() >= 0.0 &&
- bounds.getMinY() >= 0.0 && bounds.getMaxY() >= 0.0;
+ return (Rectangle2D) maybeBounds;
}
private void handleCommentData(