[ https://issues.apache.org/jira/browse/PDFBOX-5126?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17298045#comment-17298045 ]
Gábor Stefanik edited comment on PDFBOX-5126 at 3/9/21, 12:31 PM: ------------------------------------------------------------------ If it helps, this is the workaround we currently have in place for this issue: {code:java} import java.io.IOException; import java.text.Bidi; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.stream.Collectors; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.TextPosition; public class BiDirectionalPDFTextStripper extends PDFTextStripper { @Override protected void writeString(String string, List<TextPosition> pos) throws IOException { pos = processBidi(pos); super.writeString(positionsToString(pos), pos); } private static String positionsToString(List<TextPosition> pos) { return pos.stream().map(TextPosition::toString).collect(Collectors.joining()); } private List<TextPosition> processBidi(List<TextPosition> pos) { String word = positionsToString(pos); Bidi bidi = new Bidi(word, Bidi.DIRECTION_DEFAULT_LEFT_TO_RIGHT); Map<Integer, Integer> char2glyph = new HashMap<>(); int p = 0; for (int i = 0; i < pos.size(); i++) { char2glyph.put(p, i); p += pos.get(i).getUnicode().length(); } char2glyph.put(p, pos.size()); // if there is pure LTR text no need to process further if (!bidi.isMixed() && bidi.getBaseLevel() == Bidi.DIRECTION_LEFT_TO_RIGHT) { return pos; } // collect individual bidi information int runCount = bidi.getRunCount(); byte[] levels = new byte[runCount]; Integer[] runs = new Integer[runCount]; for (int i = 0; i < runCount; i++) { levels[i] = (byte) bidi.getRunLevel(i); runs[i] = i; } // reorder individual parts based on their levels Bidi.reorderVisually(levels, 0, runs, 0, runCount); List<TextPosition> newPos = new ArrayList<>(); for (int i = 0; i < runCount; i++) { int index = runs[i]; int start = bidi.getRunStart(index); int end = bidi.getRunLimit(index); int level = levels[index]; if ((level & 1) != 0) { while (--end >= start) { if 
(!char2glyph.containsKey(end)) continue; newPos.add(pos.get(char2glyph.get(end))); } } else { newPos.addAll(pos.subList(char2glyph.get(start), char2glyph.get(end))); } } return newPos; } } {code} I hope this is easier to adapt into an actual fix than working from scratch. was (Author: googulator): If it helps, this is the workaround we currently have in place for this issue: {code:java} import java.io.IOException; import java.text.Bidi; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.stream.Collectors; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.TextPosition; public class BiDirectionalPDFTextStripper extends PDFTextStripper { @Override protected void writeString(String string, List<TextPosition> pos) throws IOException { pos = processBidi(pos); super.writeString(positionsToString(pos), pos); } private static String positionsToString(List<TextPosition> pos) { return pos.stream().map(TextPosition::toString).collect(Collectors.joining()); } private List<TextPosition> processBidi(List<TextPosition> pos) { String word = positionsToString(pos); Bidi bidi = new Bidi(word, Bidi.DIRECTION_DEFAULT_LEFT_TO_RIGHT); Map<Integer, Integer> char2glyph = new HashMap<>(); int p = 0; for (int i = 0; i < pos.size(); i++) { char2glyph.put(p, i); p += pos.get(i).getUnicode().length(); } char2glyph.put(p, pos.size()); // if there is pure LTR text no need to process further if (!bidi.isMixed() && bidi.getBaseLevel() == Bidi.DIRECTION_LEFT_TO_RIGHT) { return pos; } // collect individual bidi information int runCount = bidi.getRunCount(); byte[] levels = new byte[runCount]; Integer[] runs = new Integer[runCount]; for (int i = 0; i < runCount; i++) { levels[i] = (byte) bidi.getRunLevel(i); runs[i] = i; } // reorder individual parts based on their levels Bidi.reorderVisually(levels, 0, runs, 0, runCount); List<TextPosition> newPos = new ArrayList<>(); for (int i = 0; i < runCount; i++) { int 
index = runs[i]; int start = bidi.getRunStart(index); int end = bidi.getRunLimit(index); int level = levels[index]; if ((level & 1) != 0) { while (--end >= start) { if (!char2glyph.containsKey(end)) continue; newPos.add(pos.get(char2glyph.get(end))); } } else { newPos.addAll(pos.subList(char2glyph.get(start), char2glyph.get(end))); } } return newPos; } } {code} I hope this is easier to adapt into an actual fix than working from scratch. > Complex Unicode glyphs (surrogate pairs, combining diacritics, zero-width > joiner, etc.) in an RTL context get reversed incorrectly on text extraction > -------------------------------------------------------------------------------------------------------------------------------------------------- > > Key: PDFBOX-5126 > URL: https://issues.apache.org/jira/browse/PDFBOX-5126 > Project: PDFBox > Issue Type: Bug > Components: Text extraction > Affects Versions: 2.0.22 > Reporter: Gábor Stefanik > Priority: Major > Attachments: rovasvegyes.pdf > > > The attached PDF contains Old Hungarian runic script, which is both > right-to-left and outside Unicode's Basic Multilingual Plane (and thus > encoded as surrogate pairs in Java's internal UTF-16-like representation). > When this text is extracted, the surrogate pairs are reversed due to an > overly naive use of "char"-level reversal, leading to malformed Unicode > output. > Likewise, when combining diacritics/modifiers occur in a right-to-left > context, their position relative to the "parent" character is reversed, and > so they appear on the wrong glyph, as demonstrated by the Hebrew sample in > the same PDF. I imagine the same thing would also happen to emoji using the > "zero-width joiner" in an RTL context. -- This message was sent by Atlassian Jira (v8.3.4#803005) --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@pdfbox.apache.org For additional commands, e-mail: dev-h...@pdfbox.apache.org