Author: tilman
Date: Sat Mar 8 09:29:12 2025
New Revision: 1924227
URL: http://svn.apache.org/viewvc?rev=1924227&view=rev
Log:
PDFBOX-5230: make zero-width non-joiner characters invisible, by Daniel Gredler; closes #203
Modified:
pdfbox/branches/2.0/fontbox/src/main/java/org/apache/fontbox/ttf/GlyfSimpleDescript.java
pdfbox/branches/2.0/fontbox/src/main/java/org/apache/fontbox/ttf/TTFSubsetter.java
pdfbox/branches/2.0/fontbox/src/test/java/org/apache/fontbox/ttf/TTFSubsetterTest.java
pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/TrueTypeEmbedder.java
pdfbox/branches/2.0/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java
Modified:
pdfbox/branches/2.0/fontbox/src/main/java/org/apache/fontbox/ttf/GlyfSimpleDescript.java
URL:
http://svn.apache.org/viewvc/pdfbox/branches/2.0/fontbox/src/main/java/org/apache/fontbox/ttf/GlyfSimpleDescript.java?rev=1924227&r1=1924226&r2=1924227&view=diff
==============================================================================
---
pdfbox/branches/2.0/fontbox/src/main/java/org/apache/fontbox/ttf/GlyfSimpleDescript.java
(original)
+++
pdfbox/branches/2.0/fontbox/src/main/java/org/apache/fontbox/ttf/GlyfSimpleDescript.java
Sat Mar 8 09:29:12 2025
@@ -62,7 +62,7 @@ public class GlyfSimpleDescript extends
super(numberOfContours, bais);
/*
- * https://developer.apple.com/fonts/TTRefMan/RM06/Chap6glyf.html
+ * https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6glyf.html
* "If a glyph has zero contours, it need not have any glyph data." set the pointCount to zero to initialize
* attributes and avoid nullpointer but maybe there shouldn't have GlyphDescript in the GlyphData?
*/
Modified:
pdfbox/branches/2.0/fontbox/src/main/java/org/apache/fontbox/ttf/TTFSubsetter.java
URL:
http://svn.apache.org/viewvc/pdfbox/branches/2.0/fontbox/src/main/java/org/apache/fontbox/ttf/TTFSubsetter.java?rev=1924227&r1=1924226&r2=1924227&view=diff
==============================================================================
---
pdfbox/branches/2.0/fontbox/src/main/java/org/apache/fontbox/ttf/TTFSubsetter.java
(original)
+++
pdfbox/branches/2.0/fontbox/src/main/java/org/apache/fontbox/ttf/TTFSubsetter.java
Sat Mar 8 09:29:12 2025
@@ -25,6 +25,7 @@ import java.io.OutputStream;
import java.nio.charset.Charset;
import java.util.Calendar;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
@@ -51,7 +52,7 @@ public final class TTFSubsetter
{
private static final Log LOG = LogFactory.getLog(TTFSubsetter.class);
- private static final byte[] PAD_BUF = new byte[] { 0, 0, 0 };
+ private static final byte[] PAD_BUF = new byte[] { 0, 0, 0, 0 };
private static final TimeZone TIMEZONE_UTC = TimeZone.getTimeZone("UTC");
// clone before using
@@ -61,6 +62,7 @@ public final class TTFSubsetter
private final List<String> keepTables;
private final SortedSet<Integer> glyphIds; // new glyph ids
+ private final Set<Integer> invisibleGlyphIds;
private String prefix;
private boolean hasAddedCompoundReferences;
@@ -87,6 +89,7 @@ public final class TTFSubsetter
uniToGID = new TreeMap<Integer, Integer>();
glyphIds = new TreeSet<Integer>();
+ invisibleGlyphIds = new HashSet<Integer>();
// find the best Unicode cmap
this.unicodeCmap = ttf.getUnicodeCmapLookup();
@@ -132,6 +135,23 @@ public final class TTFSubsetter
}
/**
+ * Forces the glyph for the specified character code to be zero-width and contour-free,
+ * regardless of what the glyph looks like in the original font. Note that the specified
+ * character code is not added to the subset unless it is also {@link #add(int) added}
+ * separately.
+ *
+ * @param unicode the character code whose glyph should be invisible
+ */
+ public void forceInvisible(int unicode)
+ {
+ int gid = unicodeCmap.getGlyphId(unicode);
+ if (gid != 0)
+ {
+ invisibleGlyphIds.add(gid);
+ }
+ }
+
+ /**
* Returns the map of new -> old GIDs.
*/
public Map<Integer, Integer> getGIDMap() throws IOException
@@ -587,6 +607,13 @@ public final class TTFSubsetter
newOffsets[newGid++] = newOffset;
is.skip(offset - prevEnd);
+ // glyphs with no outlines have an empty entry in the 'glyf' table, with a
+ // corresponding 'loca' table entry with length = 0
+ if (invisibleGlyphIds.contains(gid))
+ {
+ continue;
+ }
+
byte[] buf = new byte[(int)length];
is.read(buf);
@@ -893,9 +920,18 @@ public final class TTFSubsetter
long offset;
if (glyphId <= lastgid)
{
- // copy width and lsb
- offset = glyphId * 4l;
- lastOffset = copyBytes(is, bos, offset, lastOffset, 4);
+ if (invisibleGlyphIds.contains(glyphId))
+ {
+ // force zero width (no change to last offset)
+ // 4 bytes total, 2 bytes each for: advance width = 0, left side bearing = 0
+ bos.write(PAD_BUF, 0, 4);
+ }
+ else
+ {
+ // copy width and lsb
+ offset = glyphId * 4l;
+ lastOffset = copyBytes(is, bos, offset, lastOffset, 4);
+ }
}
else
{
Modified:
pdfbox/branches/2.0/fontbox/src/test/java/org/apache/fontbox/ttf/TTFSubsetterTest.java
URL:
http://svn.apache.org/viewvc/pdfbox/branches/2.0/fontbox/src/test/java/org/apache/fontbox/ttf/TTFSubsetterTest.java?rev=1924227&r1=1924226&r2=1924227&view=diff
==============================================================================
---
pdfbox/branches/2.0/fontbox/src/test/java/org/apache/fontbox/ttf/TTFSubsetterTest.java
(original)
+++
pdfbox/branches/2.0/fontbox/src/test/java/org/apache/fontbox/ttf/TTFSubsetterTest.java
Sat Mar 8 09:29:12 2025
@@ -27,6 +27,7 @@ import java.util.Map.Entry;
import org.apache.fontbox.util.autodetect.FontFileFinder;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
@@ -274,4 +275,76 @@ public class TTFSubsetterTest
subsetter.writeToStream(output);
ttf.close();
}
+
+ /**
+ * Test of PDFBOX-5230: check that subsetting can be forced to use invisible glyphs.
+ *
+ * @throws java.io.IOException
+ */
+ @Test
+ public void testPDFBox5230() throws IOException
+ {
+ final File testFile = new File("src/test/resources/ttf/LiberationSans-Regular.ttf");
+ TrueTypeFont ttf = new TTFParser().parse(testFile);
+ TTFSubsetter ttfSubsetter = new TTFSubsetter(ttf);
+ ttfSubsetter.add('A');
+ ttfSubsetter.add('B');
+ ttfSubsetter.add('\u200C');
+
+ // verify results without forcing
+
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ ttfSubsetter.writeToStream(baos);
+ TrueTypeFont subset = new TTFParser(true)
+ .parse(new ByteArrayInputStream(baos.toByteArray()));
+ assertEquals(4, subset.getNumberOfGlyphs());
+ assertEquals(0, subset.nameToGID(".notdef"));
+ assertEquals(1, subset.nameToGID("A"));
+ assertEquals(2, subset.nameToGID("B"));
+ assertEquals(3, subset.nameToGID("uni200C"));
+
+ PostScriptTable pst = subset.getPostScript();
+ assertEquals(".notdef", pst.getName(0));
+ assertEquals("A", pst.getName(1));
+ assertEquals("B", pst.getName(2));
+ assertEquals("uni200C", pst.getName(3));
+
+ assertFalse("A path should not be empty", subset.getPath("A").getBounds2D().isEmpty());
+ assertFalse("B path should not be empty", subset.getPath("B").getBounds2D().isEmpty());
+ assertFalse("ZWNJ path should not be empty", subset.getPath("uni200C").getBounds2D().isEmpty());
+ assertNotEquals("A width should not be zero.", 0, subset.getWidth("A"));
+ assertNotEquals("B width should not be zero.", 0, subset.getWidth("B"));
+ assertEquals("ZWNJ width should be zero", 0, subset.getWidth("uni200C"), 0);
+
+ subset.close();
+
+ // verify results while forcing B and ZWNJ to use invisible glyphs
+
+ ttfSubsetter.forceInvisible('B');
+ ttfSubsetter.forceInvisible('\u200C');
+ ByteArrayOutputStream baos2 = new ByteArrayOutputStream();
+ ttfSubsetter.writeToStream(baos2);
+ subset = new TTFParser(true)
+ .parse(new ByteArrayInputStream(baos2.toByteArray()));
+ assertEquals(4, subset.getNumberOfGlyphs());
+ assertEquals(0, subset.nameToGID(".notdef"));
+ assertEquals(1, subset.nameToGID("A"));
+ assertEquals(2, subset.nameToGID("B"));
+ assertEquals(3, subset.nameToGID("uni200C"));
+
+ pst = subset.getPostScript();
+ assertEquals(".notdef", pst.getName(0));
+ assertEquals("A", pst.getName(1));
+ assertEquals("B", pst.getName(2));
+ assertEquals("uni200C", pst.getName(3));
+
+ assertFalse("A path should not be empty", subset.getPath("A").getBounds2D().isEmpty());
+ assertTrue("B path should be empty", subset.getPath("B").getBounds2D().isEmpty());
+ assertTrue("ZWNJ path should be empty", subset.getPath("uni200C").getBounds2D().isEmpty());
+ assertNotEquals("A width should not be zero.", 0, subset.getWidth("A"));
+ assertEquals("B width should be zero.", 0, subset.getWidth("B"), 0);
+ assertEquals("ZWNJ width should be zero", 0d, subset.getWidth("uni200C"), 0);
+
+ subset.close();
+ }
}
Modified:
pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/TrueTypeEmbedder.java
URL:
http://svn.apache.org/viewvc/pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/TrueTypeEmbedder.java?rev=1924227&r1=1924226&r2=1924227&view=diff
==============================================================================
---
pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/TrueTypeEmbedder.java
(original)
+++
pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/TrueTypeEmbedder.java
Sat Mar 8 09:29:12 2025
@@ -347,6 +347,10 @@ abstract class TrueTypeEmbedder implemen
// set the GIDs to subset
TTFSubsetter subsetter = new TTFSubsetter(ttf, tables);
subsetter.addAll(subsetCodePoints);
+ subsetter.forceInvisible('\u200B'); // ZWSP
+ subsetter.forceInvisible('\u200C'); // ZWNJ
+ subsetter.forceInvisible('\u2060'); // WJ
+ subsetter.forceInvisible('\uFEFF'); // ZWNBSP
// calculate deterministic tag based on the chosen subset
Map<Integer, Integer> gidToCid = subsetter.getGIDMap();
Modified:
pdfbox/branches/2.0/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java
URL:
http://svn.apache.org/viewvc/pdfbox/branches/2.0/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java?rev=1924227&r1=1924226&r2=1924227&view=diff
==============================================================================
---
pdfbox/branches/2.0/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java
(original)
+++
pdfbox/branches/2.0/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java
Sat Mar 8 09:29:12 2025
@@ -38,6 +38,7 @@ import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.PDPageContentStream.AppendMode;
+import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.rendering.TestPDFToImage;
import org.apache.pdfbox.text.PDFTextStripper;
@@ -464,4 +465,51 @@ public class TestFontEmbedding extends T
System.err.println("Rendering of " + pdf + " failed or is not identical to expected rendering in " + IN_DIR + " directory");
}
}
+
+ /**
+ * PDFBOX-5230: Zero-width characters should be invisible.
+ *
+ * @throws IOException
+ */
+ public void testEmbeddedFontWithZeroWidthChars() throws IOException
+ {
+ String text = "AAA\u200CBBB";
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ PDDocument document = new PDDocument();
+ PDPage page = new PDPage();
+ document.addPage(page);
+ InputStream input = PDFont.class.getResourceAsStream(
+ "/org/apache/pdfbox/resources/ttf/LiberationSans-Regular.ttf");
+ PDType0Font font = PDType0Font.load(document, input);
+ PDPageContentStream stream = new PDPageContentStream(document, page);
+ stream.beginText();
+ stream.setFont(font, 20);
+ stream.newLineAtOffset(50, 600);
+ stream.showText(text);
+ stream.endText();
+ stream.close();
+ document.save(baos);
+ document.close();
+ document = PDDocument.load(baos.toByteArray());
+ // verify that the text still contains zero-width characters
+ PDFTextStripper stripper = new PDFTextStripper();
+ String extractedText = stripper.getText(document).trim();
+ assertEquals(text, extractedText);
+ assertEquals(7, extractedText.length());
+ assertEquals('\u200C', extractedText.charAt(3));
+
+ // verify that the zero-width characters are invisible
+ page = document.getPage(0);
+ PDResources resources = page.getResources();
+ Iterable< COSName> fontNames = resources.getFontNames();
+ COSName fontName = fontNames.iterator().next();
+ font = (PDType0Font) resources.getFont(fontName);
+ byte[] encoded = font.encode('\u200C');
+ int code = ((encoded[0] & 0xFF) << 8) | (encoded[1] & 0xFF);
+ assertEquals(0f, font.getWidth(code));
+ assertEquals(0f, font.getWidthFromFont(code));
+ assertTrue(font.getPath(code).getBounds2D().isEmpty());
+ assertFalse(font.isDamaged());
+ document.close();
+ }
}