Author: tilman
Date: Sat Mar 8 08:05:39 2025
New Revision: 1924224
URL: http://svn.apache.org/viewvc?rev=1924224&view=rev
Log:
PDFBOX-5230: make zero-width non-joiner characters invisible, by Daniel Gredler
Modified:
pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/ttf/GlyfSimpleDescript.java
pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/ttf/TTFSubsetter.java
pdfbox/trunk/fontbox/src/test/java/org/apache/fontbox/ttf/TTFSubsetterTest.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/TrueTypeEmbedder.java
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java
Modified:
pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/ttf/GlyfSimpleDescript.java
URL:
http://svn.apache.org/viewvc/pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/ttf/GlyfSimpleDescript.java?rev=1924224&r1=1924223&r2=1924224&view=diff
==============================================================================
--- pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/ttf/GlyfSimpleDescript.java (original)
+++ pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/ttf/GlyfSimpleDescript.java Sat Mar 8 08:05:39 2025
@@ -51,7 +51,7 @@ public class GlyfSimpleDescript extends
super(numberOfContours);
/*
- * https://developer.apple.com/fonts/TTRefMan/RM06/Chap6glyf.html
+ * https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6glyf.html
* "If a glyph has zero contours, it need not have any glyph data." set the pointCount to zero to initialize
* attributes and avoid nullpointer but maybe there shouldn't have GlyphDescript in the GlyphData?
*/
Modified:
pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/ttf/TTFSubsetter.java
URL:
http://svn.apache.org/viewvc/pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/ttf/TTFSubsetter.java?rev=1924224&r1=1924223&r2=1924224&view=diff
==============================================================================
--- pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/ttf/TTFSubsetter.java (original)
+++ pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/ttf/TTFSubsetter.java Sat Mar 8 08:05:39 2025
@@ -26,6 +26,7 @@ import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Calendar;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
@@ -52,7 +53,7 @@ public final class TTFSubsetter
{
private static final Logger LOG = LogManager.getLogger(TTFSubsetter.class);
- private static final byte[] PAD_BUF = { 0, 0, 0 };
+ private static final byte[] PAD_BUF = { 0, 0, 0, 0 };
private static final TimeZone TIMEZONE_UTC = TimeZone.getTimeZone("UTC");
// clone before using
@@ -62,6 +63,7 @@ public final class TTFSubsetter
private final List<String> keepTables;
private final SortedSet<Integer> glyphIds; // new glyph ids
+ private final Set<Integer> invisibleGlyphIds;
private String prefix;
private boolean hasAddedCompoundReferences;
@@ -92,6 +94,7 @@ public final class TTFSubsetter
uniToGID = new TreeMap<>();
glyphIds = new TreeSet<>();
+ invisibleGlyphIds = new HashSet<>();
// find the best Unicode cmap
this.unicodeCmap = ttf.getUnicodeCmapLookup();
@@ -136,6 +139,23 @@ public final class TTFSubsetter
}
/**
+ * Forces the glyph for the specified character code to be zero-width and contour-free,
+ * regardless of what the glyph looks like in the original font. Note that the specified
+ * character code is not added to the subset unless it is also {@link #add(int) added}
+ * separately.
+ *
+ * @param unicode the character code whose glyph should be invisible
+ */
+ public void forceInvisible(int unicode)
+ {
+ int gid = unicodeCmap.getGlyphId(unicode);
+ if (gid != 0)
+ {
+ invisibleGlyphIds.add(gid);
+ }
+ }
+
+ /**
* Returns the map of new -> old GIDs.
*
* @return the GID map
@@ -611,6 +631,13 @@ public final class TTFSubsetter
isResult);
}
+ // glyphs with no outlines have an empty entry in the 'glyf' table, with a
+ // corresponding 'loca' table entry with length = 0
+ if (invisibleGlyphIds.contains(gid))
+ {
+ continue;
+ }
+
byte[] buf = new byte[(int)length];
isResult = is.read(buf);
@@ -921,9 +948,18 @@ public final class TTFSubsetter
long offset;
if (glyphId <= lastgid)
{
- // copy width and lsb
- offset = glyphId * 4l;
- lastOffset = copyBytes(is, bos, offset, lastOffset, 4);
+ if (invisibleGlyphIds.contains(glyphId))
+ {
+ // force zero width (no change to last offset)
+ // 4 bytes total, 2 bytes each for: advance width = 0, left side bearing = 0
+ bos.write(PAD_BUF, 0, 4);
+ }
+ else
+ {
+ // copy width and lsb
+ offset = glyphId * 4l;
+ lastOffset = copyBytes(is, bos, offset, lastOffset, 4);
+ }
}
else
{
Modified:
pdfbox/trunk/fontbox/src/test/java/org/apache/fontbox/ttf/TTFSubsetterTest.java
URL:
http://svn.apache.org/viewvc/pdfbox/trunk/fontbox/src/test/java/org/apache/fontbox/ttf/TTFSubsetterTest.java?rev=1924224&r1=1924223&r2=1924224&view=diff
==============================================================================
--- pdfbox/trunk/fontbox/src/test/java/org/apache/fontbox/ttf/TTFSubsetterTest.java (original)
+++ pdfbox/trunk/fontbox/src/test/java/org/apache/fontbox/ttf/TTFSubsetterTest.java Sat Mar 8 08:05:39 2025
@@ -29,6 +29,7 @@ import org.apache.pdfbox.io.RandomAccess
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -294,4 +295,76 @@ class TTFSubsetterTest
subsetter.writeToStream(output);
}
}
+
+ /**
+ * Test of PDFBOX-5230: check that subsetting can be forced to use invisible glyphs.
+ *
+ * @throws java.io.IOException
+ */
+ @Test
+ void testPDFBox5230() throws IOException
+ {
+ final File testFile = new
File("src/test/resources/ttf/LiberationSans-Regular.ttf");
+ TrueTypeFont ttf = new TTFParser().parse(new
RandomAccessReadBufferedFile(testFile));
+ TTFSubsetter ttfSubsetter = new TTFSubsetter(ttf);
+ ttfSubsetter.add('A');
+ ttfSubsetter.add('B');
+ ttfSubsetter.add('\u200C');
+
+ // verify results without forcing
+
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ ttfSubsetter.writeToStream(baos);
+ try (TrueTypeFont subset = new TTFParser(true)
+ .parse(new RandomAccessReadBuffer(baos.toByteArray())))
+ {
+ assertEquals(4, subset.getNumberOfGlyphs());
+ assertEquals(0, subset.nameToGID(".notdef"));
+ assertEquals(1, subset.nameToGID("A"));
+ assertEquals(2, subset.nameToGID("B"));
+ assertEquals(3, subset.nameToGID("uni200C"));
+
+ PostScriptTable pst = subset.getPostScript();
+ assertEquals(".notdef", pst.getName(0));
+ assertEquals("A", pst.getName(1));
+ assertEquals("B", pst.getName(2));
+ assertEquals("uni200C", pst.getName(3));
+
+ assertFalse(subset.getPath("A").getBounds2D().isEmpty(), "A path
should not be empty");
+ assertFalse(subset.getPath("B").getBounds2D().isEmpty(), "B path
should not be empty");
+ assertFalse(subset.getPath("uni200C").getBounds2D().isEmpty(),
"ZWNJ path should not be empty");
+ assertNotEquals(0, subset.getWidth("A"), "A width should not be
zero.");
+ assertNotEquals(0, subset.getWidth("B"), "B width should not be
zero.");
+ assertEquals(0, subset.getWidth("uni200C"), "ZWNJ width should be
zero");
+ }
+
+ // verify results while forcing B and ZWNJ to use invisible glyphs
+
+ ttfSubsetter.forceInvisible('B');
+ ttfSubsetter.forceInvisible('\u200C');
+ ByteArrayOutputStream baos2 = new ByteArrayOutputStream();
+ ttfSubsetter.writeToStream(baos2);
+ try (TrueTypeFont subset = new TTFParser(true)
+ .parse(new RandomAccessReadBuffer(baos2.toByteArray())))
+ {
+ assertEquals(4, subset.getNumberOfGlyphs());
+ assertEquals(0, subset.nameToGID(".notdef"));
+ assertEquals(1, subset.nameToGID("A"));
+ assertEquals(2, subset.nameToGID("B"));
+ assertEquals(3, subset.nameToGID("uni200C"));
+
+ PostScriptTable pst = subset.getPostScript();
+ assertEquals(".notdef", pst.getName(0));
+ assertEquals("A", pst.getName(1));
+ assertEquals("B", pst.getName(2));
+ assertEquals("uni200C", pst.getName(3));
+
+ assertFalse(subset.getPath("A").getBounds2D().isEmpty(), "A path
should not be empty");
+ assertTrue(subset.getPath("B").getBounds2D().isEmpty(), "B path
should be empty");
+ assertTrue(subset.getPath("uni200C").getBounds2D().isEmpty(),
"ZWNJ path should be empty");
+ assertNotEquals(0, subset.getWidth("A"), "A width should not be
zero.");
+ assertEquals(0, subset.getWidth("B"), "B width should be zero.");
+ assertEquals(0, subset.getWidth("uni200C"), "ZWNJ width should be
zero");
+ }
+ }
}
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/TrueTypeEmbedder.java
URL:
http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/TrueTypeEmbedder.java?rev=1924224&r1=1924223&r2=1924224&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/TrueTypeEmbedder.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/TrueTypeEmbedder.java Sat Mar 8 08:05:39 2025
@@ -326,6 +326,10 @@ abstract class TrueTypeEmbedder implemen
// set the GIDs to subset
TTFSubsetter subsetter = new TTFSubsetter(ttf, tables);
subsetter.addAll(subsetCodePoints);
+ subsetter.forceInvisible('\u200B'); // ZWSP
+ subsetter.forceInvisible('\u200C'); // ZWNJ
+ subsetter.forceInvisible('\u2060'); // WJ
+ subsetter.forceInvisible('\uFEFF'); // ZWNBSP
if (!allGlyphIds.isEmpty())
{
Modified:
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java
URL:
http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java?rev=1924224&r1=1924223&r2=1924224&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java (original)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java Sat Mar 8 08:05:39 2025
@@ -35,6 +35,7 @@ import org.apache.pdfbox.cos.COSDictiona
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.PDPageContentStream.AppendMode;
@@ -692,4 +693,55 @@ class TestFontEmbedding
fail();
}
}
+
+ /**
+ * PDFBOX-5230: Zero-width characters should be invisible.
+ *
+ * @throws IOException
+ */
+ @Test
+ void testEmbeddedFontWithZeroWidthChars() throws IOException
+ {
+ String text = "AAA\u200CBBB";
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ try (PDDocument document = new PDDocument())
+ {
+ PDPage page = new PDPage();
+ document.addPage(page);
+ InputStream input = PDFont.class.getResourceAsStream(
+
"/org/apache/pdfbox/resources/ttf/LiberationSans-Regular.ttf");
+ PDType0Font font = PDType0Font.load(document, input);
+ try (PDPageContentStream stream = new
PDPageContentStream(document, page))
+ {
+ stream.beginText();
+ stream.setFont(font, 20);
+ stream.newLineAtOffset(50, 600);
+ stream.showText(text);
+ stream.endText();
+ }
+ document.save(baos);
+ }
+ try (PDDocument document = Loader.loadPDF(baos.toByteArray()))
+ {
+ // verify that the text still contains zero-width characters
+ PDFTextStripper stripper = new PDFTextStripper();
+ String extractedText = stripper.getText(document).trim();
+ assertEquals(text, extractedText);
+ assertEquals(7, extractedText.length());
+ assertEquals('\u200C', extractedText.charAt(3));
+
+ // verify that the zero-width characters are invisible
+ PDPage page = document.getPage(0);
+ PDResources resources = page.getResources();
+ Iterable< COSName > fontNames = resources.getFontNames();
+ COSName fontName = fontNames.iterator().next();
+ PDType0Font font = (PDType0Font) resources.getFont(fontName);
+ byte[] encoded = font.encode('\u200C');
+ int code = ((encoded[0] & 0xFF) << 8) | (encoded[1] & 0xFF);
+ assertEquals(0, font.getWidth(code));
+ assertEquals(0, font.getWidthFromFont(code));
+ assertTrue(font.getPath(code).getBounds2D().isEmpty());
+ assertFalse(font.isDamaged());
+ }
+ }
}