Author: tilman
Date: Sat Mar  8 08:05:39 2025
New Revision: 1924224

URL: http://svn.apache.org/viewvc?rev=1924224&view=rev
Log:
PDFBOX-5230: make zero-width non-joiner characters invisible, by Daniel Gredler

Modified:
    pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/ttf/GlyfSimpleDescript.java
    pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/ttf/TTFSubsetter.java
    pdfbox/trunk/fontbox/src/test/java/org/apache/fontbox/ttf/TTFSubsetterTest.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/TrueTypeEmbedder.java
    pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java

Modified: pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/ttf/GlyfSimpleDescript.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/ttf/GlyfSimpleDescript.java?rev=1924224&r1=1924223&r2=1924224&view=diff
==============================================================================
--- pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/ttf/GlyfSimpleDescript.java (original)
+++ pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/ttf/GlyfSimpleDescript.java Sat Mar  8 08:05:39 2025
@@ -51,7 +51,7 @@ public class GlyfSimpleDescript extends
         super(numberOfContours);
 
         /*
-         * https://developer.apple.com/fonts/TTRefMan/RM06/Chap6glyf.html
+         * https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6glyf.html
          * "If a glyph has zero contours, it need not have any glyph data." set the pointCount to zero to initialize
          * attributes and avoid nullpointer but maybe there shouldn't have GlyphDescript in the GlyphData?
          */

Modified: pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/ttf/TTFSubsetter.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/ttf/TTFSubsetter.java?rev=1924224&r1=1924223&r2=1924224&view=diff
==============================================================================
--- pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/ttf/TTFSubsetter.java (original)
+++ pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/ttf/TTFSubsetter.java Sat Mar  8 08:05:39 2025
@@ -26,6 +26,7 @@ import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.Calendar;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.LinkedHashMap;
 import java.util.List;
@@ -52,7 +53,7 @@ public final class TTFSubsetter
 {
     private static final Logger LOG = LogManager.getLogger(TTFSubsetter.class);
     
-    private static final byte[] PAD_BUF = { 0, 0, 0 };
+    private static final byte[] PAD_BUF = { 0, 0, 0, 0 };
 
     private static final TimeZone TIMEZONE_UTC = TimeZone.getTimeZone("UTC"); // clone before using
 
@@ -62,6 +63,7 @@ public final class TTFSubsetter
 
     private final List<String> keepTables;
     private final SortedSet<Integer> glyphIds; // new glyph ids
+    private final Set<Integer> invisibleGlyphIds;
     private String prefix;
     private boolean hasAddedCompoundReferences;
 
@@ -92,6 +94,7 @@ public final class TTFSubsetter
 
         uniToGID = new TreeMap<>();
         glyphIds = new TreeSet<>();
+        invisibleGlyphIds = new HashSet<>();
 
         // find the best Unicode cmap
         this.unicodeCmap = ttf.getUnicodeCmapLookup();
@@ -136,6 +139,23 @@ public final class TTFSubsetter
     }
 
     /**
+     * Forces the glyph for the specified character code to be zero-width and contour-free,
+     * regardless of what the glyph looks like in the original font. Note that the specified
+     * character code is not added to the subset unless it is also {@link #add(int) added}
+     * separately.
+     *
+     * @param unicode the character code whose glyph should be invisible
+     */
+    public void forceInvisible(int unicode)
+    {
+        int gid = unicodeCmap.getGlyphId(unicode);
+        if (gid != 0)
+        {
+            invisibleGlyphIds.add(gid);
+        }
+    }
+
+    /**
      * Returns the map of new -&gt; old GIDs.
      * 
      * @return the GID map
@@ -611,6 +631,13 @@ public final class TTFSubsetter
                             isResult);
                 }
 
+                // glyphs with no outlines have an empty entry in the 'glyf' table, with a
+                // corresponding 'loca' table entry with length = 0
+                if (invisibleGlyphIds.contains(gid))
+                {
+                    continue;
+                }
+
                 byte[] buf = new byte[(int)length];
                 isResult = is.read(buf);
 
@@ -921,9 +948,18 @@ public final class TTFSubsetter
                 long offset;
                 if (glyphId <= lastgid)
                 {
-                    // copy width and lsb
-                    offset = glyphId * 4l;
-                    lastOffset = copyBytes(is, bos, offset, lastOffset, 4);
+                    if (invisibleGlyphIds.contains(glyphId))
+                    {
+                        // force zero width (no change to last offset)
+                        // 4 bytes total, 2 bytes each for: advance width = 0, left side bearing = 0
+                        bos.write(PAD_BUF, 0, 4);
+                    }
+                    else
+                    {
+                        // copy width and lsb
+                        offset = glyphId * 4l;
+                        lastOffset = copyBytes(is, bos, offset, lastOffset, 4);
+                    }
                 }
                 else 
                 {

Modified: pdfbox/trunk/fontbox/src/test/java/org/apache/fontbox/ttf/TTFSubsetterTest.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/fontbox/src/test/java/org/apache/fontbox/ttf/TTFSubsetterTest.java?rev=1924224&r1=1924223&r2=1924224&view=diff
==============================================================================
--- pdfbox/trunk/fontbox/src/test/java/org/apache/fontbox/ttf/TTFSubsetterTest.java (original)
+++ pdfbox/trunk/fontbox/src/test/java/org/apache/fontbox/ttf/TTFSubsetterTest.java Sat Mar  8 08:05:39 2025
@@ -29,6 +29,7 @@ import org.apache.pdfbox.io.RandomAccess
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotEquals;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
 import static org.junit.jupiter.api.Assertions.assertNull;
 import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -294,4 +295,76 @@ class TTFSubsetterTest
             subsetter.writeToStream(output);
         }
     }
+
+    /**
+     * Test of PDFBOX-5230: check that subsetting can be forced to use invisible glyphs.
+     *
+     * @throws java.io.IOException
+     */
+    @Test
+    void testPDFBox5230() throws IOException
+    {
+        final File testFile = new File("src/test/resources/ttf/LiberationSans-Regular.ttf");
+        TrueTypeFont ttf = new TTFParser().parse(new RandomAccessReadBufferedFile(testFile));
+        TTFSubsetter ttfSubsetter = new TTFSubsetter(ttf);
+        ttfSubsetter.add('A');
+        ttfSubsetter.add('B');
+        ttfSubsetter.add('\u200C');
+
+        // verify results without forcing
+
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        ttfSubsetter.writeToStream(baos);
+        try (TrueTypeFont subset = new TTFParser(true)
+                .parse(new RandomAccessReadBuffer(baos.toByteArray())))
+        {
+            assertEquals(4, subset.getNumberOfGlyphs());
+            assertEquals(0, subset.nameToGID(".notdef"));
+            assertEquals(1, subset.nameToGID("A"));
+            assertEquals(2, subset.nameToGID("B"));
+            assertEquals(3, subset.nameToGID("uni200C"));
+
+            PostScriptTable pst = subset.getPostScript();
+            assertEquals(".notdef", pst.getName(0));
+            assertEquals("A", pst.getName(1));
+            assertEquals("B", pst.getName(2));
+            assertEquals("uni200C", pst.getName(3));
+
+            assertFalse(subset.getPath("A").getBounds2D().isEmpty(), "A path should not be empty");
+            assertFalse(subset.getPath("B").getBounds2D().isEmpty(), "B path should not be empty");
+            assertFalse(subset.getPath("uni200C").getBounds2D().isEmpty(), "ZWNJ path should not be empty");
+            assertNotEquals(0, subset.getWidth("A"), "A width should not be zero.");
+            assertNotEquals(0, subset.getWidth("B"), "B width should not be zero.");
+            assertEquals(0, subset.getWidth("uni200C"), "ZWNJ width should be zero");
+        }
+
+        // verify results while forcing B and ZWNJ to use invisible glyphs
+
+        ttfSubsetter.forceInvisible('B');
+        ttfSubsetter.forceInvisible('\u200C');
+        ByteArrayOutputStream baos2 = new ByteArrayOutputStream();
+        ttfSubsetter.writeToStream(baos2);
+        try (TrueTypeFont subset = new TTFParser(true)
+                .parse(new RandomAccessReadBuffer(baos2.toByteArray())))
+        {
+            assertEquals(4, subset.getNumberOfGlyphs());
+            assertEquals(0, subset.nameToGID(".notdef"));
+            assertEquals(1, subset.nameToGID("A"));
+            assertEquals(2, subset.nameToGID("B"));
+            assertEquals(3, subset.nameToGID("uni200C"));
+
+            PostScriptTable pst = subset.getPostScript();
+            assertEquals(".notdef", pst.getName(0));
+            assertEquals("A", pst.getName(1));
+            assertEquals("B", pst.getName(2));
+            assertEquals("uni200C", pst.getName(3));
+
+            assertFalse(subset.getPath("A").getBounds2D().isEmpty(), "A path should not be empty");
+            assertTrue(subset.getPath("B").getBounds2D().isEmpty(), "B path should be empty");
+            assertTrue(subset.getPath("uni200C").getBounds2D().isEmpty(), "ZWNJ path should be empty");
+            assertNotEquals(0, subset.getWidth("A"), "A width should not be zero.");
+            assertEquals(0, subset.getWidth("B"), "B width should be zero.");
+            assertEquals(0, subset.getWidth("uni200C"), "ZWNJ width should be zero");
+        }
+    }
 }

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/TrueTypeEmbedder.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/TrueTypeEmbedder.java?rev=1924224&r1=1924223&r2=1924224&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/TrueTypeEmbedder.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/TrueTypeEmbedder.java Sat Mar  8 08:05:39 2025
@@ -326,6 +326,10 @@ abstract class TrueTypeEmbedder implemen
         // set the GIDs to subset
         TTFSubsetter subsetter = new TTFSubsetter(ttf, tables);
         subsetter.addAll(subsetCodePoints);
+        subsetter.forceInvisible('\u200B'); // ZWSP
+        subsetter.forceInvisible('\u200C'); // ZWNJ
+        subsetter.forceInvisible('\u2060'); // WJ
+        subsetter.forceInvisible('\uFEFF'); // ZWNBSP
 
         if (!allGlyphIds.isEmpty())
         {

Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java?rev=1924224&r1=1924223&r2=1924224&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java (original)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java Sat Mar  8 08:05:39 2025
@@ -35,6 +35,7 @@ import org.apache.pdfbox.cos.COSDictiona
 import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.PDResources;
 import org.apache.pdfbox.pdmodel.common.PDRectangle;
 import org.apache.pdfbox.pdmodel.PDPageContentStream;
 import org.apache.pdfbox.pdmodel.PDPageContentStream.AppendMode;
@@ -692,4 +693,55 @@ class TestFontEmbedding
             fail();
         }
     }
+
+    /**
+     * PDFBOX-5230: Zero-width characters should be invisible.
+     *
+     * @throws IOException
+     */
+    @Test
+    void testEmbeddedFontWithZeroWidthChars() throws IOException
+    {
+        String text = "AAA\u200CBBB";
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        try (PDDocument document = new PDDocument())
+        {
+            PDPage page = new PDPage();
+            document.addPage(page);
+            InputStream input = PDFont.class.getResourceAsStream(
+                    "/org/apache/pdfbox/resources/ttf/LiberationSans-Regular.ttf");
+            PDType0Font font = PDType0Font.load(document, input);
+            try (PDPageContentStream stream = new PDPageContentStream(document, page))
+            {
+                stream.beginText();
+                stream.setFont(font, 20);
+                stream.newLineAtOffset(50, 600);
+                stream.showText(text);
+                stream.endText();
+            }
+            document.save(baos);
+        }
+        try (PDDocument document = Loader.loadPDF(baos.toByteArray()))
+        {
+            // verify that the text still contains zero-width characters
+            PDFTextStripper stripper = new PDFTextStripper();
+            String extractedText = stripper.getText(document).trim();
+            assertEquals(text, extractedText);
+            assertEquals(7, extractedText.length());
+            assertEquals('\u200C', extractedText.charAt(3));
+
+            // verify that the zero-width characters are invisible
+            PDPage page = document.getPage(0);
+            PDResources resources = page.getResources();
+            Iterable< COSName > fontNames = resources.getFontNames();
+            COSName fontName = fontNames.iterator().next();
+            PDType0Font font = (PDType0Font) resources.getFont(fontName);
+            byte[] encoded = font.encode('\u200C');
+            int code = ((encoded[0] & 0xFF) << 8) | (encoded[1] & 0xFF);
+            assertEquals(0, font.getWidth(code));
+            assertEquals(0, font.getWidthFromFont(code));
+            assertTrue(font.getPath(code).getBounds2D().isEmpty());
+            assertFalse(font.isDamaged());
+        }
+    }
 }


Reply via email to