Author: tilman
Date: Sat Mar  8 09:29:12 2025
New Revision: 1924227

URL: http://svn.apache.org/viewvc?rev=1924227&view=rev
Log:
PDFBOX-5230: make zero-width non-joiner characters invisible, by Daniel 
Gredler; closes #203

Modified:
    
pdfbox/branches/2.0/fontbox/src/main/java/org/apache/fontbox/ttf/GlyfSimpleDescript.java
    
pdfbox/branches/2.0/fontbox/src/main/java/org/apache/fontbox/ttf/TTFSubsetter.java
    
pdfbox/branches/2.0/fontbox/src/test/java/org/apache/fontbox/ttf/TTFSubsetterTest.java
    
pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/TrueTypeEmbedder.java
    
pdfbox/branches/2.0/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java

Modified: 
pdfbox/branches/2.0/fontbox/src/main/java/org/apache/fontbox/ttf/GlyfSimpleDescript.java
URL: 
http://svn.apache.org/viewvc/pdfbox/branches/2.0/fontbox/src/main/java/org/apache/fontbox/ttf/GlyfSimpleDescript.java?rev=1924227&r1=1924226&r2=1924227&view=diff
==============================================================================
--- 
pdfbox/branches/2.0/fontbox/src/main/java/org/apache/fontbox/ttf/GlyfSimpleDescript.java
 (original)
+++ 
pdfbox/branches/2.0/fontbox/src/main/java/org/apache/fontbox/ttf/GlyfSimpleDescript.java
 Sat Mar  8 09:29:12 2025
@@ -62,7 +62,7 @@ public class GlyfSimpleDescript extends
         super(numberOfContours, bais);
 
         /*
-         * https://developer.apple.com/fonts/TTRefMan/RM06/Chap6glyf.html
+         * 
https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6glyf.html
          * "If a glyph has zero contours, it need not have any glyph data." 
set the pointCount to zero to initialize
          * attributes and avoid nullpointer but maybe there shouldn't have 
GlyphDescript in the GlyphData?
          */

Modified: 
pdfbox/branches/2.0/fontbox/src/main/java/org/apache/fontbox/ttf/TTFSubsetter.java
URL: 
http://svn.apache.org/viewvc/pdfbox/branches/2.0/fontbox/src/main/java/org/apache/fontbox/ttf/TTFSubsetter.java?rev=1924227&r1=1924226&r2=1924227&view=diff
==============================================================================
--- 
pdfbox/branches/2.0/fontbox/src/main/java/org/apache/fontbox/ttf/TTFSubsetter.java
 (original)
+++ 
pdfbox/branches/2.0/fontbox/src/main/java/org/apache/fontbox/ttf/TTFSubsetter.java
 Sat Mar  8 09:29:12 2025
@@ -25,6 +25,7 @@ import java.io.OutputStream;
 import java.nio.charset.Charset;
 import java.util.Calendar;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.LinkedHashMap;
 import java.util.List;
@@ -51,7 +52,7 @@ public final class TTFSubsetter
 {
     private static final Log LOG = LogFactory.getLog(TTFSubsetter.class);
     
-    private static final byte[] PAD_BUF = new byte[] { 0, 0, 0 };
+    private static final byte[] PAD_BUF = new byte[] { 0, 0, 0, 0 };
 
     private static final TimeZone TIMEZONE_UTC = TimeZone.getTimeZone("UTC"); 
// clone before using
 
@@ -61,6 +62,7 @@ public final class TTFSubsetter
 
     private final List<String> keepTables;
     private final SortedSet<Integer> glyphIds; // new glyph ids
+    private final Set<Integer> invisibleGlyphIds;
     private String prefix;
     private boolean hasAddedCompoundReferences;
 
@@ -87,6 +89,7 @@ public final class TTFSubsetter
 
         uniToGID = new TreeMap<Integer, Integer>();
         glyphIds = new TreeSet<Integer>();
+        invisibleGlyphIds = new HashSet<Integer>();
 
         // find the best Unicode cmap
         this.unicodeCmap = ttf.getUnicodeCmapLookup();
@@ -132,6 +135,23 @@ public final class TTFSubsetter
     }
 
     /**
+     * Marks the glyph mapped to the given character code as invisible, so that the subset
+     * carries it without contours and with zero width, whatever the original font draws for
+     * it. The character itself only becomes part of the subset through {@link #add(int)}.
+     *
+     * @param unicode the character code whose glyph should be invisible
+     */
+    public void forceInvisible(int unicode)
+    {
+        final int glyphId = unicodeCmap.getGlyphId(unicode);
+        if (glyphId == 0)
+        {
+            return; // glyph id 0 is never marked
+        }
+        invisibleGlyphIds.add(glyphId);
+    }
+
+    /**
      * Returns the map of new -&gt; old GIDs.
      */
     public Map<Integer, Integer> getGIDMap() throws IOException
@@ -587,6 +607,13 @@ public final class TTFSubsetter
                 newOffsets[newGid++] = newOffset;
                 is.skip(offset - prevEnd);
 
+                // glyphs with no outlines have an empty entry in the 'glyf' table, with a
+                // corresponding 'loca' table entry with length = 0
+                if (invisibleGlyphIds.contains(gid))
+                {
+                    continue;
+                }
+
                 byte[] buf = new byte[(int)length];
                 is.read(buf);
 
@@ -893,9 +920,18 @@ public final class TTFSubsetter
                 long offset;
                 if (glyphId <= lastgid)
                 {
-                    // copy width and lsb
-                    offset = glyphId * 4l;
-                    lastOffset = copyBytes(is, bos, offset, lastOffset, 4);
+                    if (invisibleGlyphIds.contains(glyphId))
+                    {
+                        // force zero width (no change to last offset)
+                        // 4 bytes total, 2 bytes each for: advance width = 0, left side bearing = 0
+                        bos.write(PAD_BUF, 0, 4);
+                    }
+                    else
+                    {
+                        // copy width and lsb
+                        offset = glyphId * 4l;
+                        lastOffset = copyBytes(is, bos, offset, lastOffset, 4);
+                    }
                 }
                 else 
                 {

Modified: 
pdfbox/branches/2.0/fontbox/src/test/java/org/apache/fontbox/ttf/TTFSubsetterTest.java
URL: 
http://svn.apache.org/viewvc/pdfbox/branches/2.0/fontbox/src/test/java/org/apache/fontbox/ttf/TTFSubsetterTest.java?rev=1924227&r1=1924226&r2=1924227&view=diff
==============================================================================
--- 
pdfbox/branches/2.0/fontbox/src/test/java/org/apache/fontbox/ttf/TTFSubsetterTest.java
 (original)
+++ 
pdfbox/branches/2.0/fontbox/src/test/java/org/apache/fontbox/ttf/TTFSubsetterTest.java
 Sat Mar  8 09:29:12 2025
@@ -27,6 +27,7 @@ import java.util.Map.Entry;
 import org.apache.fontbox.util.autodetect.FontFileFinder;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
@@ -274,4 +275,76 @@ public class TTFSubsetterTest
         subsetter.writeToStream(output);
         ttf.close();
     }
+
+    /**
+     * Test of PDFBOX-5230: check that subsetting can be forced to use invisible glyphs.
+     *
+     * @throws java.io.IOException if reading the test font or writing a subset fails
+     */
+    @Test
+    public void testPDFBox5230() throws IOException
+    {
+        final File testFile = new File("src/test/resources/ttf/LiberationSans-Regular.ttf");
+        TrueTypeFont ttf = new TTFParser().parse(testFile);
+        TTFSubsetter ttfSubsetter = new TTFSubsetter(ttf);
+        ttfSubsetter.add('A');
+        ttfSubsetter.add('B');
+        ttfSubsetter.add('\u200C');
+
+        // verify results without forcing
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        ttfSubsetter.writeToStream(baos);
+        TrueTypeFont subset = new TTFParser(true)
+                .parse(new ByteArrayInputStream(baos.toByteArray()));
+        assertEquals(4, subset.getNumberOfGlyphs());
+        assertEquals(0, subset.nameToGID(".notdef"));
+        assertEquals(1, subset.nameToGID("A"));
+        assertEquals(2, subset.nameToGID("B"));
+        assertEquals(3, subset.nameToGID("uni200C"));
+
+        PostScriptTable pst = subset.getPostScript();
+        assertEquals(".notdef", pst.getName(0));
+        assertEquals("A", pst.getName(1));
+        assertEquals("B", pst.getName(2));
+        assertEquals("uni200C", pst.getName(3));
+
+        assertFalse("A path should not be empty", subset.getPath("A").getBounds2D().isEmpty());
+        assertFalse("B path should not be empty", subset.getPath("B").getBounds2D().isEmpty());
+        assertFalse("ZWNJ path should not be empty", subset.getPath("uni200C").getBounds2D().isEmpty());
+        // use the delta overloads: an int 0 without delta is boxed and never equals the float width
+        assertNotEquals("A width should not be zero.", 0, subset.getWidth("A"), 0);
+        assertNotEquals("B width should not be zero.", 0, subset.getWidth("B"), 0);
+        assertEquals("ZWNJ width should be zero", 0, subset.getWidth("uni200C"), 0);
+
+        subset.close();
+
+        // verify results while forcing B and ZWNJ to use invisible glyphs
+        ttfSubsetter.forceInvisible('B');
+        ttfSubsetter.forceInvisible('\u200C');
+        ByteArrayOutputStream baos2 = new ByteArrayOutputStream();
+        ttfSubsetter.writeToStream(baos2);
+        ttf.close(); // the source font is not used after the last subset has been written
+        subset = new TTFParser(true)
+                .parse(new ByteArrayInputStream(baos2.toByteArray()));
+        assertEquals(4, subset.getNumberOfGlyphs());
+        assertEquals(0, subset.nameToGID(".notdef"));
+        assertEquals(1, subset.nameToGID("A"));
+        assertEquals(2, subset.nameToGID("B"));
+        assertEquals(3, subset.nameToGID("uni200C"));
+
+        pst = subset.getPostScript();
+        assertEquals(".notdef", pst.getName(0));
+        assertEquals("A", pst.getName(1));
+        assertEquals("B", pst.getName(2));
+        assertEquals("uni200C", pst.getName(3));
+
+        assertFalse("A path should not be empty", subset.getPath("A").getBounds2D().isEmpty());
+        assertTrue("B path should be empty", subset.getPath("B").getBounds2D().isEmpty());
+        assertTrue("ZWNJ path should be empty", subset.getPath("uni200C").getBounds2D().isEmpty());
+        assertNotEquals("A width should not be zero.", 0, subset.getWidth("A"), 0);
+        assertEquals("B width should be zero.", 0, subset.getWidth("B"), 0);
+        assertEquals("ZWNJ width should be zero", 0d, subset.getWidth("uni200C"), 0);
+
+        subset.close();
+    }
 }

Modified: 
pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/TrueTypeEmbedder.java
URL: 
http://svn.apache.org/viewvc/pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/TrueTypeEmbedder.java?rev=1924227&r1=1924226&r2=1924227&view=diff
==============================================================================
--- 
pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/TrueTypeEmbedder.java
 (original)
+++ 
pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/TrueTypeEmbedder.java
 Sat Mar  8 09:29:12 2025
@@ -347,6 +347,10 @@ abstract class TrueTypeEmbedder implemen
         // set the GIDs to subset
         TTFSubsetter subsetter = new TTFSubsetter(ttf, tables);
         subsetter.addAll(subsetCodePoints);
+        subsetter.forceInvisible('\u200B'); // ZWSP
+        subsetter.forceInvisible('\u200C'); // ZWNJ
+        subsetter.forceInvisible('\u2060'); // WJ
+        subsetter.forceInvisible('\uFEFF'); // ZWNBSP
 
         // calculate deterministic tag based on the chosen subset
         Map<Integer, Integer> gidToCid = subsetter.getGIDMap();

Modified: 
pdfbox/branches/2.0/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java
URL: 
http://svn.apache.org/viewvc/pdfbox/branches/2.0/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java?rev=1924227&r1=1924226&r2=1924227&view=diff
==============================================================================
--- 
pdfbox/branches/2.0/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java
 (original)
+++ 
pdfbox/branches/2.0/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/font/TestFontEmbedding.java
 Sat Mar  8 09:29:12 2025
@@ -38,6 +38,7 @@ import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.common.PDRectangle;
 import org.apache.pdfbox.pdmodel.PDPageContentStream;
 import org.apache.pdfbox.pdmodel.PDPageContentStream.AppendMode;
+import org.apache.pdfbox.pdmodel.PDResources;
 import org.apache.pdfbox.rendering.TestPDFToImage;
 import org.apache.pdfbox.text.PDFTextStripper;
 
@@ -464,4 +465,51 @@ public class TestFontEmbedding extends T
             System.err.println("Rendering of " + pdf + " failed or is not 
identical to expected rendering in " + IN_DIR + " directory");
         }
     }
+
+    /**
+     * PDFBOX-5230: a zero-width character written with an embedded font must still be
+     * extractable as text, while its glyph has zero width and an empty outline.
+     *
+     * @throws IOException
+     */
+    public void testEmbeddedFontWithZeroWidthChars() throws IOException
+    {
+        String original = "AAA\u200CBBB";
+        PDDocument source = new PDDocument();
+        PDPage firstPage = new PDPage();
+        source.addPage(firstPage);
+        InputStream fontStream = PDFont.class.getResourceAsStream(
+                "/org/apache/pdfbox/resources/ttf/LiberationSans-Regular.ttf");
+        PDType0Font loadedFont = PDType0Font.load(source, fontStream);
+        PDPageContentStream contents = new PDPageContentStream(source, firstPage);
+        contents.beginText();
+        contents.setFont(loadedFont, 20);
+        contents.newLineAtOffset(50, 600);
+        contents.showText(original);
+        contents.endText();
+        contents.close();
+        ByteArrayOutputStream pdfBytes = new ByteArrayOutputStream();
+        source.save(pdfBytes);
+        source.close();
+
+        // verify that the text still contains zero-width characters
+        PDDocument reloaded = PDDocument.load(pdfBytes.toByteArray());
+        String extracted = new PDFTextStripper().getText(reloaded).trim();
+        assertEquals(original, extracted);
+        assertEquals(7, extracted.length());
+        assertEquals('\u200C', extracted.charAt(3));
+
+        // verify that the zero-width characters are invisible
+        PDResources resources = reloaded.getPage(0).getResources();
+        COSName fontName = resources.getFontNames().iterator().next();
+        PDType0Font embedded = (PDType0Font) resources.getFont(fontName);
+        byte[] bytes = embedded.encode('\u200C');
+        // assemble the two encoded bytes into one big-endian character code
+        int charCode = ((bytes[0] & 0xFF) << 8) | (bytes[1] & 0xFF);
+        assertEquals(0f, embedded.getWidth(charCode));
+        assertEquals(0f, embedded.getWidthFromFont(charCode));
+        assertTrue(embedded.getPath(charCode).getBounds2D().isEmpty());
+        assertFalse(embedded.isDamaged());
+        reloaded.close();
+    }
 }


Reply via email to