Author: jeremias
Date: Sun Jan  4 23:47:02 2009
New Revision: 731479

URL: http://svn.apache.org/viewvc?rev=731479&view=rev
Log:
FOP now creates ToUnicode CMaps for single-byte fonts that don't use built-in
encodings to help PDF text extractors interpret characters.
PDF CMaps now support single-byte characters.

Modified:
    
xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/AbstractCodePointMapping.java
    
xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/SimpleSingleByteEncoding.java
    xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/SingleByteEncoding.java
    xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/CMapBuilder.java
    xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFactory.java
    xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFont.java
    xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFontNonBase14.java
    xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFToUnicodeCMap.java
    xmlgraphics/fop/trunk/status.xml

Modified: 
xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/AbstractCodePointMapping.java
URL: 
http://svn.apache.org/viewvc/xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/AbstractCodePointMapping.java?rev=731479&r1=731478&r2=731479&view=diff
==============================================================================
--- 
xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/AbstractCodePointMapping.java
 (original)
+++ 
xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/AbstractCodePointMapping.java
 Sun Jan  4 23:47:02 2009
@@ -189,12 +189,7 @@
         return this.unicodeMap[idx];
     }
 
-    /**
-     * Returns a character array with Unicode scalar values which can be used 
to map encoding
-     * code points to Unicode values. Note that this does not return all 
possible Unicode values
-     * that the encoding maps.
-     * @return a character array with Unicode scalar values
-     */
+    /** {@inheritDoc} */
     public final char[] getUnicodeCharMap() {
         char[] copy = new char[this.unicodeMap.length];
         System.arraycopy(this.unicodeMap, 0, copy, 0, this.unicodeMap.length);

Modified: 
xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/SimpleSingleByteEncoding.java
URL: 
http://svn.apache.org/viewvc/xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/SimpleSingleByteEncoding.java?rev=731479&r1=731478&r2=731479&view=diff
==============================================================================
--- 
xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/SimpleSingleByteEncoding.java
 (original)
+++ 
xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/SimpleSingleByteEncoding.java
 Sun Jan  4 23:47:02 2009
@@ -25,6 +25,8 @@
 
 import org.apache.xmlgraphics.fonts.Glyphs;
 
+import org.apache.fop.util.CharUtilities;
+
 /**
  * A simple implementation of the OneByteEncoding mostly used for encodings 
that are constructed
  * on-the-fly.
@@ -138,6 +140,18 @@
     }
 
     /** {@inheritDoc} */
+    public char[] getUnicodeCharMap() {
+        char[] map = new char[getLastChar() + 1];
+        for (int i = 0; i < getFirstChar(); i++) {
+            map[i] = CharUtilities.NOT_A_CHARACTER;
+        }
+        for (int i = getFirstChar(); i <= getLastChar(); i++) {
+            map[i] = getCharacterForIndex(i).getSingleUnicodeValue();
+        }
+        return map;
+    }
+
+    /** {@inheritDoc} */
     public String toString() {
         return getName() + " (" + getSize() + " chars)";
     }

Modified: 
xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/SingleByteEncoding.java
URL: 
http://svn.apache.org/viewvc/xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/SingleByteEncoding.java?rev=731479&r1=731478&r2=731479&view=diff
==============================================================================
--- xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/SingleByteEncoding.java 
(original)
+++ xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/SingleByteEncoding.java 
Sun Jan  4 23:47:02 2009
@@ -47,4 +47,12 @@
      */
     String[] getCharNameMap();
 
+    /**
+     * Returns a character array with Unicode scalar values which can be used 
to map encoding
+     * code points to Unicode values. Note that this does not return all 
possible Unicode values
+     * that the encoding maps.
+     * @return a character array with Unicode scalar values
+     */
+    char[] getUnicodeCharMap();
+
 }

Modified: xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/CMapBuilder.java
URL: 
http://svn.apache.org/viewvc/xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/CMapBuilder.java?rev=731479&r1=731478&r2=731479&view=diff
==============================================================================
--- xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/CMapBuilder.java 
(original)
+++ xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/CMapBuilder.java Sun Jan  
4 23:47:02 2009
@@ -110,8 +110,16 @@
     }
 
     protected void writeCodeSpaceRange() throws IOException {
+        writeCodeSpaceRange(false);
+    }
+
+    protected void writeCodeSpaceRange(boolean singleByte) throws IOException {
         writer.write("1 begincodespacerange\n");
-        writer.write("<0000> <FFFF>\n");
+        if (singleByte) {
+            writer.write("<00> <FF>\n");
+        } else {
+            writer.write("<0000> <FFFF>\n");
+        }
         writer.write("endcodespacerange\n");
     }
 

Modified: xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFactory.java
URL: 
http://svn.apache.org/viewvc/xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFactory.java?rev=731479&r1=731478&r2=731479&view=diff
==============================================================================
--- xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFactory.java (original)
+++ xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFactory.java Sun Jan  
4 23:47:02 2009
@@ -1227,10 +1227,23 @@
             return preRegisteredfont;
         }
 
+        boolean forceToUnicode = true;
+
         if (descriptor == null) {
             //Usually Base 14 fonts
             PDFFont font = new PDFFont(fontname, FontType.TYPE1, basefont, 
encoding);
             getDocument().registerObject(font);
+            if (forceToUnicode && !PDFEncoding.isPredefinedEncoding(encoding)) 
{
+                SingleByteEncoding mapping;
+                if (encoding != null) {
+                    mapping = CodePointMapping.getMapping(encoding);
+                } else {
+                    //for Symbol and ZapfDingbats where encoding must be null 
in PDF
+                    Typeface tf = (Typeface)metrics;
+                    mapping = 
CodePointMapping.getMapping(tf.getEncodingName());
+                }
+                generateToUnicodeCmap(font, mapping);
+            }
             return font;
         } else {
             FontType fonttype = metrics.getFontType();
@@ -1266,7 +1279,7 @@
                         "fop-ucs-H",
                         new PDFCIDSystemInfo("Adobe",
                             "Identity",
-                            0));
+                            0), false);
                 getDocument().registerObject(cmap);
                 ((PDFFontType0)font).setCMAP(cmap);
                 ((PDFFontType0)font).setDescendantFonts(cidFont);
@@ -1290,8 +1303,13 @@
                 SingleByteEncoding mapping = singleByteFont.getEncoding();
                 if (singleByteFont.isSymbolicFont()) {
                     //no encoding, use the font's encoding
+                    if (forceToUnicode) {
+                        generateToUnicodeCmap(nonBase14, mapping);
+                    }
                 } else if 
(PDFEncoding.isPredefinedEncoding(mapping.getName())) {
                     font.setEncoding(mapping.getName());
+                    //No ToUnicode CMap necessary if PDF 1.4, chapter 5.9 
(page 368) is to be
+                    //believed.
                 } else {
                     Object pdfEncoding = createPDFEncoding(mapping,
                             singleByteFont.getFontName());
@@ -1300,16 +1318,9 @@
                     } else {
                         font.setEncoding((String)pdfEncoding);
                     }
-
-                    /* JM: What I thought would be a necessity with custom 
encodings turned out to
-                     * be a bug in Adobe Acrobat 8. The following section just 
demonstrates how
-                     * to generate a ToUnicode CMap for a Type 1 font.
-                    PDFCMap cmap = new 
PDFToUnicodeCMap(mapping.getUnicodeCharMap(),
-                            "fop-ucs-H",
-                            new PDFCIDSystemInfo("Adobe", "Identity", 0));
-                    getDocument().registerObject(cmap);
-                    nonBase14.setToUnicode(cmap);
-                    */
+                    if (forceToUnicode) {
+                        generateToUnicodeCmap(nonBase14, mapping);
+                    }
                 }
 
                 //Handle additional encodings (characters outside the primary 
encoding)
@@ -1330,6 +1341,9 @@
                                 new PDFArray(null, 
singleByteFont.getAdditionalWidths(i)));
                         getDocument().registerObject(addFont);
                         getDocument().getResources().addFont(addFont);
+                        if (forceToUnicode) {
+                            generateToUnicodeCmap(addFont, addEncoding);
+                        }
                     }
                 }
             }
@@ -1338,6 +1352,14 @@
         }
     }
 
+    private void generateToUnicodeCmap(PDFFont font, SingleByteEncoding 
encoding) {
+        PDFCMap cmap = new PDFToUnicodeCMap(encoding.getUnicodeCharMap(),
+                "fop-ucs-H",
+                new PDFCIDSystemInfo("Adobe", "Identity", 0), true);
+        getDocument().registerObject(cmap);
+        font.setToUnicode(cmap);
+    }
+
     /**
      * Creates a PDFEncoding instance from a CodePointMapping instance.
      * @param encoding the code point mapping (encoding)

Modified: xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFont.java
URL: 
http://svn.apache.org/viewvc/xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFont.java?rev=731479&r1=731478&r2=731479&view=diff
==============================================================================
--- xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFont.java (original)
+++ xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFont.java Sun Jan  4 
23:47:02 2009
@@ -86,6 +86,14 @@
     }
 
     /**
+     * Sets a ToUnicode CMap.
+     * @param cmap the ToUnicode character map
+     */
+    public void setToUnicode(PDFCMap cmap) {
+        put("ToUnicode", cmap);
+    }
+
+    /**
      * factory method with the basic parameters
      *
      * @param fontname the internal name for the font

Modified: 
xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFontNonBase14.java
URL: 
http://svn.apache.org/viewvc/xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFontNonBase14.java?rev=731479&r1=731478&r2=731479&view=diff
==============================================================================
--- xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFontNonBase14.java 
(original)
+++ xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFontNonBase14.java Sun 
Jan  4 23:47:02 2009
@@ -71,14 +71,6 @@
         return (PDFFontDescriptor)get("FontDescriptor");
     }
 
-    /**
-     * Sets a ToUnicode CMap.
-     * @param cmap the ToUnicode character map
-     */
-    public void setToUnicode(PDFCMap cmap) {
-        put("ToUnicode", cmap);
-    }
-
     /** {@inheritDoc} */
     protected void validate() {
         if (getDocumentSafely().getProfile().isFontEmbeddingRequired()) {

Modified: 
xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFToUnicodeCMap.java
URL: 
http://svn.apache.org/viewvc/xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFToUnicodeCMap.java?rev=731479&r1=731478&r2=731479&view=diff
==============================================================================
--- xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFToUnicodeCMap.java 
(original)
+++ xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFToUnicodeCMap.java Sun 
Jan  4 23:47:02 2009
@@ -43,6 +43,8 @@
      */
     protected char[] unicodeCharMap;
 
+    private boolean singleByte;
+
     /**
      * Constructor.
      *
@@ -51,10 +53,17 @@
      * @param name One of the registered names found in Table 5.14 in PDF
      * Reference, Second Edition.
      * @param sysInfo The attributes of the character collection of the 
CIDFont.
+     * @param singleByte true for single-byte, false for double-byte
      */
-    public PDFToUnicodeCMap(char[] unicodeCharMap, String name, 
PDFCIDSystemInfo sysInfo) {
+    public PDFToUnicodeCMap(char[] unicodeCharMap, String name, 
PDFCIDSystemInfo sysInfo,
+            boolean singleByte) {
         super(name, sysInfo);
+        if (singleByte && unicodeCharMap.length > 256) {
+            throw new IllegalArgumentException("unicodeCharMap may not contain 
more than"
+                    + " 256 characters for single-byte encodings");
+        }
         this.unicodeCharMap = unicodeCharMap;
+        this.singleByte = singleByte;
     }
 
     /** {@inheritDoc} */
@@ -78,7 +87,7 @@
             writeCIDSystemInfo("Adobe", "UCS", 0);
             writeName("Adobe-Identity-UCS");
             writeType("2");
-            writeCodeSpaceRange();
+            writeCodeSpaceRange(singleByte);
             writeBFEntries();
             writeWrapUp();
         }
@@ -122,7 +131,7 @@
                     while (partOfRange(charArray, charIndex)) {
                         charIndex++;
                     }
-                    writer.write("<" + 
padHexString(Integer.toHexString(charIndex), 4) + "> ");
+                    writer.write("<" + padCharIndex(charIndex) + "> ");
                     writer.write("<" + 
padHexString(Integer.toHexString(charArray[charIndex]), 4)
                             + ">\n");
                     charIndex++;
@@ -132,6 +141,10 @@
             } while (remainingEntries > 0);
         }
 
+        private String padCharIndex(int charIndex) {
+            return padHexString(Integer.toHexString(charIndex), (singleByte ? 
2 : 4));
+        }
+
         /**
          * Writes the entries for character ranges for a base font.
          * @param p StringBuffer to write to
@@ -159,9 +172,9 @@
                     while (!startOfRange(charArray, charIndex)) {
                         charIndex++;
                     }
-                    writer.write("<" + 
padHexString(Integer.toHexString(charIndex), 4) + "> ");
+                    writer.write("<" + padCharIndex(charIndex) + "> ");
                     writer.write("<"
-                            + 
padHexString(Integer.toHexString(endOfRange(charArray, charIndex)), 4)
+                            + padCharIndex(endOfRange(charArray, charIndex))
                             + "> ");
                     writer.write("<" + 
padHexString(Integer.toHexString(charArray[charIndex]), 4)
                             + ">\n");

Modified: xmlgraphics/fop/trunk/status.xml
URL: 
http://svn.apache.org/viewvc/xmlgraphics/fop/trunk/status.xml?rev=731479&r1=731478&r2=731479&view=diff
==============================================================================
--- xmlgraphics/fop/trunk/status.xml (original)
+++ xmlgraphics/fop/trunk/status.xml Sun Jan  4 23:47:02 2009
@@ -54,6 +54,10 @@
   <changes>
     <release version="FOP Trunk" date="TBD">
       <action context="Fonts" dev="JM" type="add">
+        FOP now creates ToUnicode CMaps for single-byte fonts that don't use built-in
+        encodings to help PDF text extractors interpret characters.
+      </action>
+      <action context="Fonts" dev="JM" type="add">
         Added support for forcing single-byte encodings for TrueType fonts 
without
         creating an XML font metric file (see "encoding-mode" attribute on 
"font" element)
       </action>



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to