Author: jeremias
Date: Sun Jan 4 23:47:02 2009
New Revision: 731479
URL: http://svn.apache.org/viewvc?rev=731479&view=rev
Log:
FOP now creates ToUnicode CMaps for single-byte fonts that don't use built-in
encodings to help PDF text extractors interpret characters.
PDF CMaps now support single-byte characters.
Modified:
xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/AbstractCodePointMapping.java
xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/SimpleSingleByteEncoding.java
xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/SingleByteEncoding.java
xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/CMapBuilder.java
xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFactory.java
xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFont.java
xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFontNonBase14.java
xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFToUnicodeCMap.java
xmlgraphics/fop/trunk/status.xml
Modified:
xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/AbstractCodePointMapping.java
URL:
http://svn.apache.org/viewvc/xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/AbstractCodePointMapping.java?rev=731479&r1=731478&r2=731479&view=diff
==============================================================================
---
xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/AbstractCodePointMapping.java
(original)
+++
xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/AbstractCodePointMapping.java
Sun Jan 4 23:47:02 2009
@@ -189,12 +189,7 @@
return this.unicodeMap[idx];
}
- /**
- * Returns a character array with Unicode scalar values which can be used to map encoding
- * code points to Unicode values. Note that this does not return all possible Unicode values
- * that the encoding maps.
- * @return a character array with Unicode scalar values
- */
+ /** {@inheritDoc} */
public final char[] getUnicodeCharMap() {
char[] copy = new char[this.unicodeMap.length];
System.arraycopy(this.unicodeMap, 0, copy, 0, this.unicodeMap.length);
Modified:
xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/SimpleSingleByteEncoding.java
URL:
http://svn.apache.org/viewvc/xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/SimpleSingleByteEncoding.java?rev=731479&r1=731478&r2=731479&view=diff
==============================================================================
---
xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/SimpleSingleByteEncoding.java
(original)
+++
xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/SimpleSingleByteEncoding.java
Sun Jan 4 23:47:02 2009
@@ -25,6 +25,8 @@
import org.apache.xmlgraphics.fonts.Glyphs;
+import org.apache.fop.util.CharUtilities;
+
/**
* A simple implementation of the OneByteEncoding mostly used for encodings that are constructed
* on-the-fly.
@@ -138,6 +140,18 @@
}
/** {@inheritDoc} */
+ public char[] getUnicodeCharMap() {
+ char[] map = new char[getLastChar() + 1];
+ for (int i = 0; i < getFirstChar(); i++) {
+ map[i] = CharUtilities.NOT_A_CHARACTER;
+ }
+ for (int i = getFirstChar(); i <= getLastChar(); i++) {
+ map[i] = getCharacterForIndex(i).getSingleUnicodeValue();
+ }
+ return map;
+ }
+
+ /** {@inheritDoc} */
public String toString() {
return getName() + " (" + getSize() + " chars)";
}
Modified:
xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/SingleByteEncoding.java
URL:
http://svn.apache.org/viewvc/xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/SingleByteEncoding.java?rev=731479&r1=731478&r2=731479&view=diff
==============================================================================
--- xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/SingleByteEncoding.java
(original)
+++ xmlgraphics/fop/trunk/src/java/org/apache/fop/fonts/SingleByteEncoding.java
Sun Jan 4 23:47:02 2009
@@ -47,4 +47,12 @@
*/
String[] getCharNameMap();
+ /**
+ * Returns a character array with Unicode scalar values which can be used to map encoding
+ * code points to Unicode values. Note that this does not return all possible Unicode values
+ * that the encoding maps.
+ * @return a character array with Unicode scalar values
+ */
+ char[] getUnicodeCharMap();
+
}
Modified: xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/CMapBuilder.java
URL:
http://svn.apache.org/viewvc/xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/CMapBuilder.java?rev=731479&r1=731478&r2=731479&view=diff
==============================================================================
--- xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/CMapBuilder.java
(original)
+++ xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/CMapBuilder.java Sun Jan
4 23:47:02 2009
@@ -110,8 +110,16 @@
}
protected void writeCodeSpaceRange() throws IOException {
+ writeCodeSpaceRange(false);
+ }
+
+ protected void writeCodeSpaceRange(boolean singleByte) throws IOException {
writer.write("1 begincodespacerange\n");
- writer.write("<0000> <FFFF>\n");
+ if (singleByte) {
+ writer.write("<00> <FF>\n");
+ } else {
+ writer.write("<0000> <FFFF>\n");
+ }
writer.write("endcodespacerange\n");
}
Modified: xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFactory.java
URL:
http://svn.apache.org/viewvc/xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFactory.java?rev=731479&r1=731478&r2=731479&view=diff
==============================================================================
--- xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFactory.java (original)
+++ xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFactory.java Sun Jan
4 23:47:02 2009
@@ -1227,10 +1227,23 @@
return preRegisteredfont;
}
+ boolean forceToUnicode = true;
+
if (descriptor == null) {
//Usually Base 14 fonts
PDFFont font = new PDFFont(fontname, FontType.TYPE1, basefont,
encoding);
getDocument().registerObject(font);
+ if (forceToUnicode && !PDFEncoding.isPredefinedEncoding(encoding))
{
+ SingleByteEncoding mapping;
+ if (encoding != null) {
+ mapping = CodePointMapping.getMapping(encoding);
+ } else {
+ //for Symbol and ZapfDingbats where encoding must be null in PDF
+ Typeface tf = (Typeface)metrics;
+ mapping =
CodePointMapping.getMapping(tf.getEncodingName());
+ }
+ generateToUnicodeCmap(font, mapping);
+ }
return font;
} else {
FontType fonttype = metrics.getFontType();
@@ -1266,7 +1279,7 @@
"fop-ucs-H",
new PDFCIDSystemInfo("Adobe",
"Identity",
- 0));
+ 0), false);
getDocument().registerObject(cmap);
((PDFFontType0)font).setCMAP(cmap);
((PDFFontType0)font).setDescendantFonts(cidFont);
@@ -1290,8 +1303,13 @@
SingleByteEncoding mapping = singleByteFont.getEncoding();
if (singleByteFont.isSymbolicFont()) {
//no encoding, use the font's encoding
+ if (forceToUnicode) {
+ generateToUnicodeCmap(nonBase14, mapping);
+ }
} else if
(PDFEncoding.isPredefinedEncoding(mapping.getName())) {
font.setEncoding(mapping.getName());
+ //No ToUnicode CMap necessary if PDF 1.4, chapter 5.9 (page 368) is to be
+ //believed.
} else {
Object pdfEncoding = createPDFEncoding(mapping,
singleByteFont.getFontName());
@@ -1300,16 +1318,9 @@
} else {
font.setEncoding((String)pdfEncoding);
}
-
- /* JM: What I thought would be a necessity with custom
encodings turned out to
- * be a bug in Adobe Acrobat 8. The following section just
demonstrates how
- * to generate a ToUnicode CMap for a Type 1 font.
- PDFCMap cmap = new
PDFToUnicodeCMap(mapping.getUnicodeCharMap(),
- "fop-ucs-H",
- new PDFCIDSystemInfo("Adobe", "Identity", 0));
- getDocument().registerObject(cmap);
- nonBase14.setToUnicode(cmap);
- */
+ if (forceToUnicode) {
+ generateToUnicodeCmap(nonBase14, mapping);
+ }
}
//Handle additional encodings (characters outside the primary
encoding)
@@ -1330,6 +1341,9 @@
new PDFArray(null,
singleByteFont.getAdditionalWidths(i)));
getDocument().registerObject(addFont);
getDocument().getResources().addFont(addFont);
+ if (forceToUnicode) {
+ generateToUnicodeCmap(addFont, addEncoding);
+ }
}
}
}
@@ -1338,6 +1352,14 @@
}
}
+ private void generateToUnicodeCmap(PDFFont font, SingleByteEncoding
encoding) {
+ PDFCMap cmap = new PDFToUnicodeCMap(encoding.getUnicodeCharMap(),
+ "fop-ucs-H",
+ new PDFCIDSystemInfo("Adobe", "Identity", 0), true);
+ getDocument().registerObject(cmap);
+ font.setToUnicode(cmap);
+ }
+
/**
* Creates a PDFEncoding instance from a CodePointMapping instance.
* @param encoding the code point mapping (encoding)
Modified: xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFont.java
URL:
http://svn.apache.org/viewvc/xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFont.java?rev=731479&r1=731478&r2=731479&view=diff
==============================================================================
--- xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFont.java (original)
+++ xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFont.java Sun Jan 4
23:47:02 2009
@@ -86,6 +86,14 @@
}
/**
+ * Sets a ToUnicode CMap.
+ * @param cmap the ToUnicode character map
+ */
+ public void setToUnicode(PDFCMap cmap) {
+ put("ToUnicode", cmap);
+ }
+
+ /**
* factory method with the basic parameters
*
* @param fontname the internal name for the font
Modified:
xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFontNonBase14.java
URL:
http://svn.apache.org/viewvc/xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFontNonBase14.java?rev=731479&r1=731478&r2=731479&view=diff
==============================================================================
--- xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFontNonBase14.java
(original)
+++ xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFFontNonBase14.java Sun
Jan 4 23:47:02 2009
@@ -71,14 +71,6 @@
return (PDFFontDescriptor)get("FontDescriptor");
}
- /**
- * Sets a ToUnicode CMap.
- * @param cmap the ToUnicode character map
- */
- public void setToUnicode(PDFCMap cmap) {
- put("ToUnicode", cmap);
- }
-
/** {@inheritDoc} */
protected void validate() {
if (getDocumentSafely().getProfile().isFontEmbeddingRequired()) {
Modified:
xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFToUnicodeCMap.java
URL:
http://svn.apache.org/viewvc/xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFToUnicodeCMap.java?rev=731479&r1=731478&r2=731479&view=diff
==============================================================================
--- xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFToUnicodeCMap.java
(original)
+++ xmlgraphics/fop/trunk/src/java/org/apache/fop/pdf/PDFToUnicodeCMap.java Sun
Jan 4 23:47:02 2009
@@ -43,6 +43,8 @@
*/
protected char[] unicodeCharMap;
+ private boolean singleByte;
+
/**
* Constructor.
*
@@ -51,10 +53,17 @@
* @param name One of the registered names found in Table 5.14 in PDF
* Reference, Second Edition.
* @param sysInfo The attributes of the character collection of the
CIDFont.
+ * @param singleByte true for single-byte, false for double-byte
*/
- public PDFToUnicodeCMap(char[] unicodeCharMap, String name,
PDFCIDSystemInfo sysInfo) {
+ public PDFToUnicodeCMap(char[] unicodeCharMap, String name,
PDFCIDSystemInfo sysInfo,
+ boolean singleByte) {
super(name, sysInfo);
+ if (singleByte && unicodeCharMap.length > 256) {
+ throw new IllegalArgumentException("unicodeCharMap may not contain
more than"
+ + " 256 characters for single-byte encodings");
+ }
this.unicodeCharMap = unicodeCharMap;
+ this.singleByte = singleByte;
}
/** {@inheritDoc} */
@@ -78,7 +87,7 @@
writeCIDSystemInfo("Adobe", "UCS", 0);
writeName("Adobe-Identity-UCS");
writeType("2");
- writeCodeSpaceRange();
+ writeCodeSpaceRange(singleByte);
writeBFEntries();
writeWrapUp();
}
@@ -122,7 +131,7 @@
while (partOfRange(charArray, charIndex)) {
charIndex++;
}
- writer.write("<" +
padHexString(Integer.toHexString(charIndex), 4) + "> ");
+ writer.write("<" + padCharIndex(charIndex) + "> ");
writer.write("<" +
padHexString(Integer.toHexString(charArray[charIndex]), 4)
+ ">\n");
charIndex++;
@@ -132,6 +141,10 @@
} while (remainingEntries > 0);
}
+ private String padCharIndex(int charIndex) {
+ return padHexString(Integer.toHexString(charIndex), (singleByte ?
2 : 4));
+ }
+
/**
* Writes the entries for character ranges for a base font.
* @param p StringBuffer to write to
@@ -159,9 +172,9 @@
while (!startOfRange(charArray, charIndex)) {
charIndex++;
}
- writer.write("<" +
padHexString(Integer.toHexString(charIndex), 4) + "> ");
+ writer.write("<" + padCharIndex(charIndex) + "> ");
writer.write("<"
- +
padHexString(Integer.toHexString(endOfRange(charArray, charIndex)), 4)
+ + padCharIndex(endOfRange(charArray, charIndex))
+ "> ");
writer.write("<" +
padHexString(Integer.toHexString(charArray[charIndex]), 4)
+ ">\n");
Modified: xmlgraphics/fop/trunk/status.xml
URL:
http://svn.apache.org/viewvc/xmlgraphics/fop/trunk/status.xml?rev=731479&r1=731478&r2=731479&view=diff
==============================================================================
--- xmlgraphics/fop/trunk/status.xml (original)
+++ xmlgraphics/fop/trunk/status.xml Sun Jan 4 23:47:02 2009
@@ -54,6 +54,10 @@
<changes>
<release version="FOP Trunk" date="TBD">
<action context="Fonts" dev="JM" type="add">
+ FOP now creates ToUnicode CMaps for single-byte fonts that don't use
built-in
+ encodings to help PDF text extractors interpreting characters.
+ </action>
+ <action context="Fonts" dev="JM" type="add">
Added support for forcing single-byte encodings for TrueType fonts
without
creating an XML font metric file (see "encoding-mode" attribute on
"font" element)
</action>
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]