Author: lehmi
Date: Wed Mar 10 18:11:12 2010
New Revision: 921494
URL: http://svn.apache.org/viewvc?rev=921494&view=rev
Log:
PDFBOX-654: added the ability to extract CJK text. Patch by Atsuo Ishimoto
(ishimoto at gembook dot org)
Modified:
pdfbox/trunk/src/main/java/org/apache/pdfbox/cos/COSName.java
pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java
pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java
Modified: pdfbox/trunk/src/main/java/org/apache/pdfbox/cos/COSName.java
URL:
http://svn.apache.org/viewvc/pdfbox/trunk/src/main/java/org/apache/pdfbox/cos/COSName.java?rev=921494&r1=921493&r2=921494&view=diff
==============================================================================
--- pdfbox/trunk/src/main/java/org/apache/pdfbox/cos/COSName.java (original)
+++ pdfbox/trunk/src/main/java/org/apache/pdfbox/cos/COSName.java Wed Mar 10 18:11:12 2010
@@ -149,6 +149,10 @@ public final class COSName extends COSBa
/**
* A common COSName value.
*/
+ public static final COSName CIDSYSTEMINFO = new COSName( "CIDSystemInfo" );
+ /**
+ * A common COSName value.
+ */
public static final COSName COLORSPACE = new COSName( "ColorSpace" );
/**
* A common COSName value.
@@ -465,9 +469,13 @@ public final class COSName extends COSBa
public static final COSName OPEN_ACTION = new COSName("OpenAction");
/**
- * A common COSName value.
- */
- public static final COSName P = new COSName( "P" );
+ * A common COSName value.
+ */
+ public static final COSName ORDERING = new COSName( "Ordering" );
+ /**
+ * A common COSName value.
+ */
+ public static final COSName P = new COSName( "P" );
/**
* A common COSName value.
*/
@@ -509,7 +517,11 @@ public final class COSName extends COSBa
/**
* A common COSName value.
*/
- public static final COSName R = new COSName( "R" );
+ public static final COSName R = new COSName( "R" );
+ /**
+ * A common COSName value.
+ */
+ public static final COSName REGISTRY = new COSName( "Registry" );
/**
* A common COSName value.
*/
Modified:
pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java
URL:
http://svn.apache.org/viewvc/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java?rev=921494&r1=921493&r2=921494&view=diff
==============================================================================
--- pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java (original)
+++ pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java Wed Mar 10 18:11:12 2010
@@ -28,7 +28,7 @@ import java.util.HashMap;
public class CMapSubstitution
{
- private static HashMap cmapSubstitutions = new HashMap();
+ private static HashMap<String,String> cmapSubstitutions = new HashMap<String,String>();
private CMapSubstitution()
{
@@ -63,7 +63,6 @@ public class CMapSubstitution
cmapSubstitutions.put( "90pv-RKSJ-H", "90pv-RKSJ-UCS2");
cmapSubstitutions.put( "UniJIS-UCS2-HW-H", "UniJIS-UCS2-H" );
cmapSubstitutions.put( "Adobe-Japan1-4", "Adobe-Japan1-UCS2");
- cmapSubstitutions.put( "Identity-H", "Adobe-Japan1-UCS2");
}
Modified: pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java
URL:
http://svn.apache.org/viewvc/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java?rev=921494&r1=921493&r2=921494&view=diff
==============================================================================
--- pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java (original)
+++ pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java Wed Mar 10 18:11:12 2010
@@ -20,11 +20,14 @@ import org.apache.fontbox.afm.AFMParser;
import org.apache.fontbox.afm.FontMetric;
import org.apache.fontbox.cmap.CMapParser;
import org.apache.fontbox.cmap.CMap;
+import org.apache.pdfbox.encoding.conversion.EncodingConversionManager;
+import org.apache.pdfbox.encoding.conversion.EncodingConverter;
import org.apache.pdfbox.encoding.AFMEncoding;
import org.apache.pdfbox.encoding.DictionaryEncoding;
import org.apache.pdfbox.encoding.Encoding;
import org.apache.pdfbox.encoding.EncodingManager;
+import org.apache.pdfbox.encoding.conversion.CMapSubstitution;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
@@ -395,6 +398,30 @@ public abstract class PDFont implements
if( cmap == null )
{
String cmapName = encodingName.getName();
+ if (encodingName.getName().equals( COSName.IDENTITY_H.getName() ))
+ {
+ COSArray descendantFontArray =
+ (COSArray)font.getDictionaryObject( COSName.DESCENDANT_FONTS );
+ if (descendantFontArray != null)
+ {
+ COSDictionary descendantFontDictionary =
+ (COSDictionary)descendantFontArray.getObject( 0 );
+ PDFont descendentFont = PDFontFactory.createFont( descendantFontDictionary );
+ COSDictionary cidsysteminfo =
+ (COSDictionary)descendentFont.font.getDictionaryObject(COSName.CIDSYSTEMINFO);
+ if (cidsysteminfo != null)
+ {
+ String ordering = cidsysteminfo.getString(COSName.ORDERING);
+ String registry = cidsysteminfo.getString(COSName.REGISTRY);
+ cmapName = registry + "-" + ordering+"-UCS2";
+ }
+ }
+ }
+ else
+ {
+ cmapName = CMapSubstitution.substituteCMap( cmapName );
+ }
+
String resourceRoot = "Resources/cmap/";
String resourceName = resourceRoot + cmapName;
parseCmap( resourceRoot,
ResourceLoader.loadResource( resourceName ), encodingName );
@@ -442,6 +469,25 @@ public abstract class PDFont implements
{
retval = cmap.lookup( c, offset, length );
}
+
+ COSBase encodingCOS = font.getDictionaryObject(COSName.ENCODING);
+ if ( encodingCOS instanceof COSName )
+ {
+ EncodingConverter converter = EncodingConversionManager.getConverter(((COSName)encodingCOS).getName());
+ if ( converter != null )
+ {
+ if ( retval != null )
+ {
+ retval = converter.convertString(retval);
+ }
+ else
+ {
+ retval = converter.convertBytes(c, offset, length, cmap);
+ }
+ return retval;
+ }
+ }
+
//if we havn't found a value yet and
//we are still on the first byte and
//there is no cmap or the cmap does not have 2 byte mappings then try to encode