Author: lehmi
Date: Wed Mar 10 18:11:12 2010
New Revision: 921494

URL: http://svn.apache.org/viewvc?rev=921494&view=rev
Log:
PDFBOX-654: added the ability to extract CJK text. Patch by Atsuo Ishimoto 
(ishimoto at gembook dot org)

Modified:
    pdfbox/trunk/src/main/java/org/apache/pdfbox/cos/COSName.java
    pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java
    pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java

Modified: pdfbox/trunk/src/main/java/org/apache/pdfbox/cos/COSName.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/src/main/java/org/apache/pdfbox/cos/COSName.java?rev=921494&r1=921493&r2=921494&view=diff
==============================================================================
--- pdfbox/trunk/src/main/java/org/apache/pdfbox/cos/COSName.java (original)
+++ pdfbox/trunk/src/main/java/org/apache/pdfbox/cos/COSName.java Wed Mar 10 18:11:12 2010
@@ -149,6 +149,10 @@ public final class COSName extends COSBa
     /**
     * A common COSName value.
     */
+    public static final COSName CIDSYSTEMINFO = new COSName( "CIDSystemInfo" );
+    /**
+    * A common COSName value.
+    */
     public static final COSName COLORSPACE = new COSName( "ColorSpace" );
     /**
     * A common COSName value.
@@ -465,9 +469,13 @@ public final class COSName extends COSBa
     public static final COSName OPEN_ACTION = new COSName("OpenAction");
 
     /**
-    * A common COSName value.
-    */
-    public static final COSName P = new COSName( "P" );
+     * A common COSName value.
+     */
+     public static final COSName ORDERING = new COSName( "Ordering" );
+     /**
+      * A common COSName value.
+      */
+      public static final COSName P = new COSName( "P" );
     /**
     * A common COSName value.
     */
@@ -509,7 +517,11 @@ public final class COSName extends COSBa
     /**
      * A common COSName value.
      */
-     public static final COSName R = new COSName( "R" );
+    public static final COSName R = new COSName( "R" );
+     /**
+      * A common COSName value.
+      */
+    public static final COSName REGISTRY = new COSName( "Registry" );
     /**
     * A common COSName value.
     */

Modified: pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java?rev=921494&r1=921493&r2=921494&view=diff
==============================================================================
--- pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java (original)
+++ pdfbox/trunk/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java Wed Mar 10 18:11:12 2010
@@ -28,7 +28,7 @@ import java.util.HashMap;
 public class CMapSubstitution 
 {
 
-    private static HashMap cmapSubstitutions = new HashMap();
+    private static HashMap<String,String> cmapSubstitutions = new HashMap<String,String>();
 
     private CMapSubstitution()
     {
@@ -63,7 +63,6 @@ public class CMapSubstitution 
         cmapSubstitutions.put( "90pv-RKSJ-H", "90pv-RKSJ-UCS2");
         cmapSubstitutions.put( "UniJIS-UCS2-HW-H", "UniJIS-UCS2-H" );
         cmapSubstitutions.put( "Adobe-Japan1-4", "Adobe-Japan1-UCS2");
-        cmapSubstitutions.put( "Identity-H", "Adobe-Japan1-UCS2");
 
     }
 

Modified: pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java?rev=921494&r1=921493&r2=921494&view=diff
==============================================================================
--- pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java (original)
+++ pdfbox/trunk/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java Wed Mar 10 18:11:12 2010
@@ -20,11 +20,14 @@ import org.apache.fontbox.afm.AFMParser;
 import org.apache.fontbox.afm.FontMetric;
 import org.apache.fontbox.cmap.CMapParser;
 import org.apache.fontbox.cmap.CMap;
+import org.apache.pdfbox.encoding.conversion.EncodingConversionManager;
+import org.apache.pdfbox.encoding.conversion.EncodingConverter;
 
 import org.apache.pdfbox.encoding.AFMEncoding;
 import org.apache.pdfbox.encoding.DictionaryEncoding;
 import org.apache.pdfbox.encoding.Encoding;
 import org.apache.pdfbox.encoding.EncodingManager;
+import org.apache.pdfbox.encoding.conversion.CMapSubstitution;
 
 import org.apache.pdfbox.cos.COSArray;
 import org.apache.pdfbox.cos.COSBase;
@@ -395,6 +398,30 @@ public abstract class PDFont implements 
                         if( cmap == null )
                         {
                             String cmapName = encodingName.getName();
+                            if (encodingName.getName().equals( COSName.IDENTITY_H.getName() ))
+                            {
+                                COSArray descendantFontArray =
+                                    (COSArray)font.getDictionaryObject( COSName.DESCENDANT_FONTS );
+                                if (descendantFontArray != null)
+                                {
+                                    COSDictionary descendantFontDictionary =
+                                        (COSDictionary)descendantFontArray.getObject( 0 );
+                                    PDFont descendentFont = PDFontFactory.createFont( descendantFontDictionary );
+                                    COSDictionary cidsysteminfo =
+                                        (COSDictionary)descendentFont.font.getDictionaryObject(COSName.CIDSYSTEMINFO);
+                                    if (cidsysteminfo != null)
+                                    {
+                                        String ordering = cidsysteminfo.getString(COSName.ORDERING);
+                                        String registry = cidsysteminfo.getString(COSName.REGISTRY);
+                                        cmapName = registry + "-" + ordering+"-UCS2";
+                                    }
+                                }
+                            }
+                            else
+                            {
+                                cmapName = CMapSubstitution.substituteCMap( cmapName );
+                            }
+                            
                             String resourceRoot = "Resources/cmap/";
                             String resourceName = resourceRoot + cmapName;
                             parseCmap( resourceRoot, ResourceLoader.loadResource( resourceName ), encodingName );
@@ -442,6 +469,25 @@ public abstract class PDFont implements 
         {
             retval = cmap.lookup( c, offset, length );
         }
+        
+        COSBase encodingCOS = font.getDictionaryObject(COSName.ENCODING);
+        if ( encodingCOS instanceof COSName ) 
+        {
+            EncodingConverter converter = EncodingConversionManager.getConverter(((COSName)encodingCOS).getName());
+            if ( converter != null ) 
+            {
+                if ( retval != null )
+                {
+                    retval = converter.convertString(retval);
+                }
+                else
+                {
+                    retval = converter.convertBytes(c, offset, length, cmap);
+                }
+                return retval;
+            }
+        }
+        
         //if we havn't found a value yet and
         //we are still on the first byte and
         //there is no cmap or the cmap does not have 2 byte mappings then try to encode


Reply via email to