[ 
https://issues.apache.org/jira/browse/PDFBOX-3132?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

John Hewson closed PDFBOX-3132.
-------------------------------
    Resolution: Won't Fix

We don't support this in 1.8. Use 2.0 instead.

> Cannot extract text which font is Type0 with predefined CJK CMap
> ----------------------------------------------------------------
>
>                 Key: PDFBOX-3132
>                 URL: https://issues.apache.org/jira/browse/PDFBOX-3132
>             Project: PDFBox
>          Issue Type: Improvement
>          Components: PDModel
>    Affects Versions: 1.8.9
>            Reporter: Raymond Wu
>         Attachments: pdf_font-zhcn.pdf
>
>
> {noformat}
> 20 0 obj
> <<
> /Type /Font
> /BaseFont /AdobeSongStd-Light,Bold-UniGB-UTF16-H
> /Subtype /Type0
> /Encoding /UniGB-UTF16-H
> /DescendantFonts [42 0 R]
> >>
> endobj
> {noformat}
> If the Type0 font is defined as above, CJK strings cannot be produced by
> org.apache.pdfbox.pdmodel.font.PDType0Font.
> PDType0Font only processes embedded CMaps, but ignores predefined CJK CMaps.
> So Chinese, Japanese, and Korean text using this font cannot be extracted.
> I modified the PDType0Font source as shown below, and it works.
> {code:title=PDType0Font.java|borderStyle=solid}
>     @Override
>     public String encode(byte[] c, int offset, int length) throws IOException
>     {
>         String retval = null;
>         if (hasToUnicode())
>         {
>             retval = super.encode(c, offset, length);
>         }
>         
>         if (retval == null)
>         {
>             int result = cmap.lookupCID(c, offset, length);
>             if (result != -1)
>             {
>                 retval = descendantFont.cmapEncoding(result, 2, true, null);
>             } else {
>               // Predefined CJK CMap
>               //
>               // PDF Source:
>               // 20 0 obj
>               // <<
>               // /Type /Font
>               // /BaseFont /AdobeSongStd-Light,Bold-UniGB-UTF16-H
>               // /Subtype /Type0
>               // /Encoding /UniGB-UTF16-H
>               // /DescendantFonts [42 0 R]
>               // >>
>               // endobj
>               //
>               
>               COSBase encoding = getEncoding();
>               if (length == 2 && encoding instanceof COSName)
>               {
>                               String encname = ((COSName)encoding).getName();
>                               String charset = 
> charsetOfPredefinedCJKCMap(encname);
>                               if (charset!=null) {
>                                       retval = new String(c, offset, length, 
> charset);
>                               }
>               }
>             }
>         }
>         
>         return retval;
>     }
>     /**
>      * Predefined CJK CMap name to Java charset name
>      * 
>      * @author Raymond Wu <[email protected]>
>      * @param  encname Predefined CJK CMap name
>      * @return Java charset name
>      */
>     public String charsetOfPredefinedCJKCMap(String encname) {
>               // PDF 32000-1:2008 Page 274
>               // Table 118 – Predefined CJK CMap names
>               //
>               // @See 
> http://collinssoftware.com/computer/robots/utility/html/documentation/FontEncoding.htm
>               // @See 
> https://docs.oracle.com/javase/8/docs/technotes/guides/intl/encoding.doc.html
>               // Unicode
>               if (encname.contains("UTF16")) return "UTF-16BE";
>               if (encname.contains("UCS2"))  return "UTF-16BE";
>               
>               // Chinese (Traditional)
>               // @See https://zh.wikipedia.org/wiki/巴別塔
>               if (encname.startsWith("B5pc-"))   return "BIG5";
>               if (encname.startsWith("HKscs-"))  return "MS950_HKSCS";
>               if (encname.startsWith("ETen-"))   return "MS950";
>               if (encname.startsWith("ETenms-")) return "MS950";
>               if (encname.startsWith("CNS-"))    return "EUC-TW";
>               // Chinese (Simplified)
>               if (encname.startsWith("GB-"))    return "MS936";
>               if (encname.startsWith("GBpc-"))  return "GB2312";
>               if (encname.startsWith("GBK-"))   return "MS936";
>               if (encname.startsWith("GBKp-"))  return "MS936";
>               if (encname.startsWith("GBK2K-")) return "GB18030";
>               // Japanese
>               if (encname.startsWith("83pv-"))  return "JISAutoDetect"; // 
> JIS X 0208 + KanjiTalk6 (漢字6)
>               if (encname.startsWith("90ms-"))  return "JISAutoDetect"; // 
> MS932
>               if (encname.startsWith("90msp-")) return "JISAutoDetect"; // 
> MS932
>               if (encname.startsWith("90pv-"))  return "JISAutoDetect"; // 
> JIS X 0208 + KanjiTalk7 (漢字7)
>               if (encname.startsWith("Add-"))   return "JISAutoDetect"; // 
> JIS X 0208 + Fujitsu FMR
>               if (encname.startsWith("EUC-"))   return "JISAutoDetect"; // 
> JIS X 0208
>               if (encname.startsWith("Ext-"))   return "JISAutoDetect"; // 
> JIS C 6226 + NEC
>               if (encname.equals("H"))          return "JISAutoDetect"; // 
> ISO-2022-JP
>               if (encname.equals("V"))          return "JISAutoDetect"; // 
> ISO-2022-JP
>               // Korean
>               if (encname.startsWith("KSC-"))   return "EUC_KR";
>               if (encname.startsWith("KSCms-")) return "MS949";
>               if (encname.startsWith("KSCpc-")) return "EUC_KR";
>       return null;
>     }
> {code}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to