Raymond Wu created PDFBOX-3132:
----------------------------------

             Summary: Cannot extract text which font is Type0 with predefined 
CJK CMap
                 Key: PDFBOX-3132
                 URL: https://issues.apache.org/jira/browse/PDFBOX-3132
             Project: PDFBox
          Issue Type: Improvement
          Components: PDModel
    Affects Versions: 1.8.9
            Reporter: Raymond Wu


20 0 obj
<<
/Type /Font
/BaseFont /AdobeSongStd-Light,Bold-UniGB-UTF16-H
/Subtype /Type0
/Encoding /UniGB-UTF16-H
/DescendantFonts [42 0 R]
>>
endobj

If a Type0 font is declared as above, CJK strings cannot be produced by 
org.apache.pdfbox.pdmodel.font.PDType0Font.
PDType0Font only processes embedded CMaps and ignores predefined CJK CMaps,
so Chinese, Japanese, and Korean text that uses such a font cannot be extracted.

I modified the PDType0Font source as shown below, and with this change extraction works.

    /**
     * Converts the character code stored in {@code c} into a string.
     *
     * <p>Three sources are tried in order, stopping at the first that applies:
     * <ol>
     *   <li>the superclass conversion, consulted only when {@code hasToUnicode()} is true;</li>
     *   <li>a CID lookup in {@code cmap}, resolved through the descendant font;</li>
     *   <li>a direct charset decode, used when the font's encoding is a name that
     *       {@code charsetOfPredefinedCJKCMap} can map and the code is exactly two bytes long.</li>
     * </ol>
     *
     * @param c      buffer holding the character code
     * @param offset index of the first byte of the code within {@code c}
     * @param length number of bytes that make up the code
     * @return the decoded string, or {@code null} when none of the sources yields one
     * @throws IOException if one of the underlying conversions fails
     */
    @Override
    public String encode(byte[] c, int offset, int length) throws IOException
    {
        String fromToUnicode = hasToUnicode() ? super.encode(c, offset, length) : null;
        if (fromToUnicode != null)
        {
            return fromToUnicode;
        }

        int cid = cmap.lookupCID(c, offset, length);
        if (cid != -1)
        {
            return descendantFont.cmapEncoding(cid, 2, true, null);
        }

        // No CID was found: fall back to a predefined CJK CMap, i.e. a font whose
        // /Encoding entry is a name rather than an embedded CMap stream, e.g.
        //
        //   20 0 obj
        //   <<
        //   /Type /Font
        //   /BaseFont /AdobeSongStd-Light,Bold-UniGB-UTF16-H
        //   /Subtype /Type0
        //   /Encoding /UniGB-UTF16-H
        //   /DescendantFonts [42 0 R]
        //   >>
        //   endobj
        COSBase encoding = getEncoding();
        if (length != 2 || !(encoding instanceof COSName))
        {
            return null;
        }

        String cmapName = ((COSName) encoding).getName();
        String charsetName = charsetOfPredefinedCJKCMap(cmapName);
        if (charsetName == null)
        {
            return null;
        }
        return new String(c, offset, length, charsetName);
    }

    /**
     * Predefined CJK CMap name to Java charset name
     * 
     * @author Raymond Wu <[email protected]>
     * @param  encname Predefined CJK CMap name
     * @return Java charset name
     */
    public String charsetOfPredefinedCJKCMap(String encname) {
        // PDF 32000-1:2008 Page 274
                // Table 118 – Predefined CJK CMap names
        //
                // @See 
http://collinssoftware.com/computer/robots/utility/html/documentation/FontEncoding.htm
                // @See 
https://docs.oracle.com/javase/8/docs/technotes/guides/intl/encoding.doc.html

        // Unicode
                if (encname.contains("UTF16")) return "UTF-16BE";
                if (encname.contains("UCS2"))  return "UTF-16BE";
                
                // Chinese (Traditional)
                // @See https://zh.wikipedia.org/wiki/巴別塔
                if (encname.startsWith("B5pc-"))   return "BIG5";
                if (encname.startsWith("HKscs-"))  return "MS950_HKSCS";
                if (encname.startsWith("ETen-"))   return "MS950";
                if (encname.startsWith("ETenms-")) return "MS950";
                if (encname.startsWith("CNS-"))    return "EUC-TW";

                // Chinese (Simplified)
                if (encname.startsWith("GB-"))    return "MS936";
                if (encname.startsWith("GBpc-"))  return "GB2312";
                if (encname.startsWith("GBK-"))   return "MS936";
                if (encname.startsWith("GBKp-"))  return "MS936";
                if (encname.startsWith("GBK2K-")) return "GB18030";

                // Japanese
                if (encname.startsWith("83pv-"))  return "JISAutoDetect"; // 
JIS X 0208 + KanjiTalk6 (漢字6)
                if (encname.startsWith("90ms-"))  return "JISAutoDetect"; // 
MS932
                if (encname.startsWith("90msp-")) return "JISAutoDetect"; // 
MS932
                if (encname.startsWith("90pv-"))  return "JISAutoDetect"; // 
JIS X 0208 + KanjiTalk7 (漢字7)
                if (encname.startsWith("Add-"))   return "JISAutoDetect"; // 
JIS X 0208 + Fujitsu FMR
                if (encname.startsWith("EUC-"))   return "JISAutoDetect"; // 
JIS X 0208
                if (encname.startsWith("Ext-"))   return "JISAutoDetect"; // 
JIS C 6226 + NEC
                if (encname.equals("H"))          return "JISAutoDetect"; // 
ISO-2022-JP
                if (encname.equals("V"))          return "JISAutoDetect"; // 
ISO-2022-JP

                // Korean
                if (encname.startsWith("KSC-"))   return "EUC_KR";
                if (encname.startsWith("KSCms-")) return "MS949";
                if (encname.startsWith("KSCpc-")) return "EUC_KR";

        return null;
    }



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to