[ 
https://issues.apache.org/jira/browse/PDFBOX-3132?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Raymond Wu updated PDFBOX-3132:
-------------------------------
    Description: 
{noformat}
20 0 obj
<<
/Type /Font
/BaseFont /AdobeSongStd-Light,Bold-UniGB-UTF16-H
/Subtype /Type0
/Encoding /UniGB-UTF16-H
/DescendantFonts [42 0 R]
>>
endobj
{noformat}

If the Type0 font is like the one above, CJK strings cannot be produced by 
org.apache.pdfbox.pdmodel.font.PDType0Font.
PDType0Font only processes embedded CMaps and ignores predefined CJK CMaps,
so Chinese, Japanese and Korean text using this font cannot be extracted.

I have tried to modify PDType0Font source like this. It works.

{code:title=PDType0Font.java|borderStyle=solid}
    /**
     * Converts one character code taken from a string to text.
     *
     * <p>Resolution order:</p>
     * <ol>
     *   <li>the superclass conversion, attempted only when
     *       {@code hasToUnicode()} is true;</li>
     *   <li>a CID lookup in {@code cmap}, passed on to
     *       {@code descendantFont.cmapEncoding};</li>
     *   <li>when the CID lookup yields -1, the font's /Encoding is a name and
     *       the code is exactly two bytes long: direct decoding of the bytes
     *       with the Java charset matching the predefined CJK CMap name
     *       (for example {@code /Encoding /UniGB-UTF16-H}).</li>
     * </ol>
     *
     * @param c      bytes containing the character code
     * @param offset index in {@code c} of the first byte of the code
     * @param length number of bytes that make up the code
     * @return the decoded text, or {@code null} if none of the steps produced a result
     * @throws IOException if a delegated lookup or the charset decoding fails
     */
    @Override
    public String encode(byte[] c, int offset, int length) throws IOException
    {
        String retval = null;

        if (hasToUnicode())
        {
            retval = super.encode(c, offset, length);
        }

        if (retval == null)
        {
            int result = cmap.lookupCID(c, offset, length);
            if (result != -1)
            {
                retval = descendantFont.cmapEncoding(result, 2, true, null);
            }
            else
            {
                // Predefined CJK CMap: the font dictionary names the CMap in
                // /Encoding instead of embedding it, e.g.
                //   /Subtype /Type0
                //   /Encoding /UniGB-UTF16-H
                COSBase encoding = getEncoding();
                if (length == 2 && encoding instanceof COSName)
                {
                    String encname = ((COSName) encoding).getName();
                    String charset = charsetOfPredefinedCJKCMap(encname);
                    // Several of the CJK charsets are optional in a JRE; when one
                    // is missing, fall through to null like every other
                    // "no mapping" path instead of throwing.
                    if (charset != null && java.nio.charset.Charset.isSupported(charset))
                    {
                        retval = new String(c, offset, length, charset);
                    }
                }
            }
        }

        return retval;
    }

    /**
     * Predefined CJK CMap name to Java charset name
     * 
     * @author Raymond Wu <[email protected]>
     * @param  encname Predefined CJK CMap name
     * @return Java charset name
     */
    public String charsetOfPredefinedCJKCMap(String encname) {
                // PDF 32000-1:2008 Page 274
                // Table 118 – Predefined CJK CMap names
                //
                // @See 
http://collinssoftware.com/computer/robots/utility/html/documentation/FontEncoding.htm
                // @See 
https://docs.oracle.com/javase/8/docs/technotes/guides/intl/encoding.doc.html

                // Unicode
                if (encname.contains("UTF16")) return "UTF-16BE";
                if (encname.contains("UCS2"))  return "UTF-16BE";
                
                // Chinese (Traditional)
                // @See https://zh.wikipedia.org/wiki/巴別塔
                if (encname.startsWith("B5pc-"))   return "BIG5";
                if (encname.startsWith("HKscs-"))  return "MS950_HKSCS";
                if (encname.startsWith("ETen-"))   return "MS950";
                if (encname.startsWith("ETenms-")) return "MS950";
                if (encname.startsWith("CNS-"))    return "EUC-TW";

                // Chinese (Simplified)
                if (encname.startsWith("GB-"))    return "MS936";
                if (encname.startsWith("GBpc-"))  return "GB2312";
                if (encname.startsWith("GBK-"))   return "MS936";
                if (encname.startsWith("GBKp-"))  return "MS936";
                if (encname.startsWith("GBK2K-")) return "GB18030";

                // Japanese
                if (encname.startsWith("83pv-"))  return "JISAutoDetect"; // 
JIS X 0208 + KanjiTalk6 (漢字6)
                if (encname.startsWith("90ms-"))  return "JISAutoDetect"; // 
MS932
                if (encname.startsWith("90msp-")) return "JISAutoDetect"; // 
MS932
                if (encname.startsWith("90pv-"))  return "JISAutoDetect"; // 
JIS X 0208 + KanjiTalk7 (漢字7)
                if (encname.startsWith("Add-"))   return "JISAutoDetect"; // 
JIS X 0208 + Fujitsu FMR
                if (encname.startsWith("EUC-"))   return "JISAutoDetect"; // 
JIS X 0208
                if (encname.startsWith("Ext-"))   return "JISAutoDetect"; // 
JIS C 6226 + NEC
                if (encname.equals("H"))          return "JISAutoDetect"; // 
ISO-2022-JP
                if (encname.equals("V"))          return "JISAutoDetect"; // 
ISO-2022-JP

                // Korean
                if (encname.startsWith("KSC-"))   return "EUC_KR";
                if (encname.startsWith("KSCms-")) return "MS949";
                if (encname.startsWith("KSCpc-")) return "EUC_KR";

        return null;
    }
{code}

  was:
{noformat}
20 0 obj
<<
/Type /Font
/BaseFont /AdobeSongStd-Light,Bold-UniGB-UTF16-H
/Subtype /Type0
/Encoding /UniGB-UTF16-H
/DescendantFonts [42 0 R]
>>
endobj
{noformat}

If the Type0 font is like the one above, CJK strings cannot be produced by 
org.apache.pdfbox.pdmodel.font.PDType0Font.
PDType0Font only processes embedded CMaps and ignores predefined CJK CMaps,
so Chinese, Japanese and Korean text using this font cannot be extracted.

I have tried to modify PDType0Font source like this. It works.

{code:title=PDType0Font.java|borderStyle=solid}
    /**
     * Converts one character code taken from a string to text.
     *
     * <p>Resolution order:</p>
     * <ol>
     *   <li>the superclass conversion, attempted only when
     *       {@code hasToUnicode()} is true;</li>
     *   <li>a CID lookup in {@code cmap}, passed on to
     *       {@code descendantFont.cmapEncoding};</li>
     *   <li>when the CID lookup yields -1, the font's /Encoding is a name and
     *       the code is exactly two bytes long: direct decoding of the bytes
     *       with the Java charset matching the predefined CJK CMap name
     *       (for example {@code /Encoding /UniGB-UTF16-H}).</li>
     * </ol>
     *
     * @param c      bytes containing the character code
     * @param offset index in {@code c} of the first byte of the code
     * @param length number of bytes that make up the code
     * @return the decoded text, or {@code null} if none of the steps produced a result
     * @throws IOException if a delegated lookup or the charset decoding fails
     */
    @Override
    public String encode(byte[] c, int offset, int length) throws IOException
    {
        String retval = null;

        if (hasToUnicode())
        {
            retval = super.encode(c, offset, length);
        }

        if (retval == null)
        {
            int result = cmap.lookupCID(c, offset, length);
            if (result != -1)
            {
                retval = descendantFont.cmapEncoding(result, 2, true, null);
            }
            else
            {
                // Predefined CJK CMap: the font dictionary names the CMap in
                // /Encoding instead of embedding it, e.g.
                //   /Subtype /Type0
                //   /Encoding /UniGB-UTF16-H
                COSBase encoding = getEncoding();
                if (length == 2 && encoding instanceof COSName)
                {
                    String encname = ((COSName) encoding).getName();
                    String charset = charsetOfPredefinedCJKCMap(encname);
                    // Several of the CJK charsets are optional in a JRE; when one
                    // is missing, fall through to null like every other
                    // "no mapping" path instead of throwing.
                    if (charset != null && java.nio.charset.Charset.isSupported(charset))
                    {
                        retval = new String(c, offset, length, charset);
                    }
                }
            }
        }

        return retval;
    }

    /**
     * Predefined CJK CMap name to Java charset name
     * 
     * @author Raymond Wu <[email protected]>
     * @param  encname Predefined CJK CMap name
     * @return Java charset name
     */
    public String charsetOfPredefinedCJKCMap(String encname) {
        // PDF 32000-1:2008 Page 274
                // Table 118 – Predefined CJK CMap names
        //
                // @See 
http://collinssoftware.com/computer/robots/utility/html/documentation/FontEncoding.htm
                // @See 
https://docs.oracle.com/javase/8/docs/technotes/guides/intl/encoding.doc.html

        // Unicode
                if (encname.contains("UTF16")) return "UTF-16BE";
                if (encname.contains("UCS2"))  return "UTF-16BE";
                
                // Chinese (Traditional)
                // @See https://zh.wikipedia.org/wiki/巴別塔
                if (encname.startsWith("B5pc-"))   return "BIG5";
                if (encname.startsWith("HKscs-"))  return "MS950_HKSCS";
                if (encname.startsWith("ETen-"))   return "MS950";
                if (encname.startsWith("ETenms-")) return "MS950";
                if (encname.startsWith("CNS-"))    return "EUC-TW";

                // Chinese (Simplified)
                if (encname.startsWith("GB-"))    return "MS936";
                if (encname.startsWith("GBpc-"))  return "GB2312";
                if (encname.startsWith("GBK-"))   return "MS936";
                if (encname.startsWith("GBKp-"))  return "MS936";
                if (encname.startsWith("GBK2K-")) return "GB18030";

                // Japanese
                if (encname.startsWith("83pv-"))  return "JISAutoDetect"; // 
JIS X 0208 + KanjiTalk6 (漢字6)
                if (encname.startsWith("90ms-"))  return "JISAutoDetect"; // 
MS932
                if (encname.startsWith("90msp-")) return "JISAutoDetect"; // 
MS932
                if (encname.startsWith("90pv-"))  return "JISAutoDetect"; // 
JIS X 0208 + KanjiTalk7 (漢字7)
                if (encname.startsWith("Add-"))   return "JISAutoDetect"; // 
JIS X 0208 + Fujitsu FMR
                if (encname.startsWith("EUC-"))   return "JISAutoDetect"; // 
JIS X 0208
                if (encname.startsWith("Ext-"))   return "JISAutoDetect"; // 
JIS C 6226 + NEC
                if (encname.equals("H"))          return "JISAutoDetect"; // 
ISO-2022-JP
                if (encname.equals("V"))          return "JISAutoDetect"; // 
ISO-2022-JP

                // Korean
                if (encname.startsWith("KSC-"))   return "EUC_KR";
                if (encname.startsWith("KSCms-")) return "MS949";
                if (encname.startsWith("KSCpc-")) return "EUC_KR";

        return null;
    }
{code}


> Cannot extract text which font is Type0 with predefined CJK CMap
> ----------------------------------------------------------------
>
>                 Key: PDFBOX-3132
>                 URL: https://issues.apache.org/jira/browse/PDFBOX-3132
>             Project: PDFBox
>          Issue Type: Improvement
>          Components: PDModel
>    Affects Versions: 1.8.9
>            Reporter: Raymond Wu
>
> {noformat}
> 20 0 obj
> <<
> /Type /Font
> /BaseFont /AdobeSongStd-Light,Bold-UniGB-UTF16-H
> /Subtype /Type0
> /Encoding /UniGB-UTF16-H
> /DescendantFonts [42 0 R]
> >>
> endobj
> {noformat}
> If the Type0 font is like the one above, CJK strings cannot be produced by 
> org.apache.pdfbox.pdmodel.font.PDType0Font.
> PDType0Font only processes embedded CMaps and ignores predefined CJK CMaps,
> so Chinese, Japanese and Korean text using this font cannot be extracted.
> I have tried to modify PDType0Font source like this. It works.
> {code:title=PDType0Font.java|borderStyle=solid}
>     /**
>      * Converts one character code taken from a string to text.
>      *
>      * <p>Resolution order:</p>
>      * <ol>
>      *   <li>the superclass conversion, attempted only when
>      *       {@code hasToUnicode()} is true;</li>
>      *   <li>a CID lookup in {@code cmap}, passed on to
>      *       {@code descendantFont.cmapEncoding};</li>
>      *   <li>when the CID lookup yields -1, the font's /Encoding is a name and
>      *       the code is exactly two bytes long: direct decoding of the bytes
>      *       with the Java charset matching the predefined CJK CMap name
>      *       (for example {@code /Encoding /UniGB-UTF16-H}).</li>
>      * </ol>
>      *
>      * @param c      bytes containing the character code
>      * @param offset index in {@code c} of the first byte of the code
>      * @param length number of bytes that make up the code
>      * @return the decoded text, or {@code null} if none of the steps produced a result
>      * @throws IOException if a delegated lookup or the charset decoding fails
>      */
>     @Override
>     public String encode(byte[] c, int offset, int length) throws IOException
>     {
>         String retval = null;
>
>         if (hasToUnicode())
>         {
>             retval = super.encode(c, offset, length);
>         }
>
>         if (retval == null)
>         {
>             int result = cmap.lookupCID(c, offset, length);
>             if (result != -1)
>             {
>                 retval = descendantFont.cmapEncoding(result, 2, true, null);
>             }
>             else
>             {
>                 // Predefined CJK CMap: the font dictionary names the CMap in
>                 // /Encoding instead of embedding it, e.g.
>                 //   /Subtype /Type0
>                 //   /Encoding /UniGB-UTF16-H
>                 COSBase encoding = getEncoding();
>                 if (length == 2 && encoding instanceof COSName)
>                 {
>                     String encname = ((COSName) encoding).getName();
>                     String charset = charsetOfPredefinedCJKCMap(encname);
>                     // Several of the CJK charsets are optional in a JRE; when one
>                     // is missing, fall through to null like every other
>                     // "no mapping" path instead of throwing.
>                     if (charset != null && java.nio.charset.Charset.isSupported(charset))
>                     {
>                         retval = new String(c, offset, length, charset);
>                     }
>                 }
>             }
>         }
>
>         return retval;
>     }
>     /**
>      * Maps the name of a predefined CJK CMap to the name of the Java charset
>      * used to decode character codes written with that CMap.
>      *
>      * <p>The CMap names are those of PDF 32000-1:2008, page 274, Table 118
>      * (Predefined CJK CMap names). Names containing {@code UTF16} or
>      * {@code UCS2} map to {@code UTF-16BE}. The bare names {@code H} and
>      * {@code V} must match exactly. Every other name is matched by its
>      * leading prefix.</p>
>      *
>      * @author Raymond Wu &lt;[email protected]&gt;
>      * @param  encname Predefined CJK CMap name, may be {@code null}
>      * @return Java charset name, or {@code null} if {@code encname} is
>      *         {@code null} or not a recognised predefined CJK CMap name
>      * @see <a href="http://collinssoftware.com/computer/robots/utility/html/documentation/FontEncoding.htm">Font encoding overview</a>
>      * @see <a href="https://docs.oracle.com/javase/8/docs/technotes/guides/intl/encoding.doc.html">Java supported encodings</a>
>      */
>     public String charsetOfPredefinedCJKCMap(String encname)
>     {
>         if (encname == null)
>         {
>             return null;
>         }
>
>         // Unicode
>         if (encname.contains("UTF16") || encname.contains("UCS2"))
>         {
>             return "UTF-16BE";
>         }
>
>         // Japanese CMaps whose whole name is just the writing direction
>         if (encname.equals("H") || encname.equals("V"))
>         {
>             return "JISAutoDetect"; // ISO-2022-JP
>         }
>
>         for (String[] mapping : PREDEFINED_CJK_CMAP_CHARSETS)
>         {
>             if (encname.startsWith(mapping[0]))
>             {
>                 return mapping[1];
>             }
>         }
>
>         return null;
>     }
>
>     /**
>      * Pairs of { CMap name prefix, Java charset name }. No prefix is a prefix
>      * of another, so the order of the rows does not affect the result.
>      */
>     private static final String[][] PREDEFINED_CJK_CMAP_CHARSETS =
>     {
>         // Chinese (Traditional)
>         // see https://zh.wikipedia.org/wiki/%E5%B7%B4%E5%88%A5%E5%A1%94
>         {"B5pc-", "BIG5"},
>         {"HKscs-", "MS950_HKSCS"},
>         {"ETen-", "MS950"},
>         {"ETenms-", "MS950"},
>         {"CNS-", "EUC-TW"},
>
>         // Chinese (Simplified)
>         {"GB-", "MS936"},
>         {"GBpc-", "GB2312"},
>         {"GBK-", "MS936"},
>         {"GBKp-", "MS936"},
>         {"GBK2K-", "GB18030"},
>
>         // Japanese
>         // NOTE(review): JISAutoDetect guesses the encoding from the bytes; on a
>         // two-byte code that guess may be ambiguous. Consider naming the exact
>         // charset (e.g. MS932 for 90ms-) once verified against sample files.
>         {"83pv-", "JISAutoDetect"},  // JIS X 0208 + KanjiTalk6
>         {"90ms-", "JISAutoDetect"},  // MS932
>         {"90msp-", "JISAutoDetect"}, // MS932
>         {"90pv-", "JISAutoDetect"},  // JIS X 0208 + KanjiTalk7
>         {"Add-", "JISAutoDetect"},   // JIS X 0208 + Fujitsu FMR
>         {"EUC-", "JISAutoDetect"},   // JIS X 0208
>         {"Ext-", "JISAutoDetect"},   // JIS C 6226 + NEC
>
>         // Korean
>         {"KSC-", "EUC_KR"},
>         {"KSCms-", "MS949"},
>         {"KSCpc-", "EUC_KR"},
>     };
> {code}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to