[
https://issues.apache.org/jira/browse/PDFBOX-3132?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
John Hewson closed PDFBOX-3132.
-------------------------------
Resolution: Won't Fix
We don't support this in 1.8. Use 2.0 instead.
> Cannot extract text which font is Type0 with predefined CJK CMap
> ----------------------------------------------------------------
>
> Key: PDFBOX-3132
> URL: https://issues.apache.org/jira/browse/PDFBOX-3132
> Project: PDFBox
> Issue Type: Improvement
> Components: PDModel
> Affects Versions: 1.8.9
> Reporter: Raymond Wu
> Attachments: pdf_font-zhcn.pdf
>
>
> {noformat}
> 20 0 obj
> <<
> /Type /Font
> /BaseFont /AdobeSongStd-Light,Bold-UniGB-UTF16-H
> /Subtype /Type0
> /Encoding /UniGB-UTF16-H
> /DescendantFonts [42 0 R]
> >>
> endobj
> {noformat}
> If the Type0 font is defined as above, CJK strings cannot be produced by
> org.apache.pdfbox.pdmodel.font.PDType0Font.
> PDType0Font only processes embedded CMaps and ignores predefined CJK CMaps,
> so Chinese, Japanese, and Korean text using this font cannot be extracted.
> I modified the PDType0Font source as shown below, and with this change the text is extracted correctly.
> {code:title=PDType0Font.java|borderStyle=solid}
> @Override
> public String encode(byte[] c, int offset, int length) throws IOException
> {
> String retval = null;
> if (hasToUnicode())
> {
> retval = super.encode(c, offset, length);
> }
>
> if (retval == null)
> {
> int result = cmap.lookupCID(c, offset, length);
> if (result != -1)
> {
> retval = descendantFont.cmapEncoding(result, 2, true, null);
> } else {
> // Predefined CJK CMap
> //
> // PDF Source:
> // 20 0 obj
> // <<
> // /Type /Font
> // /BaseFont /AdobeSongStd-Light,Bold-UniGB-UTF16-H
> // /Subtype /Type0
> // /Encoding /UniGB-UTF16-H
> // /DescendantFonts [42 0 R]
> // >>
> // endobj
> //
>
> COSBase encoding = getEncoding();
> if (length == 2 && encoding instanceof COSName)
> {
> String encname = ((COSName)encoding).getName();
> String charset =
> charsetOfPredefinedCJKCMap(encname);
> if (charset!=null) {
> retval = new String(c, offset, length,
> charset);
> }
> }
> }
> }
>
> return retval;
> }
> /**
> * Predefined CJK CMap name to Java charset name
> *
> * @author Raymond Wu <[email protected]>
> * @param encname Predefined CJK CMap name
> * @return Java charset name
> */
> public String charsetOfPredefinedCJKCMap(String encname) {
> // PDF 32000-1:2008 Page 274
> // Table 118 – Predefined CJK CMap names
> //
> // @See http://collinssoftware.com/computer/robots/utility/html/documentation/FontEncoding.htm
> // @See https://docs.oracle.com/javase/8/docs/technotes/guides/intl/encoding.doc.html
> // Unicode
> if (encname.contains("UTF16")) return "UTF-16BE";
> if (encname.contains("UCS2")) return "UTF-16BE";
>
> // Chinese (Traditional)
> // @See https://zh.wikipedia.org/wiki/巴別塔
> if (encname.startsWith("B5pc-")) return "BIG5";
> if (encname.startsWith("HKscs-")) return "MS950_HKSCS";
> if (encname.startsWith("ETen-")) return "MS950";
> if (encname.startsWith("ETenms-")) return "MS950";
> if (encname.startsWith("CNS-")) return "EUC-TW";
> // Chinese (Simplified)
> if (encname.startsWith("GB-")) return "MS936";
> if (encname.startsWith("GBpc-")) return "GB2312";
> if (encname.startsWith("GBK-")) return "MS936";
> if (encname.startsWith("GBKp-")) return "MS936";
> if (encname.startsWith("GBK2K-")) return "GB18030";
> // Japanese
> if (encname.startsWith("83pv-")) return "JISAutoDetect"; // JIS X 0208 + KanjiTalk6 (漢字6)
> if (encname.startsWith("90ms-")) return "JISAutoDetect"; // MS932
> if (encname.startsWith("90msp-")) return "JISAutoDetect"; // MS932
> if (encname.startsWith("90pv-")) return "JISAutoDetect"; // JIS X 0208 + KanjiTalk7 (漢字7)
> if (encname.startsWith("Add-")) return "JISAutoDetect"; // JIS X 0208 + Fujitsu FMR
> if (encname.startsWith("EUC-")) return "JISAutoDetect"; // JIS X 0208
> if (encname.startsWith("Ext-")) return "JISAutoDetect"; // JIS C 6226 + NEC
> if (encname.equals("H")) return "JISAutoDetect"; // ISO-2022-JP
> if (encname.equals("V")) return "JISAutoDetect"; // ISO-2022-JP
> // Korean
> if (encname.startsWith("KSC-")) return "EUC_KR";
> if (encname.startsWith("KSCms-")) return "MS949";
> if (encname.startsWith("KSCpc-")) return "EUC_KR";
> return null;
> }
> {code}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]