[
https://issues.apache.org/jira/browse/PDFBOX-3132?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15027364#comment-15027364
]
Tilman Hausherr commented on PDFBOX-3132:
-----------------------------------------
However the good news is that your file extracts in 2.0. I'm also adding your
file to my personal test set.
{quote}
趋势科技 | 威胁发现服务
词汇表
Internet安全提示
现今所见的大多数恶意软件威胁使用多种不同的攻击媒介和协议之一入侵受害者的计算机。它们通常利用社会工
程学来引诱、强制或者诱骗受害者运行恶意代码或点击某URL。恶意代码侵入计算机的方式有多种:作为电子邮件
附件、作为即时通讯应用程序中的文件传输请求、直接与其他"免费"软件一起安装,也可伪装成其他软件、文档
或媒体。只要单击电子邮件、Web页面或者即时消息中的恶意URL,就会打开浏览器并导航到特定Web页面,如
(...)
{quote}
> Cannot extract text which font is Type0 with predefined CJK CMap
> ----------------------------------------------------------------
>
> Key: PDFBOX-3132
> URL: https://issues.apache.org/jira/browse/PDFBOX-3132
> Project: PDFBox
> Issue Type: Improvement
> Components: PDModel
> Affects Versions: 1.8.9
> Reporter: Raymond Wu
> Attachments: pdf_font-zhcn.pdf
>
>
> {noformat}
> 20 0 obj
> <<
> /Type /Font
> /BaseFont /AdobeSongStd-Light,Bold-UniGB-UTF16-H
> /Subtype /Type0
> /Encoding /UniGB-UTF16-H
> /DescendantFonts [42 0 R]
> >>
> endobj
> {noformat}
> If the Type0 font is defined as above, CJK strings cannot be produced by
> org.apache.pdfbox.pdmodel.font.PDType0Font.
> PDType0Font only processes embedded CMaps, but ignores predefined CJK CMaps.
> So Chinese, Japanese, and Korean text using this font cannot be extracted.
> I have tried modifying the PDType0Font source like this, and it works.
> {code:title=PDType0Font.java|borderStyle=solid}
> @Override
> public String encode(byte[] c, int offset, int length) throws IOException
> {
> String retval = null;
> if (hasToUnicode())
> {
> retval = super.encode(c, offset, length);
> }
>
> if (retval == null)
> {
> int result = cmap.lookupCID(c, offset, length);
> if (result != -1)
> {
> retval = descendantFont.cmapEncoding(result, 2, true, null);
> } else {
> // Predefined CJK CMap
> //
> // PDF Source:
> // 20 0 obj
> // <<
> // /Type /Font
> // /BaseFont /AdobeSongStd-Light,Bold-UniGB-UTF16-H
> // /Subtype /Type0
> // /Encoding /UniGB-UTF16-H
> // /DescendantFonts [42 0 R]
> // >>
> // endobj
> //
>
> COSBase encoding = getEncoding();
> if (length == 2 && encoding instanceof COSName)
> {
> String encname = ((COSName)encoding).getName();
> String charset =
> charsetOfPredefinedCJKCMap(encname);
> if (charset!=null) {
> retval = new String(c, offset, length,
> charset);
> }
> }
> }
> }
>
> return retval;
> }
> /**
> * Predefined CJK CMap name to Java charset name
> *
> * @author Raymond Wu <[email protected]>
> * @param encname Predefined CJK CMap name
> * @return Java charset name
> */
> public String charsetOfPredefinedCJKCMap(String encname) {
> // PDF 32000-1:2008 Page 274
> // Table 118 – Predefined CJK CMap names
> //
> // @See http://collinssoftware.com/computer/robots/utility/html/documentation/FontEncoding.htm
> // @See https://docs.oracle.com/javase/8/docs/technotes/guides/intl/encoding.doc.html
> // Unicode
> if (encname.contains("UTF16")) return "UTF-16BE";
> if (encname.contains("UCS2")) return "UTF-16BE";
>
> // Chinese (Traditional)
> // @See https://zh.wikipedia.org/wiki/巴別塔
> if (encname.startsWith("B5pc-")) return "BIG5";
> if (encname.startsWith("HKscs-")) return "MS950_HKSCS";
> if (encname.startsWith("ETen-")) return "MS950";
> if (encname.startsWith("ETenms-")) return "MS950";
> if (encname.startsWith("CNS-")) return "EUC-TW";
> // Chinese (Simplified)
> if (encname.startsWith("GB-")) return "MS936";
> if (encname.startsWith("GBpc-")) return "GB2312";
> if (encname.startsWith("GBK-")) return "MS936";
> if (encname.startsWith("GBKp-")) return "MS936";
> if (encname.startsWith("GBK2K-")) return "GB18030";
> // Japanese
> if (encname.startsWith("83pv-")) return "JISAutoDetect"; // JIS X 0208 + KanjiTalk6 (漢字6)
> if (encname.startsWith("90ms-")) return "JISAutoDetect"; // MS932
> if (encname.startsWith("90msp-")) return "JISAutoDetect"; // MS932
> if (encname.startsWith("90pv-")) return "JISAutoDetect"; // JIS X 0208 + KanjiTalk7 (漢字7)
> if (encname.startsWith("Add-")) return "JISAutoDetect"; // JIS X 0208 + Fujitsu FMR
> if (encname.startsWith("EUC-")) return "JISAutoDetect"; // JIS X 0208
> if (encname.startsWith("Ext-")) return "JISAutoDetect"; // JIS C 6226 + NEC
> if (encname.equals("H")) return "JISAutoDetect"; // ISO-2022-JP
> if (encname.equals("V")) return "JISAutoDetect"; // ISO-2022-JP
> // Korean
> if (encname.startsWith("KSC-")) return "EUC_KR";
> if (encname.startsWith("KSCms-")) return "MS949";
> if (encname.startsWith("KSCpc-")) return "EUC_KR";
> return null;
> }
> {code}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]