Raymond Wu created PDFBOX-3132:
----------------------------------
Summary: Cannot extract text which font is Type0 with predefined
CJK CMap
Key: PDFBOX-3132
URL: https://issues.apache.org/jira/browse/PDFBOX-3132
Project: PDFBox
Issue Type: Improvement
Components: PDModel
Affects Versions: 1.8.9
Reporter: Raymond Wu
20 0 obj
<<
/Type /Font
/BaseFont /AdobeSongStd-Light,Bold-UniGB-UTF16-H
/Subtype /Type0
/Encoding /UniGB-UTF16-H
/DescendantFonts [42 0 R]
>>
endobj
If the Type0 font is defined as above, CJK strings cannot be produced by
org.apache.pdfbox.pdmodel.font.PDType0Font.
PDType0Font only processes embedded CMaps and ignores predefined CJK CMaps,
so Chinese, Japanese, and Korean text that uses this font cannot be extracted.
I modified the PDType0Font source as shown below, and it works.
/**
 * Converts a character code taken from a byte array into a string.
 *
 * Three strategies are tried in order, and the first one that applies wins:
 * <ol>
 * <li>when {@code hasToUnicode()} is true, the result of {@code super.encode},
 * unless that result is null;</li>
 * <li>the CID returned by {@code cmap.lookupCID}, handed to
 * {@code descendantFont.cmapEncoding}, unless the lookup returns -1;</li>
 * <li>for two-byte codes whose encoding entry is a {@code COSName}, decoding
 * the raw bytes with the Java charset that
 * {@code charsetOfPredefinedCJKCMap} associates with that name.</li>
 * </ol>
 *
 * @param c the byte array holding the character code
 * @param offset index of the first byte of the code within {@code c}
 * @param length number of bytes that make up the code
 * @return the resulting string, or null when no strategy produces one
 * @throws IOException if a delegate fails or the mapped charset is not
 * supported by the running JVM
 */
@Override
public String encode(byte[] c, int offset, int length) throws IOException
{
    // 1. ToUnicode path; a null result falls through to the CMap lookup.
    if (hasToUnicode())
    {
        String fromToUnicode = super.encode(c, offset, length);
        if (fromToUnicode != null)
        {
            return fromToUnicode;
        }
    }

    // 2. CID lookup through the font's CMap.
    int cid = cmap.lookupCID(c, offset, length);
    if (cid != -1)
    {
        return descendantFont.cmapEncoding(cid, 2, true, null);
    }

    // 3. Predefined CJK CMap referenced by name only, e.g.
    //    << /Type /Font /Subtype /Type0 /Encoding /UniGB-UTF16-H
    //       /BaseFont /AdobeSongStd-Light,Bold-UniGB-UTF16-H
    //       /DescendantFonts [42 0 R] >>
    COSBase encoding = getEncoding();
    if (length != 2 || !(encoding instanceof COSName))
    {
        return null;
    }
    String cmapName = ((COSName) encoding).getName();
    String charset = charsetOfPredefinedCJKCMap(cmapName);
    return charset == null ? null : new String(c, offset, length, charset);
}
/**
* Predefined CJK CMap name to Java charset name
*
* @author Raymond Wu <[email protected]>
* @param encname Predefined CJK CMap name
* @return Java charset name
*/
public String charsetOfPredefinedCJKCMap(String encname) {
// PDF 32000-1:2008 Page 274
// Table 118 – Predefined CJK CMap names
//
// @See
http://collinssoftware.com/computer/robots/utility/html/documentation/FontEncoding.htm
// @See
https://docs.oracle.com/javase/8/docs/technotes/guides/intl/encoding.doc.html
// Unicode
if (encname.contains("UTF16")) return "UTF-16BE";
if (encname.contains("UCS2")) return "UTF-16BE";
// Chinese (Traditional)
// @See https://zh.wikipedia.org/wiki/巴別塔
if (encname.startsWith("B5pc-")) return "BIG5";
if (encname.startsWith("HKscs-")) return "MS950_HKSCS";
if (encname.startsWith("ETen-")) return "MS950";
if (encname.startsWith("ETenms-")) return "MS950";
if (encname.startsWith("CNS-")) return "EUC-TW";
// Chinese (Simplified)
if (encname.startsWith("GB-")) return "MS936";
if (encname.startsWith("GBpc-")) return "GB2312";
if (encname.startsWith("GBK-")) return "MS936";
if (encname.startsWith("GBKp-")) return "MS936";
if (encname.startsWith("GBK2K-")) return "GB18030";
// Japanese
if (encname.startsWith("83pv-")) return "JISAutoDetect"; //
JIS X 0208 + KanjiTalk6 (漢字6)
if (encname.startsWith("90ms-")) return "JISAutoDetect"; //
MS932
if (encname.startsWith("90msp-")) return "JISAutoDetect"; //
MS932
if (encname.startsWith("90pv-")) return "JISAutoDetect"; //
JIS X 0208 + KanjiTalk7 (漢字7)
if (encname.startsWith("Add-")) return "JISAutoDetect"; //
JIS X 0208 + Fujitsu FMR
if (encname.startsWith("EUC-")) return "JISAutoDetect"; //
JIS X 0208
if (encname.startsWith("Ext-")) return "JISAutoDetect"; //
JIS C 6226 + NEC
if (encname.equals("H")) return "JISAutoDetect"; //
ISO-2022-JP
if (encname.equals("V")) return "JISAutoDetect"; //
ISO-2022-JP
// Korean
if (encname.startsWith("KSC-")) return "EUC_KR";
if (encname.startsWith("KSCms-")) return "MS949";
if (encname.startsWith("KSCpc-")) return "EUC_KR";
return null;
}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]