[ https://issues.apache.org/jira/browse/FOP-2880?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Dan Caprioara updated FOP-2880: ------------------------------- Description: The hyphenated words are rendered by FOP using the hard hyphen character. This contradicts the PDF specification, which in section 14.8.2.2.3 (Incidental Artifacts) clearly states that the SHY soft hyphen U+00AD character should be used. The effect is that the hyphenated words are not searchable, and the copy/paste feature also includes the hard hyphens instead of removing them and joining the word pieces together. Here is a small patch that can be applied on the FOP core project in order to fix this. It is more of a proof of concept; the real fix would be to change the default hyphenation character in the FOPropertyMapping and to change the font mappings: to remove the replacement of the SHY with the HYPHEN, see the org.apache.fop.fonts.CodePointMapping.encStandardEncoding, and org.apache.fop.fonts.CodePointMapping.encISOLatin1Encoding {code} 0xad, 0x002D, // hyphen 0xad, 0x00AD, // hyphen {code} The patch: {code} Index: src/main/java/org/apache/fop/fo/properties/CommonHyphenation.java =================================================================== --- src/main/java/org/apache/fop/fo/properties/CommonHyphenation.java (revision 191037) +++ src/main/java/org/apache/fop/fo/properties/CommonHyphenation.java (working copy) @@ -15,7 +15,7 @@ * limitations under the License. */ -/* $Id$ */ +/* $Id: CommonHyphenation.java 1610839 2014-07-15 20:25:58Z vhennebert $ */ package org.apache.fop.fo.properties; @@ -184,6 +184,16 @@ */ public int getHyphIPD(org.apache.fop.fonts.Font font) { char hyphChar = getHyphChar(font); + + if (hyphChar == '\u00ad') { + // Bizarre fix, defining the SHY as default hyphenation character in the FOPropertyMapping, leads + // to hard hyphens not selectable in the PDF reader. + // + // Mapping also the hard hyphen makes the character selectable! 
+ font.mapChar('\u002d'); + } + + return font.getCharWidth(hyphChar); } Index: src/main/java/org/apache/fop/fo/FOPropertyMapping.java =================================================================== --- src/main/java/org/apache/fop/fo/FOPropertyMapping.java (revision 190759) +++ src/main/java/org/apache/fop/fo/FOPropertyMapping.java (working copy) @@ -1106,7 +1106,10 @@ // hyphenation-character m = new CharacterProperty.Maker(PR_HYPHENATION_CHARACTER); m.setInherited(true); - m.setDefault("-"); +// +// m.setDefault("-"); + m.setDefault("\u00ad"); + addPropertyMaker("hyphenation-character", m); // hyphenation-push-character-count Index: src/main/java/org/apache/fop/render/pdf/PDFPainter.java =================================================================== --- src/main/java/org/apache/fop/render/pdf/PDFPainter.java (revision 190759) +++ src/main/java/org/apache/fop/render/pdf/PDFPainter.java (working copy) @@ -420,7 +420,8 @@ PDFStructElem structElem = (PDFStructElem) getContext().getStructureTreeElement(); languageAvailabilityChecker.checkLanguageAvailability(text); MarkedContentInfo mci = logicalStructureHandler.addTextContentItem(structElem); - String actualText = getContext().isHyphenated() ? text.substring(0, text.length() - 1) : null; +// String actualText = getContext().isHyphenated() ? text.substring(0, text.length() - 1) : null; + String actualText = null; generator.endTextObject(); generator.updateColor(state.getTextColor(), true, null); generator.beginTextObject(mci.tag, mci.mcid, actualText); @@ -490,6 +491,15 @@ float glyphAdjust = 0; if (font.hasCodePoint(orgChar)) { ch = font.mapCodePoint(orgChar); + if (orgChar == '\u00ad' && ch == '\u002d'){ + // Map it back to the SHY, the hard hyphen is not correct, causes the hyphenated words not being searchable. + // See the PDF Spec: 14.8.2.2.3 Incidental Artifacts / Hyphenation paragraph. 
+ + // The ansi encoding CodePointMapping has the hyphenation char with two entries, + // the first is selected, the hard hyphen. Reverting... + ch = orgChar; + } + ch = selectAndMapSingleByteFont(tf, fontName, fontSize, textutil, ch); if ((wordSpacing != 0) && CharUtilities.isAdjustableSpace(orgChar)) { glyphAdjust += wordSpacing; {code} was: The hyphenated words are rendered by FOP using the hard hyphen character. This contradicts the PDF specification, where in section: 14.8.2.2.3 Incidental Artifacts clearly states that the SHY soft hyphen U+00AD character should be used. The effect is that the hyphenated words are not searchable, and the copy/paste feature includes also the hard hyphens, instead of removing them and joining the words pieces together. Here is a small patch that can be applied on the FOP core project in order to fix this: {code} Index: src/main/java/org/apache/fop/fo/FOPropertyMapping.java =================================================================== --- src/main/java/org/apache/fop/fo/FOPropertyMapping.java (revision 190759) +++ src/main/java/org/apache/fop/fo/FOPropertyMapping.java (working copy) @@ -1106,7 +1106,10 @@ // hyphenation-character m = new CharacterProperty.Maker(PR_HYPHENATION_CHARACTER); m.setInherited(true); - m.setDefault("-"); + + // m.setDefault("-"); + m.setDefault("\u00ad"); + addPropertyMaker("hyphenation-character", m); // hyphenation-push-character-count Index: src/main/java/org/apache/fop/render/pdf/PDFPainter.java =================================================================== --- src/main/java/org/apache/fop/render/pdf/PDFPainter.java (revision 190759) +++ src/main/java/org/apache/fop/render/pdf/PDFPainter.java (working copy) @@ -420,7 +420,8 @@ PDFStructElem structElem = (PDFStructElem) getContext().getStructureTreeElement(); languageAvailabilityChecker.checkLanguageAvailability(text); MarkedContentInfo mci = logicalStructureHandler.addTextContentItem(structElem); - String actualText = 
getContext().isHyphenated() ? text.substring(0, text.length() - 1) : null; +// String actualText = getContext().isHyphenated() ? text.substring(0, text.length() - 1) : null; + String actualText = null; generator.endTextObject(); generator.updateColor(state.getTextColor(), true, null); generator.beginTextObject(mci.tag, mci.mcid, actualText); @@ -490,6 +491,14 @@ float glyphAdjust = 0; if (font.hasCodePoint(orgChar)) { ch = font.mapCodePoint(orgChar); + if (orgChar == '\u00ad'){ + // Map it back to the SHY, the hard hyphen is not correct, causes the hyphenated words not being searchable. + // See the PDF Spec: 14.8.2.2.3 Incidental Artifacts / Hyphenation paragraph. + + // The ansi encoding CodePointMapping has the hyphenation char with two entries, + // the first is selected, the hard hyphen. Reverting... + ch = orgChar; + } ch = selectAndMapSingleByteFont(tf, fontName, fontSize, textutil, ch); if ((wordSpacing != 0) && CharUtilities.isAdjustableSpace(orgChar)) { glyphAdjust += wordSpacing; {code} > [PATCH] Hyphenated words are not searchable in readers > ------------------------------------------------------ > > Key: FOP-2880 > URL: https://issues.apache.org/jira/browse/FOP-2880 > Project: FOP > Issue Type: Bug > Components: unqualified > Affects Versions: 2.3 > Reporter: Dan Caprioara > Priority: Major > Original Estimate: 1h > Remaining Estimate: 1h > > The hyphenated words are rendered by FOP using the hard hyphen character. > This contradicts the PDF specification, where in section: > 14.8.2.2.3 Incidental Artifacts > clearly states that the SHY soft hyphen U+00AD character should be used. > The effect is that the hyphenated words are not searchable, and the > copy/paste feature includes also the hard hyphens, instead of removing them > and joining the words pieces together. 
> Here is a small patch that can be applied on the FOP core project in order to > fix this. It is more of a proof of concept; the real fix would be to > change the default hyphenation character in the FOPropertyMapping and to > change the font mappings: to remove the replacement of the SHY with the > HYPHEN, see the org.apache.fop.fonts.CodePointMapping.encStandardEncoding, > and org.apache.fop.fonts.CodePointMapping.encISOLatin1Encoding > {code} > 0xad, 0x002D, // hyphen > 0xad, 0x00AD, // hyphen > {code} > The patch: > {code} > Index: src/main/java/org/apache/fop/fo/properties/CommonHyphenation.java > =================================================================== > --- src/main/java/org/apache/fop/fo/properties/CommonHyphenation.java > (revision 191037) > +++ src/main/java/org/apache/fop/fo/properties/CommonHyphenation.java > (working copy) > @@ -15,7 +15,7 @@ > * limitations under the License. > */ > > -/* $Id$ */ > +/* $Id: CommonHyphenation.java 1610839 2014-07-15 20:25:58Z vhennebert $ */ > > package org.apache.fop.fo.properties; > > @@ -184,6 +184,16 @@ > */ > public int getHyphIPD(org.apache.fop.fonts.Font font) { > char hyphChar = getHyphChar(font); > + > + if (hyphChar == '\u00ad') { > + // Bizarre fix, defining the SHY as default hyphenation character > in the FOPropertyMapping, leads > + // to hard hyphens not selectable in the PDF reader. > + // > + // Mapping also the hard hyphen makes the character selectable! 
> + font.mapChar('\u002d'); > + } > + > + > return font.getCharWidth(hyphChar); > } > > Index: src/main/java/org/apache/fop/fo/FOPropertyMapping.java > =================================================================== > --- src/main/java/org/apache/fop/fo/FOPropertyMapping.java (revision > 190759) > +++ src/main/java/org/apache/fop/fo/FOPropertyMapping.java (working copy) > @@ -1106,7 +1106,10 @@ > // hyphenation-character > m = new CharacterProperty.Maker(PR_HYPHENATION_CHARACTER); > m.setInherited(true); > - m.setDefault("-"); > +// > +// m.setDefault("-"); > + m.setDefault("\u00ad"); > + > addPropertyMaker("hyphenation-character", m); > > // hyphenation-push-character-count > Index: src/main/java/org/apache/fop/render/pdf/PDFPainter.java > =================================================================== > --- src/main/java/org/apache/fop/render/pdf/PDFPainter.java (revision > 190759) > +++ src/main/java/org/apache/fop/render/pdf/PDFPainter.java (working copy) > @@ -420,7 +420,8 @@ > PDFStructElem structElem = (PDFStructElem) > getContext().getStructureTreeElement(); > languageAvailabilityChecker.checkLanguageAvailability(text); > MarkedContentInfo mci = > logicalStructureHandler.addTextContentItem(structElem); > - String actualText = getContext().isHyphenated() ? > text.substring(0, text.length() - 1) : null; > +// String actualText = getContext().isHyphenated() ? > text.substring(0, text.length() - 1) : null; > + String actualText = null; > generator.endTextObject(); > generator.updateColor(state.getTextColor(), true, null); > generator.beginTextObject(mci.tag, mci.mcid, actualText); > @@ -490,6 +491,15 @@ > float glyphAdjust = 0; > if (font.hasCodePoint(orgChar)) { > ch = font.mapCodePoint(orgChar); > + if (orgChar == > '\u00ad' && ch == '\u002d'){ > + // Map it > back to the SHY, the hard hyphen is not correct, causes the hyphenated words > not being searchable. > + // See the > PDF Spec: 14.8.2.2.3 Incidental Artifacts / Hyphenation paragraph. 
> + > + // The ansi > encoding CodePointMapping has the hyphenation char with two entries, > + // the first > is selected, the hard hyphen. Reverting... > + ch = orgChar; > + } > + > ch = selectAndMapSingleByteFont(tf, fontName, fontSize, > textutil, ch); > if ((wordSpacing != 0) && > CharUtilities.isAdjustableSpace(orgChar)) { > glyphAdjust += wordSpacing; > {code} -- This message was sent by Atlassian JIRA (v7.6.14#76016)