Author: lehmi Date: Thu Sep 2 19:17:26 2010 New Revision: 992066 URL: http://svn.apache.org/viewvc?rev=992066&view=rev Log: PDFBOX-568: improved text extraction of sample_fonts_solidconvertor.pdf and cweb.pdf from our test arena
Added: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/Type1Encoding.java Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSName.java pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/DictionaryEncoding.java pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/Encoding.java pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java pdfbox/trunk/pdfbox/src/test/resources/input/cweb.pdf-sorted.txt pdfbox/trunk/pdfbox/src/test/resources/input/cweb.pdf.txt pdfbox/trunk/pdfbox/src/test/resources/input/sample_fonts_solidconvertor.pdf-sorted.txt pdfbox/trunk/pdfbox/src/test/resources/input/sample_fonts_solidconvertor.pdf.txt Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSName.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSName.java?rev=992066&r1=992065&r2=992066&view=diff ============================================================================== --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSName.java (original) +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSName.java Thu Sep 2 19:17:26 2010 @@ -835,8 +835,12 @@ public final class COSName extends COSBa /** * A common COSName value. */ + public static final COSName SUPPLEMENT = new COSName( "Supplement" ); + /** + * A common COSName value. 
+ */ public static final COSName SUBTYPE = new COSName( "Subtype" ); - + /** * "T" */ Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/DictionaryEncoding.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/DictionaryEncoding.java?rev=992066&r1=992065&r2=992066&view=diff ============================================================================== --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/DictionaryEncoding.java (original) +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/DictionaryEncoding.java Thu Sep 2 19:17:26 2010 @@ -55,8 +55,7 @@ public class DictionaryEncoding extends //for a nonsymbolic font, it is StandardEncoding, and for a symbolic font, it //is the font's built-in encoding." - //so the default base encoding is standardEncoding - Encoding baseEncoding = new StandardEncoding(); + Encoding baseEncoding = null; COSName baseEncodingName = (COSName)encoding.getDictionaryObject( COSName.BASE_ENCODING ); if( baseEncodingName != null ) @@ -64,6 +63,11 @@ public class DictionaryEncoding extends EncodingManager manager = new EncodingManager(); baseEncoding = manager.getEncoding( baseEncodingName ); } + else + { + //the default base encoding is standardEncoding + baseEncoding = new StandardEncoding(); + } nameToCode.putAll( baseEncoding.nameToCode ); codeToName.putAll( baseEncoding.codeToName ); Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/Encoding.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/Encoding.java?rev=992066&r1=992065&r2=992066&view=diff ============================================================================== --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/Encoding.java (original) +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/Encoding.java Thu Sep 2 19:17:26 2010 @@ -44,6 +44,7 @@ public abstract class Encoding implement */ 
private static final Log log = LogFactory.getLog(Encoding.class); + public static final String NOTDEF = ".notdef"; /** * This is a mapping from a character code to a character name. */ @@ -80,7 +81,7 @@ public abstract class Encoding implement } } - NAME_TO_CHARACTER.put( ".notdef", "" ); + NAME_TO_CHARACTER.put( NOTDEF, "" ); NAME_TO_CHARACTER.put( "fi", "fi" ); NAME_TO_CHARACTER.put( "fl", "fl" ); NAME_TO_CHARACTER.put( "ffi", "ffi" ); @@ -188,7 +189,7 @@ public abstract class Encoding implement * @param code The character code that matches the character. * @param name The name of the character. */ - protected void addCharacterEncoding( int code, String name ) + public void addCharacterEncoding( int code, String name ) { codeToName.put( code, name ); nameToCode.put( name, code ); @@ -273,7 +274,7 @@ public abstract class Encoding implement * * @return The printable character for the code. */ - public static String getCharacter( String name ) + public String getCharacter( String name ) { String character = NAME_TO_CHARACTER.get( name ); if( character == null ) @@ -314,33 +315,10 @@ public abstract class Encoding implement character = name; } } - // this encoding is used in pdfs generated with TeX/LateX - else if (name.length() <= 4 && (name.startsWith("x") || name.startsWith("a")) ) + else if (nameToCode.containsKey(name)) { - try - { - int value = Integer.parseInt(name.substring(1), (name.startsWith("x") ? 
16 : 10)); - // add some additional mapping for values < 32 and = 127 - if (value >=0 && value <= 9) - { - value += 161; - } - else if (value >= 10 && value < 32) - { - value += 163; - } - else if ( value == 127) - { - value = 196; - } - character = Character.toString((char)value); - NAME_TO_CHARACTER.put(name, character); - } - catch(NumberFormatException exception) - { - log.warn( "Not a number in character name: " + name ); - character = name; - } + int code = nameToCode.get(name); + character = Character.toString((char)code); } else { @@ -349,5 +327,5 @@ public abstract class Encoding implement } return character; } - + } Added: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/Type1Encoding.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/Type1Encoding.java?rev=992066&view=auto ============================================================================== --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/Type1Encoding.java (added) +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/Type1Encoding.java Thu Sep 2 19:17:26 2010 @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.pdfbox.encoding; + +import org.apache.pdfbox.cos.COSBase; + +/** + * This class represents an encoding which was read from a type1 font. + * + */ +public class Type1Encoding extends Encoding +{ + public Type1Encoding(int size) + { + for (int i=1;i<size;i++) + { + addCharacterEncoding(i, NOTDEF); + } + } + + /** + * {@inheritDoc} + */ + public COSBase getCOSObject() + { + return null; + } + +} Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java?rev=992066&r1=992065&r2=992066&view=diff ============================================================================== --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java (original) +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/encoding/conversion/CMapSubstitution.java Thu Sep 2 19:17:26 2010 @@ -64,6 +64,8 @@ public class CMapSubstitution cmapSubstitutions.put( "UniJIS-UCS2-HW-H", "UniJIS-UCS2-H" ); cmapSubstitutions.put( "Adobe-Japan1-4", "Adobe-Japan1-UCS2"); + cmapSubstitutions.put( "Adobe-Identity-0", "Identity-H"); + cmapSubstitutions.put( "Adobe-Identity-1", "Identity-H"); } /** Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java?rev=992066&r1=992065&r2=992066&view=diff ============================================================================== --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java (original) +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDFont.java Thu Sep 2 19:17:26 2010 @@ -16,17 +16,18 @@ */ package org.apache.pdfbox.pdmodel.font; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import 
org.apache.fontbox.afm.AFMParser; import org.apache.fontbox.afm.FontMetric; import org.apache.fontbox.cmap.CMapParser; import org.apache.fontbox.cmap.CMap; -import org.apache.pdfbox.encoding.conversion.EncodingConversionManager; -import org.apache.pdfbox.encoding.conversion.EncodingConverter; import org.apache.pdfbox.encoding.AFMEncoding; import org.apache.pdfbox.encoding.DictionaryEncoding; import org.apache.pdfbox.encoding.Encoding; import org.apache.pdfbox.encoding.EncodingManager; +import org.apache.pdfbox.encoding.Type1Encoding; import org.apache.pdfbox.encoding.conversion.CMapSubstitution; import org.apache.pdfbox.cos.COSArray; @@ -69,6 +70,11 @@ public abstract class PDFont implements { /** + * Log instance. + */ + private static final Log log = LogFactory.getLog(PDFont.class); + + /** * The cos dictionary for this font. */ protected COSDictionary font; @@ -82,10 +88,10 @@ public abstract class PDFont implements * This is only used if this is a font object and it has an encoding and it is * a type0 font with a cmap. */ - private CMap cmap = null; + protected CMap cmap = null; - private static Map<COSName, CMap> cmapObjects = - Collections.synchronizedMap( new HashMap<COSName, CMap>() ); + private static Map<String, CMap> cmapObjects = + Collections.synchronizedMap( new HashMap<String, CMap>() ); /** * The static map of the default Adobe font metrics. 
@@ -112,11 +118,13 @@ public abstract class PDFont implements return metrics; } + private static String resourceRootCMAP = "org/apache/pdfbox/resources/cmap/"; + private static String resourceRootAFM = "org/apache/pdfbox/resources/afm/"; + private static void addAdobeFontMetric( Map<String, FontMetric> metrics, String name ) { try { - String resource = - "org/apache/pdfbox/resources/afm/" + name + ".afm"; + String resource = resourceRootAFM + name + ".afm"; InputStream afmStream = ResourceLoader.loadResource( resource ); if( afmStream != null ) { @@ -134,11 +142,6 @@ public abstract class PDFont implements } /** - * This will be set if the font has a toUnicode stream. - */ - private boolean hasToUnicode = false; - - /** * This will clear AFM resources that are stored statically. * This is usually not a problem unless you want to reclaim * resources for a long running process. @@ -171,6 +174,125 @@ public abstract class PDFont implements public PDFont( COSDictionary fontDictionary ) { font = fontDictionary; + determineEncoding(); + } + + private void determineEncoding() + { + String cmapName = null; + COSName encodingName = null; + COSBase toUnicode = font.getDictionaryObject( COSName.TO_UNICODE ); + COSBase encoding = getEncodingObject(); + if( toUnicode != null ) + { + if ( toUnicode instanceof COSStream ) + { + try { + parseCmap(null, ((COSStream)toUnicode).getUnfilteredStream(), null); + } + catch(IOException exception) + { + log.error("Error: Could not load embedded CMAP" ); + } + } + else if ( toUnicode instanceof COSName) + { + encodingName = (COSName)toUnicode; + cmap = cmapObjects.get( encodingName.getName() ); + if (cmap == null) + { + cmapName = encodingName.getName(); + } + } + } + if (encoding != null) + { + if (encoding instanceof COSName) + { + if (cmap == null) + { + encodingName = (COSName)encoding; + cmap = cmapObjects.get( encodingName.getName() ); + if (cmap == null) + { + cmapName = encodingName.getName(); + } + } + if (cmap == null && cmapName 
!= null) + { + EncodingManager manager = getEncodingManager(); + try + { + fontEncoding = manager.getEncoding( encodingName ); + } + catch(IOException exception) + { + log.debug("Debug: Could not find encoding for " + encodingName ); + } + } + } + else if (encoding instanceof COSDictionary) + { + try + { + fontEncoding = new DictionaryEncoding((COSDictionary)encoding); + } + catch(IOException exception) + { + log.error("Error: Could not create the DictionaryEncoding" ); + } + } + else if(encoding instanceof COSStream ) + { + if (cmap == null) + { + COSStream encodingStream = (COSStream)encoding; + try + { + parseCmap( null, encodingStream.getUnfilteredStream(), null ); + } + catch(IOException exception) + { + log.error("Error: Could not parse the embedded CMAP" ); + } + } + } + } + COSDictionary cidsysteminfo = (COSDictionary)font.getDictionaryObject(COSName.CIDSYSTEMINFO); + if (cidsysteminfo != null) + { + String ordering = cidsysteminfo.getString(COSName.ORDERING); + String registry = cidsysteminfo.getString(COSName.REGISTRY); + int supplement = cidsysteminfo.getInt(COSName.SUPPLEMENT); + cmapName = registry + "-" + ordering+ "-" + supplement; + cmapName = CMapSubstitution.substituteCMap( cmapName ); + cmap = cmapObjects.get( cmapName ); + } + FontMetric metric = getAFM(); + if( metric != null ) + { + fontEncoding = new AFMEncoding( metric ); + } + + if (cmap == null && cmapName != null) + { + String resourceName = resourceRootCMAP + cmapName; + try { + parseCmap( resourceRootCMAP, ResourceLoader.loadResource( resourceName ), encodingName ); + if( cmap == null && encodingName == null) + { + log.error("Error: Could not parse predefined CMAP file for '" + cmapName + "'" ); + } + } + catch(IOException exception) + { + log.error("Error: Could not find predefined CMAP file for '" + cmapName + "'" ); + } + } +// if (fontEncoding == null) +// { + getEncodingFromFont(); +// } } /** @@ -316,9 +438,8 @@ public abstract class PDFont implements * * @return The afm object 
from the name. * - * @throws IOException If there is an error getting the AFM object. */ - protected FontMetric getAFM() throws IOException + protected FontMetric getAFM() { if(afm==null){ COSBase baseFont = font.getDictionaryObject( COSName.BASE_FONT ); @@ -379,136 +500,27 @@ public abstract class PDFont implements public String encode( byte[] c, int offset, int length ) throws IOException { String retval = null; - if( isTypeFont() ) + if( cmap != null ) { - if( cmap == null ) + if (length == 1 && cmap.hasOneByteMappings()) { - COSBase toUnicode = font.getDictionaryObject( COSName.TO_UNICODE ); - if( toUnicode instanceof COSStream ) - { - hasToUnicode = true; - parseCmap( null, ((COSStream)toUnicode).getUnfilteredStream(), null ); - } - else - { - COSBase encoding = getEncodingObject(); - if( encoding instanceof COSStream ) - { - COSStream encodingStream = (COSStream)encoding; - parseCmap( null, encodingStream.getUnfilteredStream(), null ); - } - else if( isType0Font() && encoding instanceof COSName ) - { - COSName encodingName = (COSName)encoding; - cmap = cmapObjects.get( encodingName ); - if( cmap == null ) - { - String cmapName = encodingName.getName(); - if (encodingName.getName().equals( COSName.IDENTITY_H.getName() )) - { - COSArray descendantFontArray = - (COSArray)font.getDictionaryObject( COSName.DESCENDANT_FONTS ); - if (descendantFontArray != null) - { - COSDictionary descendantFontDictionary = - (COSDictionary)descendantFontArray.getObject( 0 ); - PDFont descendentFont = PDFontFactory.createFont( descendantFontDictionary ); - COSDictionary cidsysteminfo = - (COSDictionary)descendentFont.font.getDictionaryObject(COSName.CIDSYSTEMINFO); - if (cidsysteminfo != null) - { - String ordering = cidsysteminfo.getString(COSName.ORDERING); - String registry = cidsysteminfo.getString(COSName.REGISTRY); - cmapName = registry + "-" + ordering+"-UCS2"; - } - } - } - else - { - cmapName = CMapSubstitution.substituteCMap( cmapName ); - } - - String resourceRoot = 
"org/apache/pdfbox/resources/cmap/"; - String resourceName = resourceRoot + cmapName; - parseCmap( resourceRoot, ResourceLoader.loadResource( resourceName ), encodingName ); - if( cmap == null && !encodingName.getName().equals( COSName.IDENTITY_H.getName() ) ) - { - throw new IOException( "Error: Could not find predefined " + - "CMAP file for '" + encodingName.getName() + "'" ); - } - } - } - else if( encoding instanceof COSName || - encoding instanceof COSDictionary ) - { - Encoding currentFontEncoding = getEncoding(); - if( currentFontEncoding != null ) - { - retval = currentFontEncoding.getCharacter( getCodeFromArray( c, offset, length ) ); - } - } - else - { - COSDictionary fontDescriptor = - (COSDictionary)font.getDictionaryObject( COSName.FONT_DESC ); - if( isTrueTypeFont() && fontDescriptor != null && - (fontDescriptor.getDictionaryObject( COSName.FONT_FILE )!= null || - fontDescriptor.getDictionaryObject( COSName.FONT_FILE2 ) != null || - fontDescriptor.getDictionaryObject( COSName.FONT_FILE3 ) != null ) ) - { - //If we are using an embedded font then there is not much we can do besides - //return the same character codes. 
- //retval = new String( c,offset, length ); - retval = getStringFromArray( c, offset, length ); - } - else - { - //this case will be handled below after checking the cmap - } - } - } - - + retval = cmap.lookup( c, offset, length ); } - } - if( retval == null && cmap != null ) - { - retval = cmap.lookup( c, offset, length ); - } - - COSBase encodingCOS = getEncodingObject(); - // The converter isn't needed if an unicode mapping is already given by the font dictionary - if ( !hasToUnicode && encodingCOS instanceof COSName ) - { - EncodingConverter converter = EncodingConversionManager.getConverter(((COSName)encodingCOS).getName()); - if ( converter != null ) + else if (length == 2 && cmap.hasTwoByteMappings()) { - if ( retval != null ) - { - retval = converter.convertString(retval); - } - else - { - retval = converter.convertBytes(c, offset, length, cmap); - } - return retval; + retval = cmap.lookup( c, offset, length ); } } - //if we havn't found a value yet and - //we are still on the first byte and - //there is no cmap or the cmap does not have 2 byte mappings then try to encode - //using fallback methods. 
- if( retval == null && - length == 1 && - (cmap == null || !cmap.hasTwoByteMappings())) + // there is no cmap but probably an encoding with a suitable mapping + if( retval == null && length == 1) { Encoding encoding = getEncoding(); if( encoding != null ) { retval = encoding.getCharacter( getCodeFromArray( c, offset, length ) ); } - if( retval == null ) + if( retval == null && cmap == null) { retval = getStringFromArray( c, offset, length ); } @@ -548,16 +560,20 @@ public abstract class PDFont implements return retval; } - private void parseCmap( String cmapRoot, InputStream cmapStream, COSName encodingName ) throws IOException + private void parseCmap( String cmapRoot, InputStream cmapStream, COSName encodingName ) { if( cmapStream != null ) { CMapParser parser = new CMapParser(); - cmap = parser.parse( cmapRoot, cmapStream ); - if( encodingName != null ) + try { - cmapObjects.put( encodingName, cmap ); + cmap = parser.parse( cmapRoot, cmapStream ); + if( encodingName != null ) + { + cmapObjects.put( encodingName.getName(), cmap ); + } } + catch (IOException exception) {} } } @@ -583,61 +599,6 @@ public abstract class PDFont implements */ public Encoding getEncoding() throws IOException { - if( fontEncoding == null ) - { - EncodingManager manager = getEncodingManager(); - COSBase encoding = getEncodingObject(); //font.getDictionaryObject( COSName.ENCODING ); - if( encoding == null ) - { - FontMetric metric = getAFM(); - if( metric != null ) - { - fontEncoding = new AFMEncoding( metric ); - } - if( fontEncoding == null ) - { - fontEncoding = manager.getStandardEncoding(); - } - } - /** - * Si la cl� /Encoding existe dans le dictionnaire fonte il y a deux possibilit�s : - * 1er cas : elle est associ� � une reference contenant un dictionnaire de type encoding. - * Ce dictionnaire PDF est repr�sent� par un DictionaryEncoding. 
- * If the /Encoding Key does exist in the font dictionary, there are two cases : - * case one : The value associated with /Encoding is a reference to a dictionary. - * This dictionary is represented by an instance of DictionaryEncoding class - */ - else if( encoding instanceof COSDictionary ) - { - COSDictionary encodingDic = (COSDictionary)encoding; - //Let's see if the encoding dictionary has a base encoding - //If it does not then we will attempt to get it from the font - //file - COSName baseEncodingName = (COSName) encodingDic.getDictionaryObject( - COSName.BASE_ENCODING); - //on ajoute une entr�e /BaseEncoding dans /Encoding uniquement si elle en est absente - //if not find in Encoding dictinary target, we try to find it from else where - if( baseEncodingName == null) - { - COSName fontEncodingFromFile = getEncodingFromFont(); - encodingDic.setItem( - COSName.BASE_ENCODING, - fontEncodingFromFile ); - } - fontEncoding = new DictionaryEncoding( encodingDic ); - } - else if( encoding instanceof COSName ) - { - if( !encoding.equals( COSName.IDENTITY_H ) ) - { - fontEncoding = manager.getEncoding( (COSName)encoding ); - } - } - else - { - throw new IOException( "Unexpected encoding type:" + encoding.getClass().getName() ); - } - } return fontEncoding; } @@ -653,7 +614,7 @@ public abstract class PDFont implements // Memorized values to avoid repeated dictionary lookups private String subtype = null; - private boolean type0Font; + private boolean type1Font; private boolean trueTypeFont; private boolean typeFont; @@ -666,16 +627,16 @@ public abstract class PDFont implements { if (subtype == null) { subtype = font.getNameAsString( COSName.SUBTYPE ); - type0Font = "Type0".equals(subtype); + type1Font = "Type1".equals(subtype); trueTypeFont = "TrueType".equals(subtype); - typeFont = type0Font || "Type1".equals(subtype) || trueTypeFont; + typeFont = type1Font || "Type0".equals(subtype) || trueTypeFont; } return subtype; } - private boolean isType0Font() { + private 
boolean isType1Font() { getSubType(); - return type0Font; + return type1Font; } private boolean isTrueTypeFont() { @@ -799,80 +760,85 @@ public abstract class PDFont implements } /** - * Try to get the encoding for the font and add it to the target - * the target must be an an Encoding Dictionary. + * Tries to get the encoding for the type1 font. * - * added by Christophe Huault : DGBS Strasbourg hua...@free.fr october 2004 - * - * @return The encoding from the font. - * - * @throws IOException If there is an error reading the file. */ - private COSName getEncodingFromFont() throws IOException - { - //This whole section of code needs to be replaced with an actual - //type1 font parser!! - - - COSName retvalue = null; - //recuperer le programme de fonte dans son stream qui doit se trouver - //dans le flux r�f�renc� par � la cl� FileFont lui m�me situ� dans - //le dictionnaire associ� � /FontDescriptor du dictionnaire de type /Font courrant - //get the font program in the stream which should be located in - //the /FileFont Stream object himself in the /FontDescriptior of the current - //font dictionary - COSDictionary fontDescriptor = (COSDictionary) font.getDictionaryObject( - COSName.FONT_DESC); - if( fontDescriptor != null ) - { - COSStream fontFile = (COSStream) fontDescriptor.getDictionaryObject( - COSName.FONT_FILE); - if( fontFile != null ) - { - BufferedReader in = - new BufferedReader(new InputStreamReader(fontFile.getUnfilteredStream())); - /** - * this section parse the FileProgram stream searching for a /Encoding entry - * the research stop if the entry "currentdict end" is reach or after 100 lignes - */ - StringTokenizer st = null; - boolean found = false; - String line = ""; - String key = null; - for( int i = 0; null!=( line = in.readLine() ) && - i < 40 && - !line.equals("currentdict end") - && !found; i++) + private void getEncodingFromFont() + { + // This whole section of code needs to be replaced with an actual type1 font parser!! 
+ // Get the font program from the embedded type font. + if (isType1Font()) { + COSDictionary fontDescriptor = (COSDictionary) font.getDictionaryObject( + COSName.FONT_DESC); + if( fontDescriptor != null ) + { + COSStream fontFile = (COSStream) fontDescriptor.getDictionaryObject( + COSName.FONT_FILE); + if( fontFile != null ) { - st = new StringTokenizer(line); - if( st.hasMoreTokens() ) + try { - key = st.nextToken(); - if(key.equals("/Encoding") && st.hasMoreTokens() ) + BufferedReader in = + new BufferedReader(new InputStreamReader(fontFile.getUnfilteredStream())); + + // this section parses the font program stream searching for a /Encoding entry + // if it contains an array of values a Type1Encoding will be returned + // if it encoding contains an encoding name the corresponding Encoding will be returned + String line = ""; + Type1Encoding encoding = null; + while( (line = in.readLine()) != null) { - COSName value = COSName.getPDFName( st.nextToken() ); - found = true; - if( value.equals( COSName.MAC_ROMAN_ENCODING ) || - value.equals( COSName.PDF_DOC_ENCODING ) || - value.equals( COSName.STANDARD_ENCODING ) || - value.equals( COSName.WIN_ANSI_ENCODING ) ) + if (line.startsWith("currentdict end")) { + if (encoding != null) + fontEncoding = encoding; + break; + } + if (line.startsWith("/Encoding")) { - //value is expected to be one of the encodings - //ie. 
StandardEncoding,WinAnsiEncoding,MacRomanEncoding,PDFDocEncoding - retvalue = value; + if(line.endsWith("array")) + { + StringTokenizer st = new StringTokenizer(line); + // ignore the first token + st.nextElement(); + int arraySize = Integer.parseInt(st.nextToken()); + encoding = new Type1Encoding(arraySize); + } + // if there is already an encoding, we don't need to + // assign another one + else if (fontEncoding == null) + { + StringTokenizer st = new StringTokenizer(line); + // ignore the first token + st.nextElement(); + String type1Encoding = st.nextToken(); + fontEncoding = getEncodingManager().getEncoding(COSName.getPDFName(type1Encoding)); + break; + } + } + else if (line.startsWith("dup")) { + StringTokenizer st = new StringTokenizer(line); + // ignore the first token + st.nextElement(); + int index = Integer.parseInt(st.nextToken()); + String name = st.nextToken(); + encoding.addCharacterEncoding(index, name.replace("/", "")); } } + in.close(); + } + catch(IOException exception) + { + log.error("Error: Could not extract the encoding from the embedded type1 font."); } } } } - return retvalue; } /** - * This will get the fonts bouding box. + * This will get the fonts bounding box. * - * @return The fonts bouding box. + * @return The fonts bounding box. * * @throws IOException If there is an error getting the bounding box. 
*/ Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java?rev=992066&r1=992065&r2=992066&view=diff ============================================================================== --- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java (original) +++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java Thu Sep 2 19:17:26 2010 @@ -307,14 +307,7 @@ public class TestTextStripper extends Te } if (!stringsEqual(expectedLine, actualLine)) { - // PDFBOX-568: testextract failure on Linux and Mac OS X - // Don't flag a test failure that we already know about. - // TODO: Remove this check once PDFBOX-568 is fixed. - if (!"sample_fonts_solidconvertor.pdf".equals(inFile.getName())) - { - this.bFail = true; - } - + this.bFail = true; log.error("FAILURE: Line mismatch for file " + inFile.getName() + " ( sort = "+bSort+")" + " at expected line: " + expectedReader.getLineNumber() + Modified: pdfbox/trunk/pdfbox/src/test/resources/input/cweb.pdf-sorted.txt URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/cweb.pdf-sorted.txt?rev=992066&r1=992065&r2=992066&view=diff ============================================================================== Binary files - no diff available. Modified: pdfbox/trunk/pdfbox/src/test/resources/input/cweb.pdf.txt URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/cweb.pdf.txt?rev=992066&r1=992065&r2=992066&view=diff ============================================================================== Binary files - no diff available. 
Modified: pdfbox/trunk/pdfbox/src/test/resources/input/sample_fonts_solidconvertor.pdf-sorted.txt URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/sample_fonts_solidconvertor.pdf-sorted.txt?rev=992066&r1=992065&r2=992066&view=diff ============================================================================== Binary files - no diff available. Modified: pdfbox/trunk/pdfbox/src/test/resources/input/sample_fonts_solidconvertor.pdf.txt URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/sample_fonts_solidconvertor.pdf.txt?rev=992066&r1=992065&r2=992066&view=diff ============================================================================== Binary files - no diff available.