I am trying to retrieve text from a PDF file and I am having difficulties with (I guess) optional content. I placed a screenshot of the file opened in Adobe Pro here:
http://imageshack.us/f/863/samplereport.jpg/ Adobe shows the "hidden" text and also successfully exports the file to "text" format. I am trying to use iTextSharp to do the same, but I can't extract the text that is visible in Adobe. Any help/advice is greatly appreciated.

// Tries two different iTextSharp approaches for pulling text out of a PDF file.
internal class PageReader
{
    // Runs both extraction experiments against the PDF at `pagepath`.
    public void readPage(String pagepath)
    {
        test1(pagepath); // returns non-ASCII result
        test2(pagepath); // returns only the last line (I guess it is "direct content")
    }

    // Approach 1: tokenise the bytes returned by GetPageContent for each page
    // and concatenate the value of every STRING token.
    // The collected text is only held in a local (inspect it in a debugger).
    private void test1(String pagepath)
    {
        PdfReader reader = new PdfReader(pagepath);
        try
        {
            System.Text.StringBuilder textres = new System.Text.StringBuilder();
            for (int i = 1; i <= reader.NumberOfPages; ++i)
            {
                // Read page i (the original always re-read page 1 here).
                byte[] pageContent = reader.GetPageContent(i);
                if (pageContent == null)
                {
                    return;
                }

                PRTokeniser tokenizer = new PRTokeniser(pageContent);
                while (tokenizer.NextToken())
                {
                    if (tokenizer.TokenType == PRTokeniser.TokType.STRING)
                    {
                        textres.Append(tokenizer.StringValue);
                    }
                }
            }
        }
        finally
        {
            // Always close the reader, including on the early return above.
            reader.Close();
        }
    }

    // Approach 2: run the content parser over each page with a custom render
    // listener and concatenate the text the listener received.
    // The collected text is only held in a local (inspect it in a debugger).
    private void test2(String pagepath)
    {
        PdfReader reader = new PdfReader(pagepath);
        try
        {
            // Alternative that was also tried:
            // String str = PdfTextExtractor.GetTextFromPage(reader, 1);
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            System.Text.StringBuilder str = new System.Text.StringBuilder();
            for (int i = 1; i <= reader.NumberOfPages; ++i)
            {
                // Use a fresh listener per page: a shared listener keeps the
                // text of earlier pages, which would then be appended again
                // on every later iteration.
                TextExtractionStrategy strategy = new TextExtractionStrategy();
                parser.ProcessContent<TextExtractionStrategy>(i, strategy);
                str.Append(strategy.txt);
            }
        }
        finally
        {
            reader.Close();
        }
    }
}

// Render listener that accumulates the text of every RenderText callback,
// in the order the callbacks arrive. Images and text-block boundaries are ignored.
internal class TextExtractionStrategy : IRenderListener
{
    // Initialised up front so `txt` is an empty string (not null) when no
    // text has been rendered.
    private readonly System.Text.StringBuilder _str = new System.Text.StringBuilder();

    public void BeginTextBlock()
    {
    }

    public void EndTextBlock()
    {
    }

    public void RenderImage(ImageRenderInfo renderInfo)
    {
    }

    // Appends the text carried by this render event.
    public void RenderText(TextRenderInfo renderInfo)
    {
        _str.Append(renderInfo.GetText());
    }

    // All text received so far, concatenated without separators.
    public String txt
    {
        get { return _str.ToString(); }
    }
}

------------------------------------------------------------------------------ What Every C/C++ and Fortran developer Should Know! Read this article and learn how Intel has extended the reach of its next-generation tools to help Windows* and Linux* C/C++ and Fortran developers boost performance applications - including clusters. 
http://p.sf.net/sfu/intel-dev2devmay _______________________________________________ iText-questions mailing list [email protected] https://lists.sourceforge.net/lists/listinfo/itext-questions iText(R) is a registered trademark of 1T3XT BVBA. Many questions posted to this list can (and will) be answered with a reference to the iText book: http://www.itextpdf.com/book/ Please check the keywords list before you ask for examples: http://itextpdf.com/themes/keywords.php
