I am trying to retrieve text from a PDF file and I am having difficulties 
with (I guess) optional content. I placed a screenshot of the file opened in 
Adobe Pro here:

http://imageshack.us/f/863/samplereport.jpg/

Adobe shows the "hidden" text and also successfully exports the file to "text" 
format. I am trying to use iTextSharp to do the same, but I can't extract the 
text that is visible in Adobe. 

Any help/advice is greatly appreciated. 

    /// <summary>
    /// Experiments with two ways of pulling text out of a PDF with iTextSharp:
    /// tokenising the raw page content stream, and running the content parser
    /// with a custom render listener.
    /// </summary>
    internal class PageReader
    {
        /// <summary>
        /// Runs both extraction experiments against the given PDF file.
        /// </summary>
        /// <param name="pagepath">Path of the PDF file to read.</param>
        public void readPage(String pagepath)
        {
            // The extracted text is not consumed here; inspect the helpers'
            // return values in a debugger or capture them as needed.
            test1(pagepath); // returns non-ASCII result
            test2(pagepath); // returns only the last line (I guess it is "direct content")
        }

        /// <summary>
        /// Concatenates every string token found in the raw content stream of
        /// each page of the document.
        /// </summary>
        /// <param name="pagepath">Path of the PDF file to read.</param>
        /// <returns>The concatenated string tokens of all pages.</returns>
        private String test1(String pagepath)
        {
            PdfReader reader = new PdfReader(pagepath);
            try
            {
                System.Text.StringBuilder textres = new System.Text.StringBuilder();
                for (int i = 1; i <= reader.NumberOfPages; ++i)
                {
                    // Read page i (the original always read page 1).
                    byte[] pageContent = reader.GetPageContent(i);
                    if (pageContent == null)
                    {
                        // A page without content must not abort the remaining pages.
                        continue;
                    }

                    // NOTE(review): string tokens are taken as stored in the content
                    // stream, without applying the current font's encoding; this is
                    // presumably why the result looks non-ASCII. Confirm.
                    PRTokeniser tokenizer = new PRTokeniser(pageContent);
                    while (tokenizer.NextToken())
                    {
                        if (tokenizer.TokenType == PRTokeniser.TokType.STRING)
                        {
                            textres.Append(tokenizer.StringValue);
                        }
                    }
                }

                return textres.ToString();
            }
            finally
            {
                // Release the underlying file even if parsing throws.
                reader.Close();
            }
        }

        /// <summary>
        /// Runs the content parser over every page with a
        /// <see cref="TextExtractionStrategy"/> listener and concatenates the
        /// text each page reports.
        /// </summary>
        /// <param name="pagepath">Path of the PDF file to read.</param>
        /// <returns>The concatenated text of all pages.</returns>
        private String test2(String pagepath)
        {
            PdfReader reader = new PdfReader(pagepath);
            try
            {
                PdfReaderContentParser parser = new PdfReaderContentParser(reader);
                System.Text.StringBuilder str = new System.Text.StringBuilder();
                for (int i = 1; i <= reader.NumberOfPages; ++i)
                {
                    // Use a fresh listener per page: the listener accumulates text,
                    // so reusing one instance would append earlier pages again.
                    TextExtractionStrategy strategy = new TextExtractionStrategy();
                    parser.ProcessContent<TextExtractionStrategy>(i, strategy);
                    str.Append(strategy.txt);
                }

                return str.ToString();
            }
            finally
            {
                // Release the underlying file even if parsing throws.
                reader.Close();
            }
        }
    }

    /// <summary>
    /// Render listener that concatenates the text of every text chunk passed
    /// to <see cref="RenderText"/>, in the order the chunks are received.
    /// </summary>
    internal class TextExtractionStrategy : IRenderListener
    {
        // Buffer for the collected text; a StringBuilder avoids re-copying the
        // whole string on every rendered chunk.
        private readonly System.Text.StringBuilder _str = new System.Text.StringBuilder();

        /// <summary>No-op; text block boundaries are not tracked.</summary>
        public void BeginTextBlock() { }

        /// <summary>No-op; text block boundaries are not tracked.</summary>
        public void EndTextBlock() { }

        /// <summary>No-op; images are ignored.</summary>
        /// <param name="renderInfo">Image render information (unused).</param>
        public void RenderImage(ImageRenderInfo renderInfo) { }

        /// <summary>
        /// Appends the text of the rendered chunk to the collected text.
        /// </summary>
        /// <param name="renderInfo">Render information for the text chunk.</param>
        public void RenderText(TextRenderInfo renderInfo)
        {
            _str.Append(renderInfo.GetText());
        }

        /// <summary>
        /// All text collected so far; an empty string (never null) when no
        /// text has been rendered.
        /// </summary>
        public String txt { get { return _str.ToString(); } }
    }



------------------------------------------------------------------------------
What Every C/C++ and Fortran developer Should Know!
Read this article and learn how Intel has extended the reach of its 
next-generation tools to help Windows* and Linux* C/C++ and Fortran 
developers boost performance applications - including clusters. 
http://p.sf.net/sfu/intel-dev2devmay
_______________________________________________
iText-questions mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/itext-questions

iText(R) is a registered trademark of 1T3XT BVBA.
Many questions posted to this list can (and will) be answered with a reference 
to the iText book: http://www.itextpdf.com/book/
Please check the keywords list before you ask for examples: 
http://itextpdf.com/themes/keywords.php

Reply via email to