[iText-questions] Images extract problem

dpreznik Thu, 09 Dec 2010 00:38:49 -0800

Dear experts,

I am trying to use iTextSharp to extract images from a PDF. Some images are
extracted fine, but many come out as negatives (color-inverted versions) of the
original images. Could you please help me with this?
Here is the code I am using:
        /// <summary>
        /// Walks every page of a PDF, collects the images found through
        /// <c>getAllImages</c>, and saves each one as
        /// "Page_{page}_Image_{index}.png" inside <paramref name="outputPath"/>.
        /// </summary>
        /// <param name="sourcePdf">Value handed to the <c>PdfReader</c> constructor to open the document.</param>
        /// <param name="outputPath">Directory that receives the PNG files; created before the first save if missing.</param>
        public static void ExtractImagesFromPDF(string sourcePdf, string outputPath)
        {
            PdfReader pdf = new PdfReader(sourcePdf);

            try
            {
                // The encoder does not depend on the page or the image, so resolve it once.
                // GetImageEncoder is defined next to this method.
                System.Drawing.Imaging.ImageCodecInfo pngEncoder = GetImageEncoder("PNG");

                for (int pageNumber = 1; pageNumber <= pdf.NumberOfPages; pageNumber++)
                {
                    PdfDictionary pg = pdf.GetPageN(pageNumber);

                    var pageImages = new List<byte[]>();
                    getAllImages(pg, pageImages, pdf);

                    int imgNum = 1;
                    foreach (byte[] bytes in pageImages)
                    {
                        // The image must be saved while its backing stream is still open,
                        // so the stream, the image and the encoder parameters share one
                        // scope and are all disposed together afterwards.
                        using (System.IO.MemoryStream memStream = new System.IO.MemoryStream(bytes))
                        using (System.Drawing.Image img = System.Drawing.Image.FromStream(memStream))
                        using (System.Drawing.Imaging.EncoderParameters parms = new System.Drawing.Imaging.EncoderParameters(1))
                        {
                            // CreateDirectory does nothing when the directory already exists.
                            System.IO.Directory.CreateDirectory(outputPath);

                            string path = System.IO.Path.Combine(
                                outputPath,
                                String.Format(@"Page_{0}_Image_{1}.png", pageNumber, imgNum));

                            parms.Param[0] = new System.Drawing.Imaging.EncoderParameter(
                                System.Drawing.Imaging.Encoder.Compression, 0);

                            img.Save(path, pngEncoder, parms);
                        }
                        imgNum++;
                    }
                }
            }
            finally
            {
                // Always release the reader, whether or not extraction succeeded.
                pdf.Close();
            }
        }

        /// <summary>
        /// Searches the image encoders returned by
        /// <c>ImageCodecInfo.GetImageEncoders()</c> for one whose
        /// <c>FormatDescription</c> equals <paramref name="imageType"/>,
        /// ignoring case.
        /// </summary>
        /// <param name="imageType">Format description to look for, e.g. "PNG".</param>
        /// <returns>The first matching codec, or <c>null</c> when none matches.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="imageType"/> is <c>null</c>.</exception>
        public static System.Drawing.Imaging.ImageCodecInfo GetImageEncoder(string imageType)
        {
            if (imageType == null)
            {
                throw new ArgumentNullException("imageType");
            }

            foreach (ImageCodecInfo info in ImageCodecInfo.GetImageEncoders())
            {
                // Compare case-insensitively on both sides instead of upper-casing only
                // the argument, so a codec whose description is not all upper case
                // can still be found.
                if (string.Equals(info.FormatDescription, imageType, StringComparison.OrdinalIgnoreCase))
                {
                    return info;
                }
            }

            return null;
        }

        /// <summary>
        /// Appends to <paramref name="images"/> the bytes of every image XObject
        /// listed in the resources of <paramref name="dict"/>, recursing into
        /// XObjects whose subtype is Form or Group.
        /// </summary>
        /// <remarks>
        /// Images whose filter is exactly "/FlateDecode" are inflated and re-encoded
        /// as PNG; every other image is added as the raw stream bytes.
        /// </remarks>
        /// <param name="dict">Page or XObject dictionary whose resources are inspected.</param>
        /// <param name="images">List that receives one byte array per image found.</param>
        /// <param name="doc">Reader used to look up the image stream by object number.</param>
        private static void getAllImages(PdfDictionary dict, List<byte[]> images, PdfReader doc)
        {
            PdfDictionary res = (PdfDictionary)PdfReader.GetPdfObject(dict.Get(PdfName.RESOURCES));
            if (res == null)
            {
                // No resource dictionary means there are no XObjects to look at.
                return;
            }

            PdfDictionary xobj = (PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT));
            if (xobj == null)
            {
                return;
            }

            foreach (PdfName name in xobj.Keys)
            {
                PdfObject obj = xobj.Get(name);
                if (!obj.IsIndirect())
                {
                    continue;
                }

                PdfDictionary tg = (PdfDictionary)PdfReader.GetPdfObject(obj);
                PdfName subtype = (PdfName)PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE));

                if (PdfName.IMAGE.Equals(subtype))
                {
                    int xrefIdx = ((PRIndirectReference)obj).Number;
                    PRStream stream = (PRStream)doc.GetPdfObject(xrefIdx);
                    byte[] bytes = PdfReader.GetStreamBytesRaw(stream);

                    // An image stream may carry no filter entry at all; treat that as
                    // "not FlateDecode" instead of dereferencing null.
                    PdfObject filterObj = tg.Get(PdfName.FILTER);
                    string filter = filterObj == null ? null : filterObj.ToString();

                    if (filter == "/FlateDecode")
                    {
                        bytes = ConvertFlateImageToPng(tg, bytes);
                    }

                    images.Add(bytes);
                }
                else if (PdfName.FORM.Equals(subtype) || PdfName.GROUP.Equals(subtype))
                {
                    getAllImages(tg, images, doc);
                }
            }
        }

        // Inflates a FlateDecode image stream and re-encodes its pixel rows as a PNG file image.
        private static byte[] ConvertFlateImageToPng(PdfDictionary image, byte[] raw)
        {
            int width = Int32.Parse(image.Get(PdfName.WIDTH).ToString());
            int height = Int32.Parse(image.Get(PdfName.HEIGHT).ToString());
            string bpp = image.Get(PdfName.BITSPERCOMPONENT).ToString();
            int bitsPerPixel = int.Parse(bpp);

            // NOTE(review): the BitsPerComponent value is used here as if it were bits
            // per pixel, and the image's /ColorSpace and /Decode entries are never read.
            // Presumably that is why some extracted images come out inverted or with
            // wrong colors - confirm against the PDF specification before relying on it.
            System.Drawing.Imaging.PixelFormat pixelFormat;
            switch (bitsPerPixel)
            {
                case 1:
                    pixelFormat = System.Drawing.Imaging.PixelFormat.Format1bppIndexed;
                    break;
                case 8:
                    pixelFormat = System.Drawing.Imaging.PixelFormat.Format8bppIndexed;
                    break;
                case 16:
                    pixelFormat = System.Drawing.Imaging.PixelFormat.Format16bppArgb1555;
                    break;
                case 24:
                    pixelFormat = System.Drawing.Imaging.PixelFormat.Format24bppRgb;
                    break;
                case 48:
                    pixelFormat = System.Drawing.Imaging.PixelFormat.Format48bppRgb;
                    break;
                default:
                    throw new NotSupportedException("Unknown pixel format " + bpp);
            }

            byte[] pixels = PdfReader.FlateDecode(raw, true);

            // Source rows are packed to whole bytes, while bitmap rows are bmd.Stride
            // bytes apart, so the data has to be copied one row at a time.
            int srcRowBytes = (width * bitsPerPixel + 7) / 8;

            using (System.Drawing.Bitmap bmp = new System.Drawing.Bitmap(width, height, pixelFormat))
            {
                System.Drawing.Imaging.BitmapData bmd = bmp.LockBits(
                    new System.Drawing.Rectangle(0, 0, width, height),
                    System.Drawing.Imaging.ImageLockMode.WriteOnly,
                    pixelFormat);
                try
                {
                    for (int y = 0; y < height; y++)
                    {
                        int srcOffset = y * srcRowBytes;
                        if (srcOffset + srcRowBytes > pixels.Length)
                        {
                            // Decoded data is shorter than the declared size; stop
                            // rather than read past the end of the array.
                            break;
                        }

                        IntPtr destRow = new IntPtr(bmd.Scan0.ToInt64() + (long)y * bmd.Stride);
                        System.Runtime.InteropServices.Marshal.Copy(pixels, srcOffset, destRow, srcRowBytes);
                    }
                }
                finally
                {
                    bmp.UnlockBits(bmd);
                }

                using (var ms = new System.IO.MemoryStream())
                {
                    bmp.Save(ms, System.Drawing.Imaging.ImageFormat.Png);
                    // ToArray returns only the bytes written; GetBuffer would also
                    // expose the unused tail of the internal buffer.
                    return ms.ToArray();
                }
            }
        }
-- 
View this message in context: 
http://itext-general.2136553.n4.nabble.com/Images-extract-problem-tp3079140p3079140.html
Sent from the iText - General mailing list archive at Nabble.com.

------------------------------------------------------------------------------
This SF Dev2Dev email is sponsored by:

WikiLeaks The End of the Free Internet
http://p.sf.net/sfu/therealnews-com
_______________________________________________
iText-questions mailing list
iText-questions@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/itext-questions

Many questions posted to this list can (and will) be answered with a reference 
to the iText book: http://www.itextpdf.com/book/
Please check the keywords list before you ask for examples: 
http://itextpdf.com/themes/keywords.php

[iText-questions] Images extract problem

Reply via email to