[
https://issues.apache.org/jira/browse/PDFBOX-1507?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13575655#comment-13575655
]
Tanmay Mandal commented on PDFBOX-1507:
---------------------------------------
Hello Andreas,
It it's happening for all PDF no matter what kind of PDF it is,
I think i should tell you some thing , i am coding on .NET so i used
ikvm-7.2.4630.5 conversion to get the jar file to exe and using it in my
console application, i can attached project but did not find any option to
browse and upload, here is my console app code
---------------------------------------------------------------------------------------------------------------------------------------------------------------------
using System;
using System.Collections;
using System.Collections.Generic;
using System.Text;
using org.apache.pdfbox.pdmodel;
using org.apache.pdfbox.exceptions;
using org.apache.pdfbox.util;
using java.lang;
namespace org.apache.pdfbox.examples.util
{
using InvalidPasswordException =
org.apache.pdfbox.exceptions.InvalidPasswordException;
using PDDocument = org.apache.pdfbox.pdmodel.PDDocument;
using PDPage = org.apache.pdfbox.pdmodel.PDPage;
using PDStream = org.apache.pdfbox.pdmodel.common.PDStream;
using PDFTextStripper = org.apache.pdfbox.util.PDFTextStripper;
using TextPosition = org.apache.pdfbox.util.TextPosition;
using System.IO;
/// <summary>
/// This is an example on how to get some x/y coordinates of text.
///
/// Usage: java org.apache.pdfbox.examples.util.PrintWordLocations
<input-pdf>
///
/// @author <a href="mailto:[email protected]">Ben Litchfield</a>
/// @version $Revision: 1.7 $
/// </summary>
//public class PrintWordLocations : org.apache.pdfbox.util.PDFTextStripper
public class PrintWordLocations : org.apache.pdfbox.util.PDFTextStripper
{
public class WordBox
{
private readonly PrintWordLocations outerInstance;
public float _xmin;
public float _ymin;
public float _fontsize;
public float _xscale;
public float _yscale;
public float _height;
public float _width;
public WordBox(PrintWordLocations outerInstance, TextPosition text)
{
this.outerInstance = outerInstance;
_xmin = text.getXDirAdj();
_ymin = text.getYDirAdj();
_fontsize = text.getFontSize();
_xscale = text.getXScale();
_yscale = text.getYScale();
_height = text.getHeightDir();
_width = text.getWidthDirAdj();
}
public virtual bool rejects(TextPosition text)
{
return (text.getXDirAdj() < _xmin) || (text.getYDirAdj() +
text.getWidthOfSpace() < _ymin);
}
public virtual bool accepts(TextPosition text)
{
return !rejects(text);
}
public virtual void extendBy(TextPosition text)
{
float current_xmin = _xmin;
float current_xmax = _xmin + _width;
float current_ymin = _ymin;
float current_ymax = _ymin + _height;
float text_xmin = text.getXDirAdj();
float text_xmax = text_xmin + text.getWidthDirAdj();
float text_ymin = text.getYDirAdj();
float text_ymax = text_ymin + text.getHeightDir();
float new_xmin = java.lang.Math.min(current_xmin, text_xmin);
float new_xmax = java.lang.Math.max(current_xmax, text_xmax);
float new_ymin = java.lang.Math.min(current_ymin, text_ymin);
float new_ymax = java.lang.Math.max(current_ymax, text_ymax);
_xmin = new_xmin;
_width = new_xmax - new_xmin;
_ymin = new_ymin;
_height = new_ymax - new_ymin;
}
}
protected internal java.lang.StringBuilder word = new
java.lang.StringBuilder("");
protected internal char? last_character = new char?('\0');
protected internal LinkedList<WordBox> box_list = new
LinkedList<WordBox>();
/// <summary>
/// Default constructor.
/// </summary>
/// <exception cref="IOException"> If there is an error loading text
stripper properties. </exception>
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not
available in .NET:
//ORIGINAL LINE: public PrintWordLocations() throws java.io.IOException
public PrintWordLocations()
{
try
{
base.setSortByPosition(true);
}
catch (System.Exception ex)
{
Console.Error.WriteLine(ex.ToString());
}
//base.SortByPosition = true;
}
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not
available in .NET:
//ORIGINAL LINE: public void processDocuments(String[] args) throws
Exception
public virtual void processDocuments(string[] args)
{
if (args.Length != 0)
{
usage();
}
else
{
PDDocument document = null;
string InputFilePath = @"D:\Projects\PDF2Alto\Sample
Project\Pdf2Text\ConsoleApplication1\temp\fmp000000426_0541.pdf";
string OutputFilePath = @"D:\Projects\PDF2Alto\Sample
Project\Pdf2Text\ConsoleApplication1\temp\test.txt";
TextWriter tsw = new StreamWriter(OutputFilePath,true);
try
{
//document = PDDocument.load(args[0]);
document = PDDocument.load(InputFilePath);
if (document.isEncrypted())
{
try
{
document.decrypt("");
}
catch (InvalidPasswordException e)
{
Console.Error.WriteLine("Error: Document is
encrypted with a password.");
Environment.Exit(1);
}
}
PrintWordLocations printer = new PrintWordLocations();
//PrintTextLocations printer = new PrintTextLocations();
//IList allPages = document.getDocumentCatalog.AllPages;
IList allPages =
document.getDocumentCatalog().getAllPages().toArray();
//Console.WriteLine("<?xml version=\"1.0\"
encoding=\"UTF-8\"?><alto
xmlns=\"http://www.loc.gov/standards/alto/alto-v2.0.xsd\"><Description><MeasurementUnit>inch1200</MeasurementUnit></Description><Layout>");
tsw.WriteLine("<?xml version=\"1.0\"
encoding=\"UTF-8\"?><alto
xmlns=\"http://www.loc.gov/standards/alto/alto-v2.0.xsd\"><Description><MeasurementUnit>inch1200</MeasurementUnit></Description><Layout>");
for (int i = 0; i < allPages.Count; i++)
{
PDPage page = (PDPage)allPages[i];
//Console.WriteLine("<Page>");
//Console.WriteLine("<PrintSpace>");
//Console.WriteLine("<TextBlock>");
//Console.WriteLine("<TextLine>");
tsw.WriteLine("<Page>");
tsw.WriteLine("<PrintSpace>");
tsw.WriteLine("<TextBlock>");
tsw.WriteLine("<TextLine>");
PDFStreamEngine engine1 = new PDFStreamEngine();
PDStream contents = page.getContents();
if (contents != null)
{
//printer.processStream(page, page.findResources(),
page.Contents.Stream);
// engine1.processStream(page, page.findResources(),
page.getContents().getStream());
printer.processStream(page, page.findResources(),
page.getContents().getStream());
// printer.processTextPosition(page.gettext);
}
endOfPage(tsw);
//Console.WriteLine("</TextLine>");
//Console.WriteLine("</TextBlock>");
//Console.WriteLine("</PrintSpace>");
//Console.WriteLine("</Page>");
tsw.WriteLine("</TextLine>");
tsw.WriteLine("</TextBlock>");
tsw.WriteLine("</PrintSpace>");
tsw.WriteLine("</Page>");
}
tsw.WriteLine("</Layout></alto>");
}
finally
{
if (document != null)
{
document.close();
}
}
}
}
/// <summary>
/// This will print the documents data.
/// </summary>
/// <param name="args"> The command line arguments.
/// </param>
/// <exception cref="Exception"> If there is an error parsing the
document. </exception>
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not
available in .NET:
//ORIGINAL LINE: public static void main(String[] args) throws Exception
[STAThread]
public static void Main(string[] args)
{
PrintWordLocations handler = new PrintWordLocations();
handler.processDocuments(args);
}
/// <summary>
/// A method provided as an event interface to allow a subclass to
perform
/// some specific functionality when text needs to be processed.
/// </summary>
/// <param name="text"> The text to be processed </param>
protected internal virtual void processTextPosition(TextPosition text)
{
char current_character = text.getCharacter().ToLower()[0];
if (endsWord(current_character))
{
emitWordBoxes();
}
else
{
if (box_list.Count == 0)
{
if (isAlnumOrApostrophe(current_character))
{
word = word.append(current_character);
}
box_list.AddFirst(new WordBox(this, text));
}
else if (box_list.Last.Value.accepts(text))
{
if (isAlnumOrApostrophe(current_character))
{
word = word.append(current_character);
}
box_list.Last.Value.extendBy(text);
}
else
{
if (!isHyphen(last_character))
{
emitWordBoxes();
}
if (isAlnumOrApostrophe(current_character))
{
word = word.append(current_character);
}
box_list.AddFirst(new WordBox(this, text));
}
}
last_character = current_character;
}
protected internal virtual void endOfPage(TextWriter tsw)
{
//if (box_list.Count > 0)
//{
// emitWordBoxes(tsw);
//}
emitWordBoxes(tsw);
}
protected internal virtual void emitWordBoxes()
{
float pointsToInch1200 = (float)16.6666;
float mysteryHeightScale = (float)1.5;
float height;
float width;
float hpos;
float vpos;
if (word.ToString().Trim().Length > 0)
{
foreach (WordBox wordbox in box_list)
{
width = wordbox._width * pointsToInch1200;
height = wordbox._height * pointsToInch1200 *
mysteryHeightScale;
hpos = wordbox._xmin * pointsToInch1200;
vpos = wordbox._ymin * pointsToInch1200 - height;
Console.WriteLine("<String HEIGHT=\"" + height + "\"
WIDTH=\"" + width + "\" HPOS=\"" + hpos + "\" VPOS=\"" + vpos + "\" CONTENT=\""
+ word.ToString().Trim() + "\"/>");
//tsw.Write("<String HEIGHT=\"" + height + "\" WIDTH=\"" +
width + "\" HPOS=\"" + hpos + "\" VPOS=\"" + vpos + "\" CONTENT=\"" +
word.ToString().Trim() + "\"/>");
}
}
//word = new StringBuilder("");
word = new java.lang.StringBuilder("");
last_character = new char?('\0');
box_list.Clear();
}
protected internal virtual void emitWordBoxes(TextWriter tsw)
{
float pointsToInch1200 = (float)16.6666;
float mysteryHeightScale = (float)1.5;
float height;
float width;
float hpos;
float vpos;
if (word.ToString().Trim().Length > 0)
{
foreach (WordBox wordbox in box_list)
{
width = wordbox._width * pointsToInch1200;
height = wordbox._height * pointsToInch1200 *
mysteryHeightScale;
hpos = wordbox._xmin * pointsToInch1200;
vpos = wordbox._ymin * pointsToInch1200 - height;
//Console.WriteLine("<String HEIGHT=\"" + height + "\"
WIDTH=\"" + width + "\" HPOS=\"" + hpos + "\" VPOS=\"" + vpos + "\" CONTENT=\""
+ word.ToString().Trim() + "\"/>");
tsw.Write("<String HEIGHT=\"" + height + "\" WIDTH=\"" +
width + "\" HPOS=\"" + hpos + "\" VPOS=\"" + vpos + "\" CONTENT=\"" +
word.ToString().Trim() + "\"/>");
}
}
//word = new StringBuilder("");
word = new java.lang.StringBuilder("");
last_character = new char?('\0');
box_list.Clear();
}
protected internal virtual bool endsWord(char ch)
{
return !(isAlnumOrApostrophe(ch) || isHyphen(ch));
}
protected internal virtual bool isAlnumOrApostrophe(char ch)
{
return char.IsLetterOrDigit(ch) || (ch == '\'');
}
protected internal virtual bool isHyphen(char? ch)
{
return ch == '-';
}
/// <summary>
/// This will print the usage for this document.
/// </summary>
private static void usage()
{
Console.Error.WriteLine("Usage: java
org.apache.pdfbox.examples.pdmodel.PrintWordLocations <input-pdf>");
}
}
}
------------------------------------------------------------------------------------------------------------------------------------------------------
problem is it is not calling "processTextPosition" and C# said it's hiding
inherited member , that's why i tried with "new" also , but as i found it is
not calling this function/delegate [C#] at all and before that getting issue of
errors .
I have created a pdf with only "This is a Test" from MS-Word and getting same
issue same kind of error , if pdf has lot's of text it's get lot's error ,
that's indicate it's getting null reference exception on word/char, might be
one when it is fetching word or char , it is getting null, in other word it can
not ready though recognize [as error are coming], i found due to font some one
get this kind of issue , not sue what is my case.
thanks and regards
Tanmay Mandal
> Getting Issue at text reading
> ------------------------------
>
> Key: PDFBOX-1507
> URL: https://issues.apache.org/jira/browse/PDFBOX-1507
> Project: PDFBox
> Issue Type: Bug
> Components: Parsing
> Affects Versions: 1.7.1
> Environment: windows, runing pdfbox in .Net using ikvm-7.2.4630.5
> conversion , we are actually converting pdf into ALTO file
> Reporter: Tanmay Mandal
> Original Estimate: 1h
> Remaining Estimate: 1h
>
> <?xml version="1.0" encoding="UTF-8"?><alto
> xmlns="http://www.loc.gov/standards/
> alto/alto-v2.0.xsd"><Description><MeasurementUnit>inch1200</MeasurementUnit></De
> scription><Layout>
> <Page>
> <PrintSpace>
> <TextBlock>
> <TextLine>
> Feb 04, 2013 8:40:03 PM org.apache.pdfbox.util.PDFStreamEngine processOperator
> WARNING: java.lang.NullPointerException
> java.lang.NullPointerException
> at
> org.apache.pdfbox.util.PDFTextStripper.processTextPosition(PDFTextStr
> ipper.java:954)
> at
> org.apache.pdfbox.util.PDFStreamEngine.processEncodedText(PDFStreamEn
> gine.java:498)
> at
> org.apache.pdfbox.util.operator.ShowTextGlyph.process(ShowTextGlyph.j
> ava:62)
> at
> org.apache.pdfbox.util.PDFStreamEngine.processOperator(PDFStreamEngin
> e.java:556)
> at
> org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngi
> ne.java:271)
> at
> org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngi
> ne.java:237)
> at
> org.apache.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.
> java:218)
> at
> cli.org.apache.pdfbox.examples.util.PrintWordLocations.processDocumen
> ts(PrintWordLocation.cs:185)
> at
> cli.org.apache.pdfbox.examples.util.PrintWordLocations.Main(PrintWord
> Location.cs:228)
> at cli.System.AppDomain._nExecuteAssembly(Unknown Source)
> at cli.System.AppDomain.ExecuteAssembly(Unknown Source)
> at
> cli.Microsoft.VisualStudio.HostingProcess.HostProc.RunUsersAssembly(U
> nknown Source)
> Feb 04, 2013 8:40:03 PM org.apache.pdfbox.util.PDFStreamEngine processOperator
> WARNING: java.lang.NullPointerException
> java.lang.NullPointerException
> at
> org.apache.pdfbox.util.PDFTextStripper.processTextPosition(PDFTextStr
> ipper.java:954)
> at
> org.apache.pdfbox.util.PDFStreamEngine.processEncodedText(PDFStreamEn
> gine.java:498)
> at
> org.apache.pdfbox.util.operator.ShowTextGlyph.process(ShowTextGlyph.j
> ava:62)
> at
> org.apache.pdfbox.util.PDFStreamEngine.processOperator(PDFStreamEngin
> e.java:556)
> at
> org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngi
> ne.java:271)
> at
> org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngi
> ne.java:237)
> at
> org.apache.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.
> java:218)
> at
> cli.org.apache.pdfbox.examples.util.PrintWordLocations.processDocumen
> ts(PrintWordLocation.cs:185)
> at
> cli.org.apache.pdfbox.examples.util.PrintWordLocations.Main(PrintWord
> Location.cs:228)
> at cli.System.AppDomain._nExecuteAssembly(Unknown Source)
> at cli.System.AppDomain.ExecuteAssembly(Unknown Source)
> at
> cli.Microsoft.VisualStudio.HostingProcess.HostProc.RunUsersAssembly(U
> nknown Source)
> Feb 04, 2013 8:40:03 PM org.apache.pdfbox.util.PDFStreamEngine processOperator
> WARNING: java.lang.NullPointerException
> java.lang.NullPointerException
> at
> org.apache.pdfbox.util.PDFTextStripper.processTextPosition(PDFTextStr
> ipper.java:954)
> at
> org.apache.pdfbox.util.PDFStreamEngine.processEncodedText(PDFStreamEn
> gine.java:498)
> at
> org.apache.pdfbox.util.operator.ShowTextGlyph.process(ShowTextGlyph.j
> ava:62)
> at
> org.apache.pdfbox.util.PDFStreamEngine.processOperator(PDFStreamEngin
> e.java:556)
> at
> org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngi
> ne.java:271)
> at
> org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngi
> ne.java:237)
> at
> org.apache.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.
> java:218)
> at
> cli.org.apache.pdfbox.examples.util.PrintWordLocations.processDocumen
> ts(PrintWordLocation.cs:185)
> at
> cli.org.apache.pdfbox.examples.util.PrintWordLocations.Main(PrintWord
> Location.cs:228)
> at cli.System.AppDomain._nExecuteAssembly(Unknown Source)
> at cli.System.AppDomain.ExecuteAssembly(Unknown Source)
> at
> cli.Microsoft.VisualStudio.HostingProcess.HostProc.RunUsersAssembly(U
> nknown Source)
> Feb 04, 2013 8:40:03 PM org.apache.pdfbox.util.PDFStreamEngine processOperator
> WARNING: java.lang.NullPointerException
> java.lang.NullPointerException
> at
> org.apache.pdfbox.util.PDFTextStripper.processTextPosition(PDFTextStr
> ipper.java:954)
> at
> org.apache.pdfbox.util.PDFStreamEngine.processEncodedText(PDFStreamEn
> gine.java:498)
> at
> org.apache.pdfbox.util.operator.ShowTextGlyph.process(ShowTextGlyph.j
> ava:62)
> at
> org.apache.pdfbox.util.PDFStreamEngine.processOperator(PDFStreamEngin
> e.java:556)
> at
> org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngi
> ne.java:271)
> at
> org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngi
> ne.java:237)
> at
> org.apache.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.
> java:218)
> at
> cli.org.apache.pdfbox.examples.util.PrintWordLocations.processDocumen
> ts(PrintWordLocation.cs:185)
> at
> cli.org.apache.pdfbox.examples.util.PrintWordLocations.Main(PrintWord
> Location.cs:228)
> at cli.System.AppDomain._nExecuteAssembly(Unknown Source)
> at cli.System.AppDomain.ExecuteAssembly(Unknown Source)
> at
> cli.Microsoft.VisualStudio.HostingProcess.HostProc.RunUsersAssembly(U
> nknown Source)
> </TextLine>
> </TextBlock>
> </PrintSpace>
> </Page>
> We have converted Java code in C# from https://github.com/cokernel/pdf2alto
--
This message is automatically generated by JIRA.
If you think it was sent incorrectly, please contact your JIRA administrators
For more information on JIRA, see: http://www.atlassian.com/software/jira