[
https://issues.apache.org/jira/browse/PDFBOX-465?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
[email protected] updated PDFBOX-465:
-----------------------------------------
Attachment: SimpleDateParsingTest.java
The POTENTIAL_FORMATS array can have a few date formats added to resolve
2 reported issues and 1 issue that we hit but didn't report because it was
already covered by other bug reports.
I tested these date formats (the test program is below). Can you add them to
the Tika code?
Please update : org.apache.pdfbox.util.DateConverter.java to include the
additional format.
Thanks,
Peter Lenahan
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.Locale;
/**
 * Stand-alone check of the additional date patterns proposed for
 * org.apache.pdfbox.util.DateConverter (PDFBOX-465).
 *
 * <p>{@link #toCalendar(String)} tries each candidate pattern in order and
 * returns the first successful parse; {@link #main(String[])} runs it over a
 * handful of sample date strings and prints the results.
 */
public class SimpleDateParsingTest {

    /**
     * Candidate patterns, tried in order; the first one that parses wins.
     *
     * <p>Patterns are kept as strings and a fresh {@link SimpleDateFormat} is
     * created per call because {@code SimpleDateFormat} instances are not
     * thread-safe and must not be shared through a static field.
     *
     * <p>Notes on the pattern letters:
     * <ul>
     * <li>{@code MM} is month; {@code mm} is minute. The date-only patterns
     *     must use {@code yyyyMMdd}, otherwise "20090401" is read as
     *     January 1st, 00:04.</li>
     * <li>{@code hh} is the 1-12 clock and is only used together with the
     *     AM/PM marker {@code a}; patterns without a marker use the 0-23
     *     clock {@code HH}, otherwise "12:25" is read as 00:25.</li>
     * </ul>
     */
    private static final String[] POTENTIAL_FORMATS = {
        "EEEE, dd MMM yyyy hh:mm:ss a",
        "EEEE, MMM dd, yyyy hh:mm:ss a",
        "MM/dd/yyyy HH:mm:ss",
        "MM/dd/yyyy",
        "yyyyMMdd HH:mm:ss Z",
        "yyyyMMdd HH:mm:ss",
        // Date without a time, optionally followed by a zone offset, e.g.
        // 20090401, 20090401+0200 or 20090401+02'00'.
        // DateFormat.parse(String) does not require the whole input to be
        // consumed, so this single pattern also accepts every trailing
        // "+hh'mm'" / "-hh'mm'" suffix; separate quoted-literal patterns per
        // offset are therefore unnecessary.
        // NOTE: the trailing offset is ignored, not applied to the result.
        "yyyyMMdd",
        // The format described in the bug report may also occur:
        // http://www.mail-archive.com/[email protected]/msg00531.html
        "dd MMM yyyy HH:mm:ss", // for 26 May 2000 11:25:00
        "dd MMM yyyy HH:mm",    // for 26 May 2000 11:25
        // Friday, July 11, 2008 https://issues.apache.org/jira/browse/PDFBOX-465
        "EEEE, MMMM dd, yyyy"
    };

    public SimpleDateParsingTest() {
    }

    /**
     * Runs the sample date strings through {@link #toCalendar(String)} and
     * prints either the parsed year or the whole calendar.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            printYear("20090401+0200");

            Calendar cal = toCalendar("Friday, January 11, 2008");
            System.out.println(cal.get(Calendar.YEAR));
            System.out.println(cal);

            printYear("20090401+0200");
            printYear("20090401+02'00'");
            printYear("20090401+04'00'");
            printYear("20090401+09'00'");
            printYear("20090401-02'00'");
            printYear("20090401 01:01:01 -0500"); // yyyyMMdd HH:mm:ss Z
            printYear("20090401");

            System.out.println(toCalendar("20090401+0200"));
            System.out.println(toCalendar("20090401+02'00'"));
            System.out.println(toCalendar("26 May 2000 11:25:10"));
            System.out.println(toCalendar("26 May 2000 11:25"));
            System.out.println("Finished");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses {@code date} and prints the resulting year.
     *
     * @param date the date string to parse
     * @throws IOException if no known pattern matches
     */
    private static void printYear(String date) throws IOException {
        System.out.println(toCalendar(date).get(Calendar.YEAR));
    }

    /**
     * Converts a date string to a calendar by trying each entry of
     * {@link #POTENTIAL_FORMATS} in order.
     *
     * <p>Day and month names are parsed with {@link Locale#ENGLISH} so the
     * result does not depend on the JVM's default locale. The returned
     * calendar uses the JVM's default time zone.
     *
     * @param date the date string to parse
     * @return a calendar set to the parsed date; never {@code null}
     * @throws IOException if {@code date} is {@code null} or matches none of
     *         the known patterns
     */
    public static Calendar toCalendar(String date) throws IOException {
        if (date != null) {
            for (String pattern : POTENTIAL_FORMATS) {
                try {
                    Date parsed = new SimpleDateFormat(pattern, Locale.ENGLISH).parse(date);
                    Calendar result = new GregorianCalendar();
                    result.setTime(parsed);
                    return result;
                } catch (ParseException ignored) {
                    // This pattern does not match; try the next one.
                }
            }
        }
        // No pattern matched (or the input was null).
        throw new IOException("Error converting date:" + date);
    }
}
> invalid date formats
> ---------------------
>
> Key: PDFBOX-465
> URL: https://issues.apache.org/jira/browse/PDFBOX-465
> Project: PDFBox
> Issue Type: Bug
> Components: Parsing
> Affects Versions: 0.8.0-incubator
> Reporter: Sean Bridges
> Attachments: SimpleDateParsingTest.java
>
>
> This is with the latest from svn, Revision: 773978
> From a sample of 13304 pdf documents generated in a very wide variety of
> ways, I got 94 invalid date formats,
> It seems that all of these have the stack trace of,
> Caused by: java.io.IOException: Error converting date:Friday, July 11, 2008
> at
> org.apache.pdfbox.util.DateConverter.toCalendar(DateConverter.java:240)
> at
> org.apache.pdfbox.util.DateConverter.toCalendar(DateConverter.java:120)
> at org.apache.pdfbox.cos.COSDictionary.getDate(COSDictionary.java:783)
> at
> org.apache.pdfbox.pdmodel.PDDocumentInformation.getCreationDate(PDDocumentInformation.java:218)
> at
> message_analyzer.extractor.PDFExtractor.getContent(PDFExtractor.java:50)
> Some examples of invalid dates are,
> 20070430193647+713'00'
> Tue Aug 21 10:35:22 2007
> Tuesday, November 04, 2008
> 200712172:2:3
> Unknown
> 20090319 200122
> 9:47 5/12/2008
> I don't think there is any hope of parsing all these date formats. It would
> be nice if this was not a fatal error, and the parser could continue without
> a creation date.
> Is the policy of pdfbox to be as forgiving as possible when reading pdf
> documents? Maybe toCalendar should return a new Calendar() if parsing fails,
> rather than throwing.
--
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.