Repository: tika Updated Branches: refs/heads/master cb492f4b1 -> 608fbf580
TIKA-1985 -- add charset handling to field names; add datetime processing; rework calculation of number of columns to handle extra zero-padding at end of header. Waiting on permission for test file. Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/608fbf58 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/608fbf58 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/608fbf58 Branch: refs/heads/master Commit: 608fbf580addfe106f7ac392a067be23c455f07b Parents: cb492f4 Author: tballison <[email protected]> Authored: Wed May 25 15:34:33 2016 -0400 Committer: tballison <[email protected]> Committed: Wed May 25 15:34:33 2016 -0400 ---------------------------------------------------------------------- .../org/apache/tika/parser/dbf/DBFCell.java | 31 ++++++++++++++++ .../apache/tika/parser/dbf/DBFColumnHeader.java | 16 ++++++-- .../apache/tika/parser/dbf/DBFFileHeader.java | 39 ++++++++++++-------- .../org/apache/tika/parser/dbf/DBFParser.java | 11 +++++- .../apache/tika/parser/dbf/DBFParserTest.java | 18 +++++++++ 5 files changed, 95 insertions(+), 20 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/608fbf58/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFCell.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFCell.java b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFCell.java index 2a7dc26..785cd06 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFCell.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFCell.java @@ -17,13 +17,20 @@ package org.apache.tika.parser.dbf; import org.apache.commons.io.IOUtils; +import org.apache.tika.io.EndianUtils; +import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; +import java.text.DateFormat; +import java.text.SimpleDateFormat; import java.util.Arrays; +import java.util.Calendar; +import java.util.GregorianCalendar; import java.util.Locale; +import java.util.TimeZone; class DBFCell { @@ -48,6 +55,8 @@ class DBFCell { return new String(getBytes(), StandardCharsets.US_ASCII).trim(); case L: return new String(getBytes(), StandardCharsets.US_ASCII).trim(); + case T: + return getFormattedDateTime(); default: //TODO: find examples of other cell types for testing return new String(getBytes(), StandardCharsets.US_ASCII).trim(); @@ -113,4 +122,26 @@ class DBFCell { return String.format(Locale.ROOT, "%s/%s/%s", month, day, year); } + + public String getFormattedDateTime() { + //sometimes 12/31/1899 instead of 01/01/4713 BC. + //http://stackoverflow.com/questions/20026154/convert-dbase-timestamp + //TODO: add heuristic for deciding; + //TODO: find example of file with time != 0 + Calendar baseCalendar = GregorianCalendar.getInstance(TimeZone.getTimeZone("UTC"), Locale.ROOT); +// baseCalendar.set(1899, 11, 31, 0, 0, 0); + baseCalendar.set(-4712, 0, 1, 0, 0, 0); + byte[] bytes = getBytes(); + try (InputStream is = new ByteArrayInputStream(getBytes())) { + + int date = EndianUtils.readIntLE(is); + int time = EndianUtils.readIntLE(is); + baseCalendar.add(Calendar.DATE, date); + DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.ROOT); + return df.format(baseCalendar.getTime()); + } catch (IOException|EndianUtils.BufferUnderrunException e) { + e.printStackTrace(); + } + return ""; + } } http://git-wip-us.apache.org/repos/asf/tika/blob/608fbf58/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFColumnHeader.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFColumnHeader.java b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFColumnHeader.java index ff6353a..7c3e52e 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFColumnHeader.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFColumnHeader.java @@ -16,11 +16,14 @@ */ package org.apache.tika.parser.dbf; +import java.nio.charset.Charset; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import static org.apache.tika.parser.dbf.DBFColumnHeader.ColType.AT; +import static org.apache.tika.parser.dbf.DBFColumnHeader.ColType.NULL; import static org.apache.tika.parser.dbf.DBFColumnHeader.ColType.PLUS; +import static org.json.zip.JSONzip.end; class DBFColumnHeader { @@ -41,6 +44,7 @@ class DBFColumnHeader { PLUS, //autoincrement AT, //timestamp dbase level 7 O, //double + NULL //null } private final static Map<Integer, ColType> COL_TYPE_MAP = new ConcurrentHashMap<>(); @@ -48,16 +52,18 @@ class DBFColumnHeader { static { for (ColType type : ColType.values()) { if (type.equals(PLUS)) { - COL_TYPE_MAP.put((int)'+', PLUS); + COL_TYPE_MAP.put((int) '+', PLUS); } else if (type.equals(AT)) { - COL_TYPE_MAP.put((int)'@', AT); + COL_TYPE_MAP.put((int) '@', AT); + } else if (type.equals(NULL)) { + COL_TYPE_MAP.put((int)'0', NULL); } else { COL_TYPE_MAP.put((int) type.toString().charAt(0), type); } } } - String name; + byte[] name; private ColType colType = null; int fieldLength = -1; int decimalCount = -1; @@ -74,6 +80,10 @@ class DBFColumnHeader { return colType; } + String getName(Charset charset) { + return new String(name, charset).trim(); + } + @Override public String toString() { return "DBFColumnHeader{" + http://git-wip-us.apache.org/repos/asf/tika/blob/608fbf58/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFFileHeader.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFFileHeader.java b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFFileHeader.java index 8e376bc..bd89ccb 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFFileHeader.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFFileHeader.java @@ -65,12 +65,21 @@ class DBFFileHeader { header.numBytesInRecord = EndianUtils.readShortLE(is); IOUtils.skipFully(is, 20);//TODO: can get useful info out of here - int numCols = (header.numBytesInHeader - 32) / 32; - - header.cols = new DBFColumnHeader[numCols]; - for (int i = 0; i < numCols; i++) { - header.cols[i] = readCol(is); + int numCols = 0;//(header.numBytesInHeader - 32) / 32; + List<DBFColumnHeader> headers = new LinkedList<>(); + int bytesAccountedFor = 0; + while (true) { + DBFColumnHeader colHeader = readCol(is); + bytesAccountedFor += colHeader.fieldLength; + numCols++; + headers.add(colHeader); + if (bytesAccountedFor >= header.numBytesInRecord-1) { + break; + } } + + header.cols = headers.toArray(new DBFColumnHeader[headers.size()]); + int endOfHeader = is.read(); if (endOfHeader != 13) { throw new TikaException("Expected new line at end of header"); @@ -83,26 +92,26 @@ class DBFFileHeader { } private static DBFColumnHeader readCol(InputStream is) throws IOException, TikaException { - byte[] headerName = new byte[11]; - IOUtils.readFully(is, headerName); + byte[] fieldRecord = new byte[32]; + IOUtils.readFully(is, fieldRecord); + DBFColumnHeader col = new DBFColumnHeader(); - headerName = DBFReader.trim(headerName); - col.name = new String(headerName, StandardCharsets.US_ASCII); - int colType = is.read(); + col.name = new byte[11]; + System.arraycopy(fieldRecord, 0, col.name, 0, 10); + + int colType = fieldRecord[11] & 0xFF; if (colType < 0) { throw new IOException("File truncated before coltype in header"); } col.setType(colType); - IOUtils.skipFully(is, 4);//field data address - col.fieldLength = is.read(); + col.fieldLength = fieldRecord[16] & 0xFF; if (col.fieldLength < 0) { - throw new TikaException("Field length for column "+headerName+"is < 0"); + throw new TikaException("Field length for column "+col.getName(StandardCharsets.US_ASCII)+" is < 0"); } else if (col.fieldLength > DBFReader.MAX_FIELD_LENGTH) { throw new TikaException("Field length ("+col.fieldLength+") is greater than DBReader.MAX_FIELD_LENGTH ("+ DBFReader.MAX_FIELD_LENGTH+")"); } - col.decimalCount = is.read(); - IOUtils.skipFully(is, 14); //TODO: might have useful info in some versions + col.decimalCount = fieldRecord[17] & 0xFF; return col; } http://git-wip-us.apache.org/repos/asf/tika/blob/608fbf58/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFParser.java index 35f3b12..6b77d3d 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFParser.java @@ -16,7 +16,10 @@ */ package org.apache.tika.parser.dbf; +import org.apache.fontbox.encoding.Encoding; import org.apache.tika.detect.AutoDetectReader; +import org.apache.tika.detect.Detector; +import org.apache.tika.detect.EncodingDetector; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -24,10 +27,12 @@ import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.txt.Icu4jEncodingDetector; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; +import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; @@ -91,7 +96,7 @@ public class DBFParser extends AbstractParser { xhtml.startElement("thead"); for (DBFColumnHeader col : header.getCols()) { xhtml.startElement("th"); - xhtml.characters(col.name); + xhtml.characters(col.getName(charset)); xhtml.endElement("th"); } xhtml.endElement("thead"); @@ -132,7 +137,9 @@ public class DBFParser extends AbstractParser { } byte[] bytes = bos.toByteArray(); if (bytes.length > 20) { - charset = new AutoDetectReader(TikaInputStream.get(bytes)).getCharset(); + EncodingDetector detector = new Icu4jEncodingDetector(); + detector.detect(TikaInputStream.get(bytes), new Metadata()); + charset = detector.detect(new ByteArrayInputStream(bytes), new Metadata()); } return charset; } http://git-wip-us.apache.org/repos/asf/tika/blob/608fbf58/tika-parsers/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java index a531c55..9d15e44 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java @@ -22,12 +22,17 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; import org.junit.Test; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; @@ -136,6 +141,19 @@ public class DBFParserTest extends TikaTest { } } +/* +commented out until we get permission to add the test file + @Test + public void testEncodingInHeaderAndDateTime() throws Exception { + XMLResult r = getXML("prem2007_2.dbf"); + String xml = r.xml.replaceAll("[\\r\\n\\t]", " "); + assertEquals("application/x-dbf; dbf_version=Visual_FoxPro", r.metadata.get(Metadata.CONTENT_TYPE)); + assertContains("<th>èãæ¤é¢</th>", xml);//header + assertContains("<td>é½ è¤</td>", xml);//content + assertContains("<td>2010-04-20T00:00:00Z</td>", xml); + } + */ + InputStream truncate(String testFileName, int length) throws IOException { byte[] bytes = new byte[length]; try (InputStream is = getResourceAsStream("/test-documents/"+testFileName)) {
