Repository: tika Updated Branches: refs/heads/master fc4f13dde -> cb492f4b1
TIKA-1513 -- add mime detection and parsing for dbf files. Thanks to Nick C for the mime definition and Luis Filipe Nassif for collaboration. Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/e74f6637 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/e74f6637 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/e74f6637 Branch: refs/heads/master Commit: e74f66375f20d914f8585597b6d9492586a0caa9 Parents: bb46c0e Author: tballison <[email protected]> Authored: Wed May 25 12:29:00 2016 -0400 Committer: tballison <[email protected]> Committed: Wed May 25 12:29:00 2016 -0400 ---------------------------------------------------------------------- .../org/apache/tika/mime/tika-mimetypes.xml | 9 + .../org/apache/tika/parser/dbf/DBFCell.java | 116 ++++++++++++ .../apache/tika/parser/dbf/DBFColumnHeader.java | 86 +++++++++ .../apache/tika/parser/dbf/DBFFileHeader.java | 135 +++++++++++++ .../org/apache/tika/parser/dbf/DBFParser.java | 150 +++++++++++++++ .../org/apache/tika/parser/dbf/DBFReader.java | 189 +++++++++++++++++++ .../java/org/apache/tika/parser/dbf/DBFRow.java | 62 ++++++ .../services/org.apache.tika.parser.Parser | 3 +- .../apache/tika/parser/dbf/DBFParserTest.java | 146 ++++++++++++++ .../test/resources/test-documents/testDBF.dbf | Bin 0 -> 890 bytes .../test-documents/testDBF_gb18030.dbf | Bin 0 -> 144 bytes 11 files changed, 895 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/e74f6637/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml ---------------------------------------------------------------------- diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index ca9828c..8a79844 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ 
b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -115,6 +115,15 @@ <mime-type type="application/davmount+xml"> <glob pattern="*.davmount"/> </mime-type> + <mime-type type="application/x-dbf"> + <magic priority="100"> + <match value="(?s)^[\\x02\\x03\\x30\\x31\\x32\\x43\\x63\\x83\\x8B\\xCB\\xF5\\xE5\\xFB].[\\x01-\\x0C][\\x01-\\x1F].{4}(?:.[^\\x00]|[\\x41-\\xFF].)(?:[^\\x00\\x01].|.[^\\x00]).{31}(?<=[\\x00][^\\x00]{0,10})[A-Z@+]" type="regex" offset="0"/> + </magic> + <glob pattern="*.dbf"/> + <glob pattern="*.dbase"/> + <glob pattern="*.dbase3"/> + </mime-type> + <mime-type type="application/dca-rft"/> <mime-type type="application/dec-dx"/> <mime-type type="application/dialog-info+xml"/> http://git-wip-us.apache.org/repos/asf/tika/blob/e74f6637/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFCell.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFCell.java b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFCell.java new file mode 100644 index 0000000..2a7dc26 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFCell.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.dbf; + +import org.apache.commons.io.IOUtils; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Locale; + +class DBFCell { + + private final DBFColumnHeader.ColType colType; + private final byte[] bytes; + private final int decimalCount; + int bytesReadLast = 0; + + DBFCell(DBFColumnHeader.ColType colType, int fieldLength, int decimalCount) { + this.colType = colType; + this.decimalCount = decimalCount; + this.bytes = new byte[fieldLength]; + } + + String getString(Charset charset) { + switch (colType) { + case C: + return new String(getBytes(), charset).trim(); + case D: + return getFormattedDate(); + case N: + return new String(getBytes(), StandardCharsets.US_ASCII).trim(); + case L: + return new String(getBytes(), StandardCharsets.US_ASCII).trim(); + default: + //TODO: find examples of other cell types for testing + return new String(getBytes(), StandardCharsets.US_ASCII).trim(); + } + } + + //returns whether any content was read + boolean read(InputStream is) throws IOException { + bytesReadLast = IOUtils.read(is, bytes); + if (DBFReader.STRICT && bytesReadLast != bytes.length) { + throw new IOException("Truncated record, only read "+bytesReadLast+ + " bytes, but should have read: "+bytes.length); + } + return bytesReadLast > 0; + } + + /** + * + * @return copy of bytes that were read on the last read + */ + byte[] getBytes() { + byte[] ret = new byte[bytesReadLast]; + System.arraycopy(bytes, 0, ret, 0, bytesReadLast); + return ret; + } + + DBFColumnHeader.ColType getColType() { + return colType; + } + + @Override + public String toString() { + return "DBFCell{" + + "colType=" + colType + + ", bytes=" + Arrays.toString(bytes) + + ", decimalCount=" + decimalCount + + '}'; + } + + DBFCell 
deepCopy() { + DBFCell cell = new DBFCell(colType, bytes.length, decimalCount); + cell.bytesReadLast = this.bytesReadLast; + System.arraycopy(this.bytes, 0, cell.bytes, 0, bytesReadLast); + return cell; + } + + private String getFormattedDate() { + byte[] dateBytes = getBytes(); + if (dateBytes.length < 8) { + return ""; + } + String year = new String(dateBytes, 0, 4, StandardCharsets.US_ASCII); + String month = new String(dateBytes, 4, 2, StandardCharsets.US_ASCII); + String day = new String(dateBytes, 6, 2, StandardCharsets.US_ASCII); + //test to see that these values make any sense + for (String s : new String[]{year, month, day}) { + try { + Integer.parseInt(s); + } catch (NumberFormatException e) { + return ""; + } + } + return String.format(Locale.ROOT, + "%s/%s/%s", month, day, year); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/e74f6637/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFColumnHeader.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFColumnHeader.java b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFColumnHeader.java new file mode 100644 index 0000000..ff6353a --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFColumnHeader.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.dbf; + +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +import static org.apache.tika.parser.dbf.DBFColumnHeader.ColType.AT; +import static org.apache.tika.parser.dbf.DBFColumnHeader.ColType.PLUS; + +class DBFColumnHeader { + + //from: http://www.dbf2002.com/dbf-file-format.html + enum ColType { + C,//character + Y,//currency + D,//date + T,//datetime + B,//double + I,//integer, + G,//general + P,//picture + F,//floating point binary numeric + L,//logical + M,//memo + N,//binary coded decimal numeric + PLUS, //autoincrement + AT, //timestamp dbase level 7 + O, //double + } + private final static Map<Integer, ColType> COL_TYPE_MAP = + new ConcurrentHashMap<>(); + + static { + for (ColType type : ColType.values()) { + if (type.equals(PLUS)) { + COL_TYPE_MAP.put((int)'+', PLUS); + } else if (type.equals(AT)) { + COL_TYPE_MAP.put((int)'@', AT); + } else { + COL_TYPE_MAP.put((int) type.toString().charAt(0), type); + } + } + } + + String name; + private ColType colType = null; + int fieldLength = -1; + int decimalCount = -1; + + public void setType(int type) { + colType = COL_TYPE_MAP.get(type); + if (colType == null) { + throw new IllegalArgumentException("Unrecognized column type for column: " + name + + ". 
I regret I don't recognize: " + (char) type); + } + } + + ColType getColType() { + return colType; + } + + @Override + public String toString() { + return "DBFColumnHeader{" + + "name='" + name + '\'' + + ", colType=" + colType + + ", fieldLength=" + fieldLength + + ", decimalCount=" + decimalCount + + '}'; + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/e74f6637/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFFileHeader.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFFileHeader.java b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFFileHeader.java new file mode 100644 index 0000000..8e376bc --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFFileHeader.java @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.dbf; + +import org.apache.commons.io.IOUtils; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.EndianUtils; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.util.*; + +class DBFFileHeader { + + private DBFReader.Version version; + private Calendar lastModified; + private int numRecords = -1; + private short numBytesInHeader; + private short numBytesInRecord; + private DBFColumnHeader[] cols; + + public static DBFFileHeader parse(InputStream is) throws IOException, TikaException { + DBFFileHeader header = new DBFFileHeader(); + + int firstByte = is.read(); + header.version = DBFReader.getVersion(firstByte); + if (header.version == null) { + throw new TikaException("Unrecognized first byte in DBFFile: " + firstByte); + } + int lastModYear = is.read(); + int lastModMonth = is.read(); + int lastModDay = is.read(); + Calendar now = GregorianCalendar.getInstance( + TimeZone.getTimeZone("UTC"), Locale.ROOT); + + //if this was last modified after the current year, assume + //the file was created in 1900 + if (lastModYear + 2000 > now.get(Calendar.YEAR)) { + lastModYear += 1900; + } else { + lastModYear += 2000; + } + Calendar lastModified = new GregorianCalendar( + TimeZone.getTimeZone("UTC"), Locale.ROOT); + lastModified.set(lastModYear, lastModMonth - 1, lastModDay,0,0,0); + header.lastModified = lastModified; + + header.numRecords = EndianUtils.readIntLE(is); + header.numBytesInHeader = EndianUtils.readShortLE(is); + header.numBytesInRecord = EndianUtils.readShortLE(is); + IOUtils.skipFully(is, 20);//TODO: can get useful info out of here + + int numCols = (header.numBytesInHeader - 32) / 32; + + header.cols = new DBFColumnHeader[numCols]; + for (int i = 0; i < numCols; i++) { + header.cols[i] = readCol(is); + } + int endOfHeader = is.read(); + if (endOfHeader != 13) { + throw new TikaException("Expected new line at end of header"); + } + 
long totalReadSoFar = 32 + (numCols * 32) + 1; + //there can be extra bytes in the header + long extraHeaderBytes = header.numBytesInHeader - totalReadSoFar; + IOUtils.skipFully(is, extraHeaderBytes); + return header; + } + + private static DBFColumnHeader readCol(InputStream is) throws IOException, TikaException { + byte[] headerName = new byte[11]; + IOUtils.readFully(is, headerName); + DBFColumnHeader col = new DBFColumnHeader(); + headerName = DBFReader.trim(headerName); + col.name = new String(headerName, StandardCharsets.US_ASCII); + int colType = is.read(); + if (colType < 0) { + throw new IOException("File truncated before coltype in header"); + } + col.setType(colType); + IOUtils.skipFully(is, 4);//field data address + col.fieldLength = is.read(); + if (col.fieldLength < 0) { + throw new TikaException("Field length for column "+headerName+"is < 0"); + } else if (col.fieldLength > DBFReader.MAX_FIELD_LENGTH) { + throw new TikaException("Field length ("+col.fieldLength+") is greater than DBReader.MAX_FIELD_LENGTH ("+ + DBFReader.MAX_FIELD_LENGTH+")"); + } + col.decimalCount = is.read(); + IOUtils.skipFully(is, 14); //TODO: might have useful info in some versions + return col; + } + + DBFColumnHeader[] getCols() { + return cols; + } + + int getNumRecords() { + return numRecords; + } + + Calendar getLastModified() { + return lastModified; + } + + DBFReader.Version getVersion() { + return version; + } + + @Override + public String toString() { + return "DBFFileHeader{" + + "lastModified=" + lastModified + + ", numRecords=" + numRecords + + ", numBytesInHeader=" + numBytesInHeader + + ", numBytesInRecord=" + numBytesInRecord + + ", cols=" + Arrays.toString(cols) + + '}'; + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/e74f6637/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFParser.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFParser.java new file mode 100644 index 0000000..35f3b12 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFParser.java @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.dbf; + +import org.apache.tika.detect.AutoDetectReader; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.*; + +/** + * This is a Tika wrapper around the DBFReader. + * <p> + * This reads many dbase3 file variants (not DBASE 7, yet!). + * <p> + * It caches the first 10 rows and then runs encoding detection + * on the "character" cells. 
+ */ +public class DBFParser extends AbstractParser { + + public static final String DBF_VERSION_MIME_ATTRIBUTE = "dbf_version"; + private static final int ROWS_TO_BUFFER_FOR_CHARSET_DETECTION = 10; + private static final int MAX_CHARS_FOR_CHARSET_DETECTION = 20000; + private static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1; + + private static final Set<MediaType> SUPPORTED_TYPES = + Collections.singleton(MediaType.application("x-dbf")); + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + @Override + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, + ParseContext context) throws IOException, SAXException, TikaException { + DBFReader reader = DBFReader.open(stream); + DBFFileHeader header = reader.getHeader(); + + metadata.set(Metadata.CONTENT_TYPE, "application/x-dbf; "+ + DBF_VERSION_MIME_ATTRIBUTE+"="+header.getVersion().getName()); + + //insert metadata here + Calendar lastModified = header.getLastModified(); + if (lastModified != null) { + metadata.set(TikaCoreProperties.MODIFIED, lastModified); + } + + //buffer first X rows for charset detection + List<DBFRow> firstRows = new LinkedList<>(); + DBFRow row = reader.next(); + int i = 0; + while(row != null && i++ < ROWS_TO_BUFFER_FOR_CHARSET_DETECTION) { + firstRows.add(row.deepCopy()); + row = reader.next(); + } + + Charset charset = getCharset(firstRows, header); + metadata.set(Metadata.CONTENT_ENCODING, charset.toString()); + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + xhtml.startElement("table"); + xhtml.startElement("thead"); + for (DBFColumnHeader col : header.getCols()) { + xhtml.startElement("th"); + xhtml.characters(col.name); + xhtml.endElement("th"); + } + xhtml.endElement("thead"); + + xhtml.startElement("tbody"); + + //now write cached rows + while (firstRows.size() > 0) { + DBFRow cachedRow = firstRows.remove(0); + 
writeRow(cachedRow, charset, xhtml); + } + + //now continue with rest + while (row != null) { + writeRow(row, charset, xhtml); + row = reader.next(); + } + xhtml.endElement("tbody"); + xhtml.endElement("table"); + xhtml.endDocument(); + } + + private Charset getCharset(List<DBFRow> firstRows, DBFFileHeader header) throws IOException, + TikaException { + //TODO: potentially use codepage info in the header + Charset charset = DEFAULT_CHARSET; + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + for (DBFRow row : firstRows) { + for (DBFCell cell : row.cells) { + if (cell.getColType().equals(DBFColumnHeader.ColType.C)) { + byte[] bytes = cell.getBytes(); + bos.write(bytes); + if (bos.size() > MAX_CHARS_FOR_CHARSET_DETECTION) { + break; + } + } + } + } + byte[] bytes = bos.toByteArray(); + if (bytes.length > 20) { + charset = new AutoDetectReader(TikaInputStream.get(bytes)).getCharset(); + } + return charset; + } + + private void writeRow(DBFRow row, Charset charset, XHTMLContentHandler xhtml) throws SAXException { + xhtml.startElement("tr"); + for (DBFCell cell : row.cells) { + xhtml.startElement("td"); + xhtml.characters(cell.getString(charset)); + xhtml.endElement("td"); + } + xhtml.endElement("tr"); + + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/e74f6637/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFReader.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFReader.java b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFReader.java new file mode 100644 index 0000000..961244a --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFReader.java @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.dbf; + +import org.apache.tika.exception.TikaException; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +/** + * This reads many dbase3 file variants (not DBASE 7, yet!). + * This parses the header on open. The client + * should get a row and then iterate until next() returns null. + * Be careful to deepCopy the row (if caching) because the row + * is mutable and will change as the reader iterates over new rows. + * <p> + * This is based on: <a href="http://web.archive.org/web/20150323061445/http://ulisse.elettra.trieste.it/services/doc/dbase/DBFstruct.htm"> + * http://ulisse.elettra.trieste.it/services/doc/dbase/DBFstruct.htm</a> + * <p> + * This is designed to separate out Tika-specific code so that it can + * be copied/pasted as a standalone if desired. 
+ */ + +class DBFReader { + + public static final int MAX_FIELD_LENGTH = 66000; + public static boolean STRICT = false; + + + enum Version { + + FOXBASE(0x02, "FoxBASE"), + FOXBASE_PLUS(0x03, "FoxBASE_plus"), + VISUAL_FOXPRO(0x30, "Visual_FoxPro"), + VISUAL_FOXPRO_AUTOINCREMENT(0x31, "Visual_FoxPro_autoincrement"), + VISUAL_FOXPRO_VAR(0x32, "Visual_FoxPro_with_Varchar_or_Varbinary"), + DBASE_IV_SQL_TABLE(0x43, "dBASE_IV_SQL_table"), + DBASE_IV_SQL_SYSTEM(0x63, "dBASE_IV_SQL_system"), + FOX_BASE_PLUS_WITH_MEMO(0x83, "FoxBASE_plus_with_memo"), + DBASE_IV_WITH_MEMO(0x8B, "dBASE_IV_with_memo"), + DBASE_IV_SQL_TABLE_WITH_MEMO(0xCB, "dBASE_IV_SQL_table_with_memo"), + FOXPRO_2x_WITH_MEMO(0xF5, "FoxPro_2.x_with_memo"), + HIPER_SIZ_WITH_SMT_MEMO(0xE5, "HiPer-Siz_with_SMT_memo"), + FOXBASE2(0xFB, "FoxBASE"); + + private final int id; + private final String name; + + Version(int id, String name) { + this.id = id; + this.name = name; + } + + int getId() { + return id; + } + + String getName() { + return name; + } + }; + + private static final Map<Integer, Version> VERSION_MAP = new ConcurrentHashMap<>(); + static { + for (Version version : Version.values()) { + VERSION_MAP.put(version.id, version); + } + } + + static DBFReader open(InputStream is) throws IOException, TikaException { + return new DBFReader(is); + } + + //can return null! + static Version getVersion(int b) { + return VERSION_MAP.get(b); + } + + private final DBFFileHeader header; + private final InputStream is; + private DBFRow currRow = null; + private Charset charset = StandardCharsets.US_ASCII; + + private DBFReader(InputStream is) throws IOException, TikaException { + header = DBFFileHeader.parse(is); + this.is = is; + currRow = new DBFRow(header); + } + + + /** + * Iterate through the rows with this. + * <p> + * Be careful: the reader reuses the row! Make sure to call deep copy + * if you are buffering rows. 
+ * + * @return + * @throws IOException + * @throws TikaException + */ + DBFRow next() throws IOException, TikaException { + if (fillRow(currRow)) { + return currRow; + } + return null; + } + + //returns whether or not some content was read. + //it might not be complete! + private boolean fillRow(DBFRow row) throws IOException, TikaException { + if (row == null) { + return false; + } + DBFCell[] cells = row.cells; + int isDeletedByte = is.read(); + boolean isDeleted = false; + if (isDeletedByte == 32) { + //all ok + } else if (isDeletedByte == 42) {//asterisk + isDeleted = true; + } else if (isDeletedByte == 26) {//marker for end of dbf file + return false; + } else if (isDeletedByte == -1) {//truncated file + if (DBFReader.STRICT) { + throw new IOException("EOF reached too early"); + } + return false; + } else { + throw new TikaException("Expecting space or asterisk at beginning of record, not:" + isDeletedByte); + } + row.setDeleted(isDeleted); + + boolean readSomeContent = false; + for (int i = 0; i < cells.length; i++) { + if (cells[i].read(is)) { + readSomeContent = true; + } + } + return readSomeContent; + } + + public DBFFileHeader getHeader() { + return header; + } + + public Charset getCharset() { + return charset; + } + + /** + * removes trailing 0 from byte array + * + * @param bytes + * @return + */ + public static byte[] trim(byte[] bytes) { + int end = bytes.length - 1; + for (int i = end; i > -1; i--) { + if (bytes[i] != 0) { + end = i; + break; + } + } + if (end == bytes.length - 1) { + return bytes; + } + byte[] ret = new byte[end + 1]; + System.arraycopy(bytes, 0, ret, 0, end + 1); + return ret; + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/e74f6637/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFRow.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFRow.java b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFRow.java new file 
mode 100644 index 0000000..6a400d7 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFRow.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.dbf; + +import java.util.Arrays; + +class DBFRow { + + DBFCell[] cells; + private boolean isDeleted = false; + + DBFRow(DBFFileHeader header) { + cells = new DBFCell[header.getCols().length]; + for (int i = 0; i < cells.length; i++) { + DBFColumnHeader columnHeader = header.getCols()[i]; + cells[i] = new DBFCell(columnHeader.getColType(), + columnHeader.fieldLength, + columnHeader.decimalCount); + } + } + + private DBFRow() {} + + void setDeleted(boolean deleted) { + isDeleted = deleted; + } + + boolean isDeleted() { + return isDeleted; + } + + DBFRow deepCopy() { + DBFRow row = new DBFRow(); + row.isDeleted = this.isDeleted; + row.cells = new DBFCell[cells.length]; + for (int i = 0; i < cells.length; i++) { + row.cells[i] = cells[i].deepCopy(); + } + return row; + } + + @Override + public String toString() { + return "DBFRow{" + + "cells=" + Arrays.toString(cells) + + '}'; + } +} 
http://git-wip-us.apache.org/repos/asf/tika/blob/e74f6637/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser index e3238ef..acb0224 100644 --- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser +++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser @@ -70,4 +70,5 @@ org.apache.tika.parser.geoinfo.GeographicInformationParser org.apache.tika.parser.geo.topic.GeoParser org.apache.tika.parser.external.CompositeExternalParser org.apache.tika.parser.journal.JournalParser -org.apache.tika.parser.image.ICNSParser \ No newline at end of file +org.apache.tika.parser.image.ICNSParser +org.apache.tika.parser.dbf.DBFParser \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/e74f6637/tika-parsers/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java new file mode 100644 index 0000000..a531c55 --- /dev/null +++ b/tika-parsers/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.dbf; + +import org.apache.commons.io.IOUtils; +import org.apache.tika.TikaTest; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.Parser; +import org.junit.Test; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; + +import static junit.framework.TestCase.assertEquals; +import static org.junit.Assert.fail; + +public class DBFParserTest extends TikaTest { + + @Test + public void testBasic() throws Exception { + XMLResult r = getXML("testDBF.dbf"); + assertEquals("application/x-dbf; dbf_version=FoxBASE_plus", r.metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("2016-05-24T00:00:00Z", r.metadata.get(TikaCoreProperties.MODIFIED)); + assertEquals("UTF-8", r.metadata.get(Metadata.CONTENT_ENCODING)); + + String xml = r.xml.replaceAll("[\\t\\r\\n]", " "); + //header + assertContains("<thead> <th>TEXT_FIELD</th> <th>NUMERIC_FI</th> <th>DATE_FIELD</th></thead>", + xml); + //look for contents + assertContains("普林斯顿大学", xml); + assertContains("\u0627\u0645\u0639\u0629", xml); + assertContains("05/26/2016", xml); + assertContains("<td>4.0</td>", xml); + //make sure there is no problem around row 10 + //where we're buffering + assertContains("<td>8.0</td>", xml); + assertContains("<td>9.0</td>", xml); + 
assertContains("<td>10.0</td>", xml); + assertContains("<td>11.0</td>", xml); + assertContains("<td>licour</td>", xml); + } + + @Test + public void testGB18030Encoded() throws Exception { + XMLResult r = getXML("testDBF_gb18030.dbf"); + assertEquals("application/x-dbf; dbf_version=FoxBASE_plus", r.metadata.get(Metadata.CONTENT_TYPE)); + assertContains("虽然该", r.xml); + } + + @Test + public void testTruncated() throws Exception { + Parser p = new DBFParser(); + //should throw exception for truncation in header + for (int i = 1; i < 129; i++) { + try { + XMLResult r = getXML(truncate("testDBF.dbf", i), p, new Metadata()); + fail("Should have thrown exception for truncation in header: "+i); + } catch (IOException|TikaException e) { + //ok -- expected + } catch (Throwable e) { + fail("Should only throw IOExceptions or TikaExceptions"); + } + } + //default don't throw exception for truncation while reading body + for (int i = 129; i < 204; i++) { + try { + XMLResult r = getXML(truncate("testDBF.dbf", i), p, new Metadata()); + } catch (IOException|TikaException e) { + fail("Shouldn't have thrown exception for truncation while reading cells: "+i); + e.printStackTrace(); + } + } + try { + DBFReader.STRICT = true; + //if strict is true throw exception for truncation in body + for (int i = 129; i < 204; i++) { + try { + XMLResult r = getXML(truncate("testDBF.dbf", i), p, new Metadata()); + fail("Should have thrown exception for truncation while reading cells: " + i); + } catch (IOException | TikaException e) { + } + } + } finally { + //reset for other tests + DBFReader.STRICT = false; + } + } + + @Test + public void testSpecificTruncated() throws Exception { + XMLResult r = getXML(truncate("testDBF.dbf", 781), new AutoDetectParser(), new Metadata()); + String xml = r.xml.replaceAll("[\\t\\r\\n]", " "); + + //if you don't keep track of bytes read, you could get content from prev row + assertNotContained("holt red hath in every", xml); + assertNotContained("<td>holt</td>
<td>18.0</td>", xml); + //check that the last row ends with holt but is correctly formatted + assertContains("<td>holt</td> <td /> <td /></tr>", xml); + } + + @Test + public void testVariants() throws Exception { + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + try (InputStream is = getResourceAsStream("/test-documents/testDBF.dbf")) { + IOUtils.copy(is, bos); + } + byte[] bytes = bos.toByteArray(); + + for (DBFReader.Version version : DBFReader.Version.values()) { + //this cast happens to work because of the range of possible values + bytes[0] = (byte)version.getId(); + XMLResult r = getXML(TikaInputStream.get(bytes), new AutoDetectParser(), new Metadata()); + assertEquals("application/x-dbf; "+ + DBFParser.DBF_VERSION_MIME_ATTRIBUTE+"="+version.getName(), r.metadata.get(Metadata.CONTENT_TYPE)); + } + } + + InputStream truncate(String testFileName, int length) throws IOException { + byte[] bytes = new byte[length]; + try (InputStream is = getResourceAsStream("/test-documents/"+testFileName)) { + IOUtils.readFully(is, bytes); + } + return new ByteArrayInputStream(bytes); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/e74f6637/tika-parsers/src/test/resources/test-documents/testDBF.dbf ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-documents/testDBF.dbf b/tika-parsers/src/test/resources/test-documents/testDBF.dbf new file mode 100644 index 0000000..54b8aca Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testDBF.dbf differ http://git-wip-us.apache.org/repos/asf/tika/blob/e74f6637/tika-parsers/src/test/resources/test-documents/testDBF_gb18030.dbf ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-documents/testDBF_gb18030.dbf b/tika-parsers/src/test/resources/test-documents/testDBF_gb18030.dbf new file mode 100644 index 0000000..fdf20a5 Binary files /dev/null and 
b/tika-parsers/src/test/resources/test-documents/testDBF_gb18030.dbf differ
