[1/3] tika git commit: TIKA-1513 -- add mime detection and parsing for dbf files. Thanks to Nick C for the mime definition and Luis Filipe Nassif for collaboration.

tallison Wed, 25 May 2016 09:48:32 -0700

Repository: tika
Updated Branches:
  refs/heads/master fc4f13dde -> cb492f4b1



TIKA-1513 -- add mime detection and parsing for dbf files. Thanks to Nick C for 
the mime definition and Luis Filipe Nassif for collaboration.


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/e74f6637
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/e74f6637
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/e74f6637

Branch: refs/heads/master
Commit: e74f66375f20d914f8585597b6d9492586a0caa9
Parents: bb46c0e
Author: tballison <[email protected]>
Authored: Wed May 25 12:29:00 2016 -0400
Committer: tballison <[email protected]>
Committed: Wed May 25 12:29:00 2016 -0400

----------------------------------------------------------------------
 .../org/apache/tika/mime/tika-mimetypes.xml     |   9 +
 .../org/apache/tika/parser/dbf/DBFCell.java     | 116 ++++++++++++
 .../apache/tika/parser/dbf/DBFColumnHeader.java |  86 +++++++++
 .../apache/tika/parser/dbf/DBFFileHeader.java   | 135 +++++++++++++
 .../org/apache/tika/parser/dbf/DBFParser.java   | 150 +++++++++++++++
 .../org/apache/tika/parser/dbf/DBFReader.java   | 189 +++++++++++++++++++
 .../java/org/apache/tika/parser/dbf/DBFRow.java |  62 ++++++
 .../services/org.apache.tika.parser.Parser      |   3 +-
 .../apache/tika/parser/dbf/DBFParserTest.java   | 146 ++++++++++++++
 .../test/resources/test-documents/testDBF.dbf   | Bin 0 -> 890 bytes
 .../test-documents/testDBF_gb18030.dbf          | Bin 0 -> 144 bytes
 11 files changed, 895 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/e74f6637/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------
diff --git 
a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index ca9828c..8a79844 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -115,6 +115,15 @@
   <mime-type type="application/davmount+xml">
     <glob pattern="*.davmount"/>
   </mime-type>
+  <mime-type type="application/x-dbf">
+    <magic priority="100">
+      <match 
value="(?s)^[\\x02\\x03\\x30\\x31\\x32\\x43\\x63\\x83\\x8B\\xCB\\xF5\\xE5\\xFB].[\\x01-\\x0C][\\x01-\\x1F].{4}(?:.[^\\x00]|[\\x41-\\xFF].)(?:[^\\x00\\x01].|.[^\\x00]).{31}(?&lt;=[\\x00][^\\x00]{0,10})[A-Z@+]"
 type="regex" offset="0"/>
+    </magic>
+    <glob pattern="*.dbf"/>
+    <glob pattern="*.dbase"/>
+    <glob pattern="*.dbase3"/>
+  </mime-type>
+
   <mime-type type="application/dca-rft"/>
   <mime-type type="application/dec-dx"/>
   <mime-type type="application/dialog-info+xml"/>

http://git-wip-us.apache.org/repos/asf/tika/blob/e74f6637/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFCell.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFCell.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFCell.java
new file mode 100644
index 0000000..2a7dc26
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFCell.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.dbf;
+
+import org.apache.commons.io.IOUtils;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Locale;
+
+class DBFCell {
+
+    private final DBFColumnHeader.ColType colType;
+    private final byte[] bytes;
+    private final int decimalCount;
+    int bytesReadLast = 0;
+
+    DBFCell(DBFColumnHeader.ColType colType, int fieldLength, int 
decimalCount) {
+        this.colType = colType;
+        this.decimalCount = decimalCount;
+        this.bytes = new byte[fieldLength];
+    }
+
+    String getString(Charset charset) {
+        switch (colType) {
+            case C:
+                return new String(getBytes(), charset).trim();
+            case D:
+                return getFormattedDate();
+            case N:
+                return new String(getBytes(), 
StandardCharsets.US_ASCII).trim();
+            case L:
+                return new String(getBytes(), 
StandardCharsets.US_ASCII).trim();
+            default:
+                //TODO: find examples of other cell types for testing
+                return new String(getBytes(), 
StandardCharsets.US_ASCII).trim();
+        }
+    }
+
+    //returns whether any content was read
+    boolean read(InputStream is) throws IOException {
+        bytesReadLast = IOUtils.read(is, bytes);
+        if (DBFReader.STRICT && bytesReadLast != bytes.length) {
+            throw new IOException("Truncated record, only read "+bytesReadLast+
+                    " bytes, but should have read: "+bytes.length);
+        }
+        return bytesReadLast > 0;
+    }
+
+    /**
+     *
+     * @return copy of bytes that were read on the last read
+     */
+    byte[] getBytes() {
+        byte[] ret = new byte[bytesReadLast];
+        System.arraycopy(bytes, 0, ret, 0, bytesReadLast);
+        return ret;
+    }
+
+    DBFColumnHeader.ColType getColType() {
+        return colType;
+    }
+
+    @Override
+    public String toString() {
+        return "DBFCell{" +
+                "colType=" + colType +
+                ", bytes=" + Arrays.toString(bytes) +
+                ", decimalCount=" + decimalCount +
+                '}';
+    }
+
+    DBFCell deepCopy() {
+        DBFCell cell = new DBFCell(colType, bytes.length, decimalCount);
+        cell.bytesReadLast = this.bytesReadLast;
+        System.arraycopy(this.bytes, 0, cell.bytes, 0, bytesReadLast);
+        return cell;
+    }
+
+    private String getFormattedDate() {
+        byte[] dateBytes = getBytes();
+        if (dateBytes.length < 8) {
+            return "";
+        }
+        String year = new String(dateBytes, 0, 4, StandardCharsets.US_ASCII);
+        String month = new String(dateBytes, 4, 2, StandardCharsets.US_ASCII);
+        String day = new String(dateBytes, 6, 2, StandardCharsets.US_ASCII);
+        //test to see that these values make any sense
+        for (String s : new String[]{year, month, day}) {
+            try {
+                Integer.parseInt(s);
+            } catch (NumberFormatException e) {
+                return "";
+            }
+        }
+        return String.format(Locale.ROOT,
+                "%s/%s/%s", month, day, year);
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/e74f6637/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFColumnHeader.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFColumnHeader.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFColumnHeader.java
new file mode 100644
index 0000000..ff6353a
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFColumnHeader.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.dbf;
+
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+import static org.apache.tika.parser.dbf.DBFColumnHeader.ColType.AT;
+import static org.apache.tika.parser.dbf.DBFColumnHeader.ColType.PLUS;
+
+class DBFColumnHeader {
+
+    //from: http://www.dbf2002.com/dbf-file-format.html
+    enum ColType {
+        C,//character
+        Y,//currency
+        D,//date
+        T,//datetime
+        B,//double
+        I,//integer,
+        G,//general
+        P,//picture
+        F,//floating point binary numeric
+        L,//logical
+        M,//memo
+        N,//binary coded decimal numeric
+        PLUS, //autoincrement
+        AT, //timestamp dbase level 7
+        O, //double
+    }
+    private final static Map<Integer, ColType> COL_TYPE_MAP =
+            new ConcurrentHashMap<>();
+
+    static {
+        for (ColType type : ColType.values()) {
+            if (type.equals(PLUS)) {
+                COL_TYPE_MAP.put((int)'+', PLUS);
+            } else if (type.equals(AT)) {
+                COL_TYPE_MAP.put((int)'@', AT);
+            } else {
+                COL_TYPE_MAP.put((int) type.toString().charAt(0), type);
+            }
+        }
+    }
+
+    String name;
+    private ColType colType = null;
+    int fieldLength = -1;
+    int decimalCount = -1;
+
+    public void setType(int type) {
+        colType = COL_TYPE_MAP.get(type);
+        if (colType == null) {
+            throw new IllegalArgumentException("Unrecognized column type for 
column: " + name +
+                    ". I regret I don't recognize: " + (char) type);
+        }
+    }
+
+    ColType getColType() {
+        return colType;
+    }
+
+    @Override
+    public String toString() {
+        return "DBFColumnHeader{" +
+                "name='" + name + '\'' +
+                ", colType=" + colType +
+                ", fieldLength=" + fieldLength +
+                ", decimalCount=" + decimalCount +
+                '}';
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/e74f6637/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFFileHeader.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFFileHeader.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFFileHeader.java
new file mode 100644
index 0000000..8e376bc
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFFileHeader.java
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.dbf;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.EndianUtils;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.*;
+
+class DBFFileHeader {
+
+    private DBFReader.Version version;
+    private Calendar lastModified;
+    private int numRecords = -1;
+    private short numBytesInHeader;
+    private short numBytesInRecord;
+    private DBFColumnHeader[] cols;
+
+    public static DBFFileHeader parse(InputStream is) throws IOException, 
TikaException {
+        DBFFileHeader header = new DBFFileHeader();
+
+        int firstByte = is.read();
+        header.version = DBFReader.getVersion(firstByte);
+        if (header.version == null) {
+            throw new TikaException("Unrecognized first byte in DBFFile: " + 
firstByte);
+        }
+        int lastModYear = is.read();
+        int lastModMonth = is.read();
+        int lastModDay = is.read();
+        Calendar now = GregorianCalendar.getInstance(
+                TimeZone.getTimeZone("UTC"), Locale.ROOT);
+
+        //if the two-digit year read as 20xx would be later than the current
+        //year, assume the file was last modified in the 1900s (19xx)
+        if (lastModYear + 2000 > now.get(Calendar.YEAR)) {
+            lastModYear += 1900;
+        } else {
+            lastModYear += 2000;
+        }
+        Calendar lastModified = new GregorianCalendar(
+                TimeZone.getTimeZone("UTC"), Locale.ROOT);
+        lastModified.set(lastModYear, lastModMonth - 1, lastModDay,0,0,0);
+        header.lastModified = lastModified;
+
+        header.numRecords = EndianUtils.readIntLE(is);
+        header.numBytesInHeader = EndianUtils.readShortLE(is);
+        header.numBytesInRecord = EndianUtils.readShortLE(is);
+        IOUtils.skipFully(is, 20);//TODO: can get useful info out of here
+
+        int numCols = (header.numBytesInHeader - 32) / 32;
+
+        header.cols = new DBFColumnHeader[numCols];
+        for (int i = 0; i < numCols; i++) {
+            header.cols[i] = readCol(is);
+        }
+        int endOfHeader = is.read();
+        if (endOfHeader != 13) {
+            throw new TikaException("Expected new line at end of header");
+        }
+        long totalReadSoFar = 32 + (numCols * 32) + 1;
+        //there can be extra bytes in the header
+        long extraHeaderBytes = header.numBytesInHeader - totalReadSoFar;
+        IOUtils.skipFully(is, extraHeaderBytes);
+        return header;
+    }
+
+    private static DBFColumnHeader readCol(InputStream is) throws IOException, 
TikaException {
+        byte[] headerName = new byte[11];
+        IOUtils.readFully(is, headerName);
+        DBFColumnHeader col = new DBFColumnHeader();
+        headerName = DBFReader.trim(headerName);
+        col.name = new String(headerName, StandardCharsets.US_ASCII);
+        int colType = is.read();
+        if (colType < 0) {
+            throw new IOException("File truncated before coltype in header");
+        }
+        col.setType(colType);
+        IOUtils.skipFully(is, 4);//field data address
+        col.fieldLength = is.read();
+        if (col.fieldLength < 0) {
+            throw new TikaException("Field length for column "+headerName+"is 
< 0");
+        } else if (col.fieldLength > DBFReader.MAX_FIELD_LENGTH) {
+            throw new TikaException("Field length ("+col.fieldLength+") is 
greater than DBReader.MAX_FIELD_LENGTH ("+
+                    DBFReader.MAX_FIELD_LENGTH+")");
+        }
+        col.decimalCount = is.read();
+        IOUtils.skipFully(is, 14); //TODO: might have useful info in some 
versions
+        return col;
+    }
+
+    DBFColumnHeader[] getCols() {
+        return cols;
+    }
+
+    int getNumRecords() {
+        return numRecords;
+    }
+
+    Calendar getLastModified() {
+        return lastModified;
+    }
+
+    DBFReader.Version getVersion() {
+        return version;
+    }
+
+    @Override
+    public String toString() {
+        return "DBFFileHeader{" +
+                "lastModified=" + lastModified +
+                ", numRecords=" + numRecords +
+                ", numBytesInHeader=" + numBytesInHeader +
+                ", numBytesInRecord=" + numBytesInRecord +
+                ", cols=" + Arrays.toString(cols) +
+                '}';
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/e74f6637/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFParser.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFParser.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFParser.java
new file mode 100644
index 0000000..35f3b12
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFParser.java
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.dbf;
+
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.*;
+
+/**
+ * This is a Tika wrapper around the DBFReader.
+ * <p>
+ * This reads many dbase3 file variants (not DBASE 7, yet!).
+ * <p>
+ * It caches the first 10 rows and then runs encoding detection
+ * on the "character" cells.
+ */
+public class DBFParser extends AbstractParser {
+
+    public static final String DBF_VERSION_MIME_ATTRIBUTE = "dbf_version";
+    private static final int ROWS_TO_BUFFER_FOR_CHARSET_DETECTION = 10;
+    private static final int MAX_CHARS_FOR_CHARSET_DETECTION = 20000;
+    private static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.application("x-dbf"));
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata 
metadata,
+                      ParseContext context) throws IOException, SAXException, 
TikaException {
+        DBFReader reader = DBFReader.open(stream);
+        DBFFileHeader header = reader.getHeader();
+
+        metadata.set(Metadata.CONTENT_TYPE, "application/x-dbf; "+
+                DBF_VERSION_MIME_ATTRIBUTE+"="+header.getVersion().getName());
+
+        //insert metadata here
+        Calendar lastModified = header.getLastModified();
+        if (lastModified != null) {
+            metadata.set(TikaCoreProperties.MODIFIED, lastModified);
+        }
+
+        //buffer first X rows for charset detection
+        List<DBFRow> firstRows = new LinkedList<>();
+        DBFRow row = reader.next();
+        int i = 0;
+        while(row != null && i++ < ROWS_TO_BUFFER_FOR_CHARSET_DETECTION) {
+            firstRows.add(row.deepCopy());
+            row = reader.next();
+        }
+
+        Charset charset = getCharset(firstRows, header);
+        metadata.set(Metadata.CONTENT_ENCODING, charset.toString());
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.startElement("table");
+        xhtml.startElement("thead");
+        for (DBFColumnHeader col : header.getCols()) {
+            xhtml.startElement("th");
+            xhtml.characters(col.name);
+            xhtml.endElement("th");
+        }
+        xhtml.endElement("thead");
+
+        xhtml.startElement("tbody");
+
+        //now write cached rows
+        while (firstRows.size() > 0) {
+            DBFRow cachedRow = firstRows.remove(0);
+            writeRow(cachedRow, charset, xhtml);
+        }
+
+        //now continue with rest
+        while (row != null) {
+            writeRow(row, charset, xhtml);
+            row = reader.next();
+        }
+        xhtml.endElement("tbody");
+        xhtml.endElement("table");
+        xhtml.endDocument();
+    }
+
+    private Charset getCharset(List<DBFRow> firstRows, DBFFileHeader header) 
throws IOException,
+            TikaException {
+        //TODO: potentially use codepage info in the header
+        Charset charset = DEFAULT_CHARSET;
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        for (DBFRow row : firstRows) {
+            for (DBFCell cell : row.cells) {
+                if (cell.getColType().equals(DBFColumnHeader.ColType.C)) {
+                    byte[] bytes = cell.getBytes();
+                    bos.write(bytes);
+                    if (bos.size() > MAX_CHARS_FOR_CHARSET_DETECTION) {
+                        break;
+                    }
+                }
+            }
+        }
+        byte[] bytes = bos.toByteArray();
+        if (bytes.length > 20) {
+            charset = new 
AutoDetectReader(TikaInputStream.get(bytes)).getCharset();
+        }
+        return charset;
+    }
+
+    private void writeRow(DBFRow row, Charset charset, XHTMLContentHandler 
xhtml) throws SAXException {
+        xhtml.startElement("tr");
+        for (DBFCell cell : row.cells) {
+            xhtml.startElement("td");
+            xhtml.characters(cell.getString(charset));
+            xhtml.endElement("td");
+        }
+        xhtml.endElement("tr");
+
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/e74f6637/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFReader.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFReader.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFReader.java
new file mode 100644
index 0000000..961244a
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFReader.java
@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.dbf;
+
+import org.apache.tika.exception.TikaException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+/**
+ * This reads many dbase3 file variants (not DBASE 7, yet!).
+ * This parses the header on open.  The client
+ * should get a row and then iterate until next() returns null.
+ * Be careful to deepCopy the row (if caching) because the row
+ * is mutable and will change as the reader iterates over new rows.
+ * <p>
+ * This is based on: <a 
href="http://web.archive.org/web/20150323061445/http://ulisse.elettra.trieste.it/services/doc/dbase/DBFstruct.htm">
+ *     http://ulisse.elettra.trieste.it/services/doc/dbase/DBFstruct.htm</a>
+ * <p>
+ * This is designed to separate out Tika-specific code so that it can
+ * be copied/pasted as a standalone if desired.
+ */
+
+class DBFReader {
+
+    public static final int MAX_FIELD_LENGTH = 66000;
+    public static boolean STRICT = false;
+
+
+    enum Version {
+
+        FOXBASE(0x02, "FoxBASE"),
+        FOXBASE_PLUS(0x03, "FoxBASE_plus"),
+        VISUAL_FOXPRO(0x30, "Visual_FoxPro"),
+        VISUAL_FOXPRO_AUTOINCREMENT(0x31, "Visual_FoxPro_autoincrement"),
+        VISUAL_FOXPRO_VAR(0x32, "Visual_FoxPro_with_Varchar_or_Varbinary"),
+        DBASE_IV_SQL_TABLE(0x43, "dBASE_IV_SQL_table"),
+        DBASE_IV_SQL_SYSTEM(0x63, "dBASE_IV_SQL_system"),
+        FOX_BASE_PLUS_WITH_MEMO(0x83, "FoxBASE_plus_with_memo"),
+        DBASE_IV_WITH_MEMO(0x8B, "dBASE_IV_with_memo"),
+        DBASE_IV_SQL_TABLE_WITH_MEMO(0xCB, "dBASE_IV_SQL_table_with_memo"),
+        FOXPRO_2x_WITH_MEMO(0xF5, "FoxPro_2.x_with_memo"),
+        HIPER_SIZ_WITH_SMT_MEMO(0xE5, "HiPer-Siz_with_SMT_memo"),
+        FOXBASE2(0xFB, "FoxBASE");
+
+        private final int id;
+        private final String name;
+
+        Version(int id, String name) {
+            this.id = id;
+            this.name = name;
+        }
+
+        int getId() {
+            return id;
+        }
+
+        String getName() {
+            return name;
+        }
+    };
+
+    private static final Map<Integer, Version> VERSION_MAP = new 
ConcurrentHashMap<>();
+    static {
+        for (Version version : Version.values()) {
+            VERSION_MAP.put(version.id, version);
+        }
+    }
+
+    static DBFReader open(InputStream is) throws IOException, TikaException {
+        return new DBFReader(is);
+    }
+
+    //can return null!
+    static Version getVersion(int b) {
+        return VERSION_MAP.get(b);
+    }
+
+    private final DBFFileHeader header;
+    private final InputStream is;
+    private DBFRow currRow = null;
+    private Charset charset = StandardCharsets.US_ASCII;
+
+    private DBFReader(InputStream is) throws IOException, TikaException {
+        header = DBFFileHeader.parse(is);
+        this.is = is;
+        currRow = new DBFRow(header);
+    }
+
+
+    /**
+     * Iterate through the rows with this.
+     * <p>
+     * Be careful: the reader reuses the row!  Make sure to call deep copy
+     * if you are buffering rows.
+     *
+     * @return the reader's reused current row, or null if no further record could be read
+     * @throws IOException
+     * @throws TikaException
+     */
+    DBFRow next() throws IOException, TikaException {
+        if (fillRow(currRow)) {
+            return currRow;
+        }
+        return null;
+    }
+
+    //returns whether or not some content was read.
+    //it might not be complete!
+    private boolean fillRow(DBFRow row) throws IOException, TikaException {
+        if (row == null) {
+            return false;
+        }
+        DBFCell[] cells = row.cells;
+        int isDeletedByte = is.read();
+        boolean isDeleted = false;
+        if (isDeletedByte == 32) {
+            //all ok
+        } else if (isDeletedByte == 42) {//asterisk
+            isDeleted = true;
+        } else if (isDeletedByte == 26) {//marker for end of dbf file
+            return false;
+        } else if (isDeletedByte == -1) {//truncated file
+            if (DBFReader.STRICT) {
+                throw new IOException("EOF reached too early");
+            }
+            return false;
+        } else {
+            throw new TikaException("Expecting space or asterisk at beginning 
of record, not:" + isDeletedByte);
+        }
+        row.setDeleted(isDeleted);
+
+        boolean readSomeContent = false;
+        for (int i = 0; i < cells.length; i++) {
+            if (cells[i].read(is)) {
+                readSomeContent = true;
+            }
+        }
+        return readSomeContent;
+    }
+
+    public DBFFileHeader getHeader() {
+        return header;
+    }
+
+    public Charset getCharset() {
+        return charset;
+    }
+
+    /**
+     * removes trailing 0 from byte array
+     *
+     * @param bytes
+     * @return
+     */
+    public static byte[] trim(byte[] bytes) {
+        int end = bytes.length - 1;
+        for (int i = end; i > -1; i--) {
+            if (bytes[i] != 0) {
+                end = i;
+                break;
+            }
+        }
+        if (end == bytes.length - 1) {
+            return bytes;
+        }
+        byte[] ret = new byte[end + 1];
+        System.arraycopy(bytes, 0, ret, 0, end + 1);
+        return ret;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/e74f6637/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFRow.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFRow.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFRow.java
new file mode 100644
index 0000000..6a400d7
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFRow.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.dbf;
+
+import java.util.Arrays;
+
+class DBFRow {
+
+    DBFCell[] cells;
+    private boolean isDeleted = false;
+
+    DBFRow(DBFFileHeader header) {
+        cells = new DBFCell[header.getCols().length];
+        for (int i = 0; i < cells.length; i++) {
+            DBFColumnHeader columnHeader = header.getCols()[i];
+            cells[i] = new DBFCell(columnHeader.getColType(),
+                    columnHeader.fieldLength,
+                    columnHeader.decimalCount);
+        }
+    }
+
+    private DBFRow() {}
+
+    void setDeleted(boolean deleted) {
+        isDeleted = deleted;
+    }
+
+    boolean isDeleted() {
+        return isDeleted;
+    }
+
+    DBFRow deepCopy() {
+        DBFRow row = new DBFRow();
+        row.isDeleted = this.isDeleted;
+        row.cells = new DBFCell[cells.length];
+        for (int i = 0; i < cells.length; i++) {
+            row.cells[i] = cells[i].deepCopy();
+        }
+        return row;
+    }
+
+    @Override
+    public String toString() {
+        return "DBFRow{" +
+                "cells=" + Arrays.toString(cells) +
+                '}';
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/e74f6637/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
 
b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index e3238ef..acb0224 100644
--- 
a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ 
b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -70,4 +70,5 @@ org.apache.tika.parser.geoinfo.GeographicInformationParser
 org.apache.tika.parser.geo.topic.GeoParser
 org.apache.tika.parser.external.CompositeExternalParser
 org.apache.tika.parser.journal.JournalParser
-org.apache.tika.parser.image.ICNSParser
\ No newline at end of file
+org.apache.tika.parser.image.ICNSParser
+org.apache.tika.parser.dbf.DBFParser
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/e74f6637/tika-parsers/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java
new file mode 100644
index 0000000..a531c55
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.dbf;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import static junit.framework.TestCase.assertEquals;
+import static org.junit.Assert.fail;
+
+public class DBFParserTest extends TikaTest {
+
+    @Test
+    public void testBasic() throws Exception {
+        XMLResult r = getXML("testDBF.dbf");
+        assertEquals("application/x-dbf; dbf_version=FoxBASE_plus", 
r.metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("2016-05-24T00:00:00Z", 
r.metadata.get(TikaCoreProperties.MODIFIED));
+        assertEquals("UTF-8", r.metadata.get(Metadata.CONTENT_ENCODING));
+
+        String xml = r.xml.replaceAll("[\\t\\r\\n]", " ");
+        //header
+        assertContains("<thead> <th>TEXT_FIELD</th> <th>NUMERIC_FI</th> 
<th>DATE_FIELD</th></thead>",
+                xml);
+        //look for contents
+        assertContains("æ®ææ¯é¡¿å¤§å¦", xml);
+        assertContains("\u0627\u0645\u0639\u0629", xml);
+        assertContains("05/26/2016", xml);
+        assertContains("<td>4.0</td>", xml);
+        //make sure there is no problem around row 10
+        //where we're buffering
+        assertContains("<td>8.0</td>", xml);
+        assertContains("<td>9.0</td>", xml);
+        assertContains("<td>10.0</td>", xml);
+        assertContains("<td>11.0</td>", xml);
+        assertContains("<td>licour</td>", xml);
+    }
+
+    @Test
+    public void testGB18030Encoded() throws Exception {
+        XMLResult r = getXML("testDBF_gb18030.dbf");
+        assertEquals("application/x-dbf; dbf_version=FoxBASE_plus", 
r.metadata.get(Metadata.CONTENT_TYPE));
+        assertContains("è½ç¶è¯¥", r.xml);
+    }
+
+    @Test
+    public void testTruncated() throws Exception {
+        Parser p = new DBFParser();
+        //should throw exception for truncation in header
+        for (int i = 1; i < 129; i++) {
+            try {
+                XMLResult r = getXML(truncate("testDBF.dbf", i), p, new 
Metadata());
+                fail("Should have thrown exception for truncation in header: 
"+i);
+            } catch (IOException|TikaException e) {
+                //ok -- expected
+            } catch (Throwable e) {
+                fail("Should only throw IOExceptions or TikaExceptions");
+            }
+        }
+        //default don't throw exception for truncation while reading body
+        for (int i = 129; i < 204; i++) {
+            try {
+                XMLResult r = getXML(truncate("testDBF.dbf", i), p, new 
Metadata());
+            } catch (IOException|TikaException e) {
+                fail("Shouldn't have thrown exception for truncation while 
reading cells: "+i);
+                e.printStackTrace();
+            }
+        }
+        try {
+            DBFReader.STRICT = true;
+            //if strict is true throw exception for truncation in body
+            for (int i = 129; i < 204; i++) {
+                try {
+                    XMLResult r = getXML(truncate("testDBF.dbf", i), p, new 
Metadata());
+                    fail("Should have thrown exception for truncation while 
reading cells: " + i);
+                } catch (IOException | TikaException e) {
+                }
+            }
+        } finally {
+            //reset for other tests
+            DBFReader.STRICT = false;
+        }
+    }
+
+    @Test
+    public void testSpecificTruncated() throws Exception {
+        XMLResult r = getXML(truncate("testDBF.dbf", 781), new 
AutoDetectParser(), new Metadata());
+        String xml = r.xml.replaceAll("[\\t\\r\\n]", " ");
+
+        //if you don't keep track of bytes read, you could get content from 
prev row
+        assertNotContained("holt red hath in every", xml);
+        assertNotContained("<td>holt</td> <td>18.0</td>", xml);
+        //check that the last row ends with holt but is correctly formatted
+        assertContains("<td>holt</td> <td /> <td /></tr>", xml);
+    }
+
+    @Test
+    public void testVariants() throws Exception {
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        try (InputStream is = 
getResourceAsStream("/test-documents/testDBF.dbf")) {
+            IOUtils.copy(is, bos);
+        }
+        byte[] bytes = bos.toByteArray();
+
+        for (DBFReader.Version version : DBFReader.Version.values()) {
+            //this cast happens to work because of the range of possible values
+            bytes[0] = (byte)version.getId();
+            XMLResult r = getXML(TikaInputStream.get(bytes), new 
AutoDetectParser(), new Metadata());
+            assertEquals("application/x-dbf; "+
+                    
DBFParser.DBF_VERSION_MIME_ATTRIBUTE+"="+version.getName(), 
r.metadata.get(Metadata.CONTENT_TYPE));
+        }
+    }
+
+    InputStream truncate(String testFileName, int length) throws IOException {
+        byte[] bytes = new byte[length];
+        try (InputStream is = 
getResourceAsStream("/test-documents/"+testFileName)) {
+            IOUtils.readFully(is, bytes);
+        }
+        return new ByteArrayInputStream(bytes);
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/e74f6637/tika-parsers/src/test/resources/test-documents/testDBF.dbf
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testDBF.dbf b/tika-parsers/src/test/resources/test-documents/testDBF.dbf
new file mode 100644
index 0000000..54b8aca
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testDBF.dbf differ

http://git-wip-us.apache.org/repos/asf/tika/blob/e74f6637/tika-parsers/src/test/resources/test-documents/testDBF_gb18030.dbf
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testDBF_gb18030.dbf b/tika-parsers/src/test/resources/test-documents/testDBF_gb18030.dbf
new file mode 100644
index 0000000..fdf20a5
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testDBF_gb18030.dbf differ

[1/3] tika git commit: TIKA-1513 -- add mime detection and parsing for dbf files. Thanks to Nick C for the mime definition and Luis Filipe Nassif for collaboration.

Reply via email to