Repository: tika Updated Branches: refs/heads/master b47f162a5 -> dcaeccbab
TIKA-1513 -- update mime type according to Nick Burch's recommendation, other small import clean up Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/dcaeccba Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/dcaeccba Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/dcaeccba Branch: refs/heads/master Commit: dcaeccbab69519811e0cdf349873ce2b51e6ca10 Parents: b47f162 Author: tballison <[email protected]> Authored: Thu May 26 10:00:30 2016 -0400 Committer: tballison <[email protected]> Committed: Thu May 26 10:00:30 2016 -0400 ---------------------------------------------------------------------- .../org/apache/tika/parser/dbf/DBFParser.java | 16 +++--- .../org/apache/tika/parser/dbf/DBFReader.java | 58 +++++++++++++------- .../apache/tika/parser/dbf/DBFParserTest.java | 24 +++----- 3 files changed, 54 insertions(+), 44 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/dcaeccba/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFParser.java index 6b77d3d..d4f2bda 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFParser.java @@ -16,9 +16,6 @@ */ package org.apache.tika.parser.dbf; -import org.apache.fontbox.encoding.Encoding; -import org.apache.tika.detect.AutoDetectReader; -import org.apache.tika.detect.Detector; import org.apache.tika.detect.EncodingDetector; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; @@ -38,7 +35,11 @@ import java.io.IOException; import java.io.InputStream; import java.nio.charset.Charset; import 
java.nio.charset.StandardCharsets; -import java.util.*; +import java.util.Calendar; +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; /** * This is a Tika wrapper around the DBFReader. @@ -50,7 +51,6 @@ import java.util.*; */ public class DBFParser extends AbstractParser { - public static final String DBF_VERSION_MIME_ATTRIBUTE = "dbf_version"; private static final int ROWS_TO_BUFFER_FOR_CHARSET_DETECTION = 10; private static final int MAX_CHARS_FOR_CHARSET_DETECTION = 20000; private static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1; @@ -68,9 +68,7 @@ public class DBFParser extends AbstractParser { ParseContext context) throws IOException, SAXException, TikaException { DBFReader reader = DBFReader.open(stream); DBFFileHeader header = reader.getHeader(); - - metadata.set(Metadata.CONTENT_TYPE, "application/x-dbf; "+ - DBF_VERSION_MIME_ATTRIBUTE+"="+header.getVersion().getName()); + metadata.set(Metadata.CONTENT_TYPE, header.getVersion().getFullMimeString()); //insert metadata here Calendar lastModified = header.getLastModified(); @@ -82,7 +80,7 @@ public class DBFParser extends AbstractParser { List<DBFRow> firstRows = new LinkedList<>(); DBFRow row = reader.next(); int i = 0; - while(row != null && i++ < ROWS_TO_BUFFER_FOR_CHARSET_DETECTION) { + while (row != null && i++ < ROWS_TO_BUFFER_FOR_CHARSET_DETECTION) { firstRows.add(row.deepCopy()); row = reader.next(); } http://git-wip-us.apache.org/repos/asf/tika/blob/dcaeccba/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFReader.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFReader.java b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFReader.java index 961244a..b70898b 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFReader.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFReader.java @@ 
-33,7 +33,7 @@ import java.util.concurrent.ConcurrentHashMap; * is mutable and will change as the reader iterates over new rows. * <p> * This is based on: <a href="http://web.archive.org/web/20150323061445/http://ulisse.elettra.trieste.it/services/doc/dbase/DBFstruct.htm"> - * http://ulisse.elettra.trieste.it/services/doc/dbase/DBFstruct.htm</a> + * http://ulisse.elettra.trieste.it/services/doc/dbase/DBFstruct.htm</a> * <p> * This is designed to separate out Tika-specific code so that it can * be copied/pasted as a standalone if desired. @@ -47,38 +47,56 @@ class DBFReader { enum Version { - FOXBASE(0x02, "FoxBASE"), - FOXBASE_PLUS(0x03, "FoxBASE_plus"), - VISUAL_FOXPRO(0x30, "Visual_FoxPro"), - VISUAL_FOXPRO_AUTOINCREMENT(0x31, "Visual_FoxPro_autoincrement"), - VISUAL_FOXPRO_VAR(0x32, "Visual_FoxPro_with_Varchar_or_Varbinary"), - DBASE_IV_SQL_TABLE(0x43, "dBASE_IV_SQL_table"), - DBASE_IV_SQL_SYSTEM(0x63, "dBASE_IV_SQL_system"), - FOX_BASE_PLUS_WITH_MEMO(0x83, "FoxBASE_plus_with_memo"), - DBASE_IV_WITH_MEMO(0x8B, "dBASE_IV_with_memo"), - DBASE_IV_SQL_TABLE_WITH_MEMO(0xCB, "dBASE_IV_SQL_table_with_memo"), - FOXPRO_2x_WITH_MEMO(0xF5, "FoxPro_2.x_with_memo"), - HIPER_SIZ_WITH_SMT_MEMO(0xE5, "HiPer-Siz_with_SMT_memo"), - FOXBASE2(0xFB, "FoxBASE"); + FOXBASE(0x02, "FoxBASE", ""), + FOXBASE_PLUS(0x03, "FoxBASE_plus", ""), + VISUAL_FOXPRO(0x30, "Visual_FoxPro", ""), + VISUAL_FOXPRO_AUTOINCREMENT(0x31, "Visual_FoxPro", "autoincrement"), + VISUAL_FOXPRO_VAR(0x32, "Visual_FoxPro", "Varchar_or_Varbinary"), + DBASE_IV_SQL_TABLE(0x43, "dBASE_IV_SQL", "table"), + DBASE_IV_SQL_SYSTEM(0x63, "dBASE_IV_SQL", "system"), + FOX_BASE_PLUS_WITH_MEMO(0x83, "FoxBASE_plus", "memo"), + DBASE_IV_WITH_MEMO(0x8B, "dBASE_IV", "memo"), + DBASE_IV_SQL_TABLE_WITH_MEMO(0xCB, "dBASE_IV_SQL", "table_with_memo"), + FOXPRO_2x_WITH_MEMO(0xF5, "FoxPro_2.x", "memo"), + HIPER_SIZ_WITH_SMT_MEMO(0xE5, "HiPer-Siz", "SMT_memo"), + FOXBASE2(0xFB, "FoxBASE", ""); private final int id; - private final String name; 
+ private final String format; + private final String type; - Version(int id, String name) { + Version(int id, String format, String type) { this.id = id; - this.name = name; + this.format = format; + this.type = type; } int getId() { return id; } - String getName() { - return name; + String getFormat() { + return format; } - }; + + String getType() { + return type; + } + + String getFullMimeString() { + StringBuilder sb = new StringBuilder(); + sb.append("application/x-dbf; ").append("format=").append(getFormat()); + if (!"".equals(type)) { + sb.append("; type=").append(getType()); + } + return sb.toString(); + } + } + + ; private static final Map<Integer, Version> VERSION_MAP = new ConcurrentHashMap<>(); + static { for (Version version : Version.values()) { VERSION_MAP.put(version.id, version); http://git-wip-us.apache.org/repos/asf/tika/blob/dcaeccba/tika-parsers/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java index 9d15e44..3ab043b 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java @@ -22,17 +22,12 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; -import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; -import org.apache.tika.sax.BodyContentHandler; import org.junit.Test; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; @@ 
-44,7 +39,7 @@ public class DBFParserTest extends TikaTest { @Test public void testBasic() throws Exception { XMLResult r = getXML("testDBF.dbf"); - assertEquals("application/x-dbf; dbf_version=FoxBASE_plus", r.metadata.get(Metadata.CONTENT_TYPE)); + assertEquals(DBFReader.Version.FOXBASE_PLUS.getFullMimeString(), r.metadata.get(Metadata.CONTENT_TYPE)); assertEquals("2016-05-24T00:00:00Z", r.metadata.get(TikaCoreProperties.MODIFIED)); assertEquals("UTF-8", r.metadata.get(Metadata.CONTENT_ENCODING)); @@ -69,7 +64,7 @@ public class DBFParserTest extends TikaTest { @Test public void testGB18030Encoded() throws Exception { XMLResult r = getXML("testDBF_gb18030.dbf"); - assertEquals("application/x-dbf; dbf_version=FoxBASE_plus", r.metadata.get(Metadata.CONTENT_TYPE)); + assertEquals(DBFReader.Version.FOXBASE_PLUS.getFullMimeString(), r.metadata.get(Metadata.CONTENT_TYPE)); assertContains("虽然该", r.xml); } @@ -80,8 +75,8 @@ public class DBFParserTest extends TikaTest { for (int i = 1; i < 129; i++) { try { XMLResult r = getXML(truncate("testDBF.dbf", i), p, new Metadata()); - fail("Should have thrown exception for truncation in header: "+i); - } catch (IOException|TikaException e) { + fail("Should have thrown exception for truncation in header: " + i); + } catch (IOException | TikaException e) { //ok -- expected } catch (Throwable e) { fail("Should only throw IOExceptions or TikaExceptions"); @@ -91,8 +86,8 @@ public class DBFParserTest extends TikaTest { for (int i = 129; i < 204; i++) { try { XMLResult r = getXML(truncate("testDBF.dbf", i), p, new Metadata()); - } catch (IOException|TikaException e) { - fail("Shouldn't have thrown exception for truncation while reading cells: "+i); + } catch (IOException | TikaException e) { + fail("Shouldn't have thrown exception for truncation while reading cells: " + i); e.printStackTrace(); } } @@ -134,10 +129,9 @@ public class DBFParserTest extends TikaTest { for (DBFReader.Version version : DBFReader.Version.values()) { //this 
cast happens to work because of the range of possible values - bytes[0] = (byte)version.getId(); + bytes[0] = (byte) version.getId(); XMLResult r = getXML(TikaInputStream.get(bytes), new AutoDetectParser(), new Metadata()); - assertEquals("application/x-dbf; "+ - DBFParser.DBF_VERSION_MIME_ATTRIBUTE+"="+version.getName(), r.metadata.get(Metadata.CONTENT_TYPE)); + assertEquals(version.getFullMimeString(), r.metadata.get(Metadata.CONTENT_TYPE)); } } @@ -156,7 +150,7 @@ commented out until we get permission to add the test file InputStream truncate(String testFileName, int length) throws IOException { byte[] bytes = new byte[length]; - try (InputStream is = getResourceAsStream("/test-documents/"+testFileName)) { + try (InputStream is = getResourceAsStream("/test-documents/" + testFileName)) { IOUtils.readFully(is, bytes); } return new ByteArrayInputStream(bytes);
