Repository: tika Updated Branches: refs/heads/2.x 8d24e07fb -> e855648af
TIKA-1513 -- update mime type according to Nick Burch's recommendation, other small import clean up Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/15ec358c Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/15ec358c Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/15ec358c Branch: refs/heads/2.x Commit: 15ec358c44867adc44ab0431960d565b3d8a3e2c Parents: 8d24e07 Author: tballison <[email protected]> Authored: Thu May 26 10:04:55 2016 -0400 Committer: tballison <[email protected]> Committed: Thu May 26 10:04:55 2016 -0400 ---------------------------------------------------------------------- .../org/apache/tika/parser/dbf/DBFParser.java | 7 +-- .../org/apache/tika/parser/dbf/DBFReader.java | 59 +++++++++++++------- .../apache/tika/parser/dbf/DBFParserTest.java | 7 +-- 3 files changed, 43 insertions(+), 30 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/15ec358c/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/dbf/DBFParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/dbf/DBFParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/dbf/DBFParser.java index 7200da3..f8fa1a2 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/dbf/DBFParser.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/dbf/DBFParser.java @@ -49,7 +49,6 @@ import java.util.Set; */ public class DBFParser extends AbstractParser { - public static final String DBF_VERSION_MIME_ATTRIBUTE = "dbf_version"; private static final int ROWS_TO_BUFFER_FOR_CHARSET_DETECTION = 10; private static final int MAX_CHARS_FOR_CHARSET_DETECTION = 20000; private static final 
Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1; @@ -67,9 +66,7 @@ public class DBFParser extends AbstractParser { ParseContext context) throws IOException, SAXException, TikaException { DBFReader reader = DBFReader.open(stream); DBFFileHeader header = reader.getHeader(); - - metadata.set(Metadata.CONTENT_TYPE, "application/x-dbf; "+ - DBF_VERSION_MIME_ATTRIBUTE+"="+header.getVersion().getName()); + metadata.set(Metadata.CONTENT_TYPE, header.getVersion().getFullMimeString()); //insert metadata here Calendar lastModified = header.getLastModified(); @@ -81,7 +78,7 @@ public class DBFParser extends AbstractParser { List<DBFRow> firstRows = new LinkedList<>(); DBFRow row = reader.next(); int i = 0; - while(row != null && i++ < ROWS_TO_BUFFER_FOR_CHARSET_DETECTION) { + while (row != null && i++ < ROWS_TO_BUFFER_FOR_CHARSET_DETECTION) { firstRows.add(row.deepCopy()); row = reader.next(); } http://git-wip-us.apache.org/repos/asf/tika/blob/15ec358c/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/dbf/DBFReader.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/dbf/DBFReader.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/dbf/DBFReader.java index 961244a..674e238 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/dbf/DBFReader.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/dbf/DBFReader.java @@ -33,7 +33,7 @@ import java.util.concurrent.ConcurrentHashMap; * is mutable and will change as the reader iterates over new rows. 
* <p> * This is based on: <a href="http://web.archive.org/web/20150323061445/http://ulisse.elettra.trieste.it/services/doc/dbase/DBFstruct.htm"> - * http://ulisse.elettra.trieste.it/services/doc/dbase/DBFstruct.htm</a> + * http://ulisse.elettra.trieste.it/services/doc/dbase/DBFstruct.htm</a> * <p> * This is designed to separate out Tika-specific code so that it can * be copied/pasted as a standalone if desired. @@ -46,39 +46,56 @@ class DBFReader { enum Version { - - FOXBASE(0x02, "FoxBASE"), - FOXBASE_PLUS(0x03, "FoxBASE_plus"), - VISUAL_FOXPRO(0x30, "Visual_FoxPro"), - VISUAL_FOXPRO_AUTOINCREMENT(0x31, "Visual_FoxPro_autoincrement"), - VISUAL_FOXPRO_VAR(0x32, "Visual_FoxPro_with_Varchar_or_Varbinary"), - DBASE_IV_SQL_TABLE(0x43, "dBASE_IV_SQL_table"), - DBASE_IV_SQL_SYSTEM(0x63, "dBASE_IV_SQL_system"), - FOX_BASE_PLUS_WITH_MEMO(0x83, "FoxBASE_plus_with_memo"), - DBASE_IV_WITH_MEMO(0x8B, "dBASE_IV_with_memo"), - DBASE_IV_SQL_TABLE_WITH_MEMO(0xCB, "dBASE_IV_SQL_table_with_memo"), - FOXPRO_2x_WITH_MEMO(0xF5, "FoxPro_2.x_with_memo"), - HIPER_SIZ_WITH_SMT_MEMO(0xE5, "HiPer-Siz_with_SMT_memo"), - FOXBASE2(0xFB, "FoxBASE"); + FOXBASE(0x02, "FoxBASE", ""), + FOXBASE_PLUS(0x03, "FoxBASE_plus", ""), + VISUAL_FOXPRO(0x30, "Visual_FoxPro", ""), + VISUAL_FOXPRO_AUTOINCREMENT(0x31, "Visual_FoxPro", "autoincrement"), + VISUAL_FOXPRO_VAR(0x32, "Visual_FoxPro", "Varchar_or_Varbinary"), + DBASE_IV_SQL_TABLE(0x43, "dBASE_IV_SQL", "table"), + DBASE_IV_SQL_SYSTEM(0x63, "dBASE_IV_SQL", "system"), + FOX_BASE_PLUS_WITH_MEMO(0x83, "FoxBASE_plus", "memo"), + DBASE_IV_WITH_MEMO(0x8B, "dBASE_IV", "memo"), + DBASE_IV_SQL_TABLE_WITH_MEMO(0xCB, "dBASE_IV_SQL", "table_with_memo"), + FOXPRO_2x_WITH_MEMO(0xF5, "FoxPro_2.x", "memo"), + HIPER_SIZ_WITH_SMT_MEMO(0xE5, "HiPer-Siz", "SMT_memo"), + FOXBASE2(0xFB, "FoxBASE", ""); private final int id; - private final String name; + private final String format; + private final String type; - Version(int id, String name) { + Version(int id, String format, 
String type) { this.id = id; - this.name = name; + this.format = format; + this.type = type; } int getId() { return id; } - String getName() { - return name; + String getFormat() { + return format; + } + + String getType() { + return type; } - }; + + String getFullMimeString() { + StringBuilder sb = new StringBuilder(); + sb.append("application/x-dbf; ").append("format=").append(getFormat()); + if (!"".equals(type)) { + sb.append("; type=").append(getType()); + } + return sb.toString(); + } + } + + ; private static final Map<Integer, Version> VERSION_MAP = new ConcurrentHashMap<>(); + static { for (Version version : Version.values()) { VERSION_MAP.put(version.id, version); http://git-wip-us.apache.org/repos/asf/tika/blob/15ec358c/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java index a531c55..202c8c8 100644 --- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java +++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/dbf/DBFParserTest.java @@ -39,7 +39,7 @@ public class DBFParserTest extends TikaTest { @Test public void testBasic() throws Exception { XMLResult r = getXML("testDBF.dbf"); - assertEquals("application/x-dbf; dbf_version=FoxBASE_plus", r.metadata.get(Metadata.CONTENT_TYPE)); + assertEquals(DBFReader.Version.FOXBASE_PLUS.getFullMimeString(), r.metadata.get(Metadata.CONTENT_TYPE)); assertEquals("2016-05-24T00:00:00Z", r.metadata.get(TikaCoreProperties.MODIFIED)); assertEquals("UTF-8", r.metadata.get(Metadata.CONTENT_ENCODING)); @@ -64,7 +64,7 @@ public class DBFParserTest extends TikaTest { @Test public void 
testGB18030Encoded() throws Exception { XMLResult r = getXML("testDBF_gb18030.dbf"); - assertEquals("application/x-dbf; dbf_version=FoxBASE_plus", r.metadata.get(Metadata.CONTENT_TYPE)); + assertEquals(DBFReader.Version.FOXBASE_PLUS.getFullMimeString(), r.metadata.get(Metadata.CONTENT_TYPE)); assertContains("虽然该", r.xml); } @@ -131,8 +131,7 @@ public class DBFParserTest extends TikaTest { //this cast happens to work because of the range of possible values bytes[0] = (byte)version.getId(); XMLResult r = getXML(TikaInputStream.get(bytes), new AutoDetectParser(), new Metadata()); - assertEquals("application/x-dbf; "+ - DBFParser.DBF_VERSION_MIME_ATTRIBUTE+"="+version.getName(), r.metadata.get(Metadata.CONTENT_TYPE)); + assertEquals(version.getFullMimeString(), r.metadata.get(Metadata.CONTENT_TYPE)); } }
