Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java (original) +++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java Sun Aug 31 19:36:36 2014 @@ -23,6 +23,7 @@ import java.text.SimpleDateFormat; import java.util.Collections; import java.util.Date; import java.util.HashMap; +import java.util.Locale; import java.util.Set; import java.util.TimeZone; @@ -160,7 +161,7 @@ public class IptcAnpaParser implements P } int msgsize = is.read(buf); // read in at least the full data - String message = (new String(buf)).toLowerCase(); + String message = (new String(buf, "UTF-8")).toLowerCase(Locale.ROOT); // these are not if-then-else, because we want to go from most common // and fall through to least. this is imperfect, as these tags could // show up in other agency stories, but i can't find a spec or any @@ -590,7 +591,7 @@ public class IptcAnpaParser implements P --read; } } - if (tmp_line.toLowerCase().startsWith("by") || longline.equals("bdy_author")) { + if (tmp_line.toLowerCase(Locale.ROOT).startsWith("by") || longline.equals("bdy_author")) { longkey = "bdy_author"; // prepend a space to subsequent line, so it gets parsed consistent with the lead line @@ -608,7 +609,7 @@ public class IptcAnpaParser implements P } else if (FORMAT == this.FMT_IPTC_BLM) { String byline = " by "; - if (tmp_line.toLowerCase().contains(byline)) { + if (tmp_line.toLowerCase(Locale.ROOT).contains(byline)) { longkey = "bdy_author"; int term = tmp_line.length(); @@ -617,11 +618,11 @@ public class IptcAnpaParser implements P term = Math.min(term, (tmp_line.contains("\n") ? tmp_line.indexOf("\n") : term)); term = (term > 0 ) ? term : tmp_line.length(); // for bloomberg, the author line sits below their copyright statement - bdy_author += tmp_line.substring(tmp_line.toLowerCase().indexOf(byline) + byline.length(), term) + " "; + bdy_author += tmp_line.substring(tmp_line.toLowerCase(Locale.ROOT).indexOf(byline) + byline.length(), term) + " "; metastarted = true; longline = ((tmp_line.contains("=")) && (!longline.equals(longkey)) ? longkey : ""); } - else if(tmp_line.toLowerCase().startsWith("c.")) { + else if(tmp_line.toLowerCase(Locale.ROOT).startsWith("c.")) { // the author line for bloomberg is a multiline starting with c.2011 Bloomberg News // then containing the author info on the next line if (val_next == TB) { @@ -629,7 +630,7 @@ public class IptcAnpaParser implements P continue; } } - else if(tmp_line.toLowerCase().trim().startsWith("(") && tmp_line.toLowerCase().trim().endsWith(")")) { + else if(tmp_line.toLowerCase(Locale.ROOT).trim().startsWith("(") && tmp_line.toLowerCase(Locale.ROOT).trim().endsWith(")")) { // the author line may have one or more comment lines between the copyright // statement, and the By AUTHORNAME line if (val_next == TB) { @@ -639,7 +640,7 @@ public class IptcAnpaParser implements P } } - else if (tmp_line.toLowerCase().startsWith("eds") || longline.equals("bdy_source")) { + else if (tmp_line.toLowerCase(Locale.ROOT).startsWith("eds") || longline.equals("bdy_source")) { longkey = "bdy_source"; // prepend a space to subsequent line, so it gets parsed consistent with the lead line tmp_line = (longline.equals(longkey) ? " " : "") + tmp_line; @@ -736,14 +737,14 @@ public class IptcAnpaParser implements P // standard reuters format format_in = "HH:mm MM-dd-yy"; } - SimpleDateFormat dfi = new SimpleDateFormat(format_in); + SimpleDateFormat dfi = new SimpleDateFormat(format_in, Locale.ROOT); dfi.setTimeZone(TimeZone.getTimeZone("UTC")); dateunix = dfi.parse(ftr_datetime); } catch (ParseException ep) { // failed, but this will just fall through to setting the date to now } - SimpleDateFormat dfo = new SimpleDateFormat(format_out); + SimpleDateFormat dfo = new SimpleDateFormat(format_out, Locale.ROOT); dfo.setTimeZone(TimeZone.getTimeZone("UTC")); ftr_datetime = dfo.format(dateunix); }
Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java (original) +++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java Sun Aug 31 19:36:36 2014 @@ -16,6 +16,8 @@ */ package org.apache.tika.parser.iwork; +import java.util.Locale; + /** * Utility class to allow for conversion from an integer to Roman numerals * or alpha-numeric symbols in line with Pages auto numbering formats. @@ -44,7 +46,7 @@ package org.apache.tika.parser.iwork; } public static String asAlphaNumericLower(int i) { - return asAlphaNumeric(i).toLowerCase(); + return asAlphaNumeric(i).toLowerCase(Locale.ROOT); } /* @@ -73,7 +75,7 @@ package org.apache.tika.parser.iwork; } public static String asRomanNumeralsLower(int i) { - return asRomanNumerals(i).toLowerCase(); + return asRomanNumerals(i).toLowerCase(Locale.ROOT); } private static int i2r(StringBuffer sbuff, int i, Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java (original) +++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java Sun Aug 31 19:36:36 2014 @@ -26,7 +26,11 @@ import org.apache.james.mime4j.dom.addre import org.apache.james.mime4j.dom.address.AddressList; import org.apache.james.mime4j.dom.address.Mailbox; import org.apache.james.mime4j.dom.address.MailboxList; -import org.apache.james.mime4j.dom.field.*; +import org.apache.james.mime4j.dom.field.AddressListField; +import org.apache.james.mime4j.dom.field.DateTimeField; +import org.apache.james.mime4j.dom.field.MailboxListField; +import org.apache.james.mime4j.dom.field.ParsedField; +import org.apache.james.mime4j.dom.field.UnstructuredField; import org.apache.james.mime4j.field.LenientFieldParser; import org.apache.james.mime4j.parser.ContentHandler; import org.apache.james.mime4j.stream.BodyDescriptor; @@ -141,8 +145,7 @@ class MailContentHandler implements Cont /** * Header for the whole message or its parts * - * @see http - * ://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/ + * @see http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/ * Field.html **/ public void field(Field field) throws MimeException { Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mat/MatParser.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mat/MatParser.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mat/MatParser.java (original) +++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mat/MatParser.java Sun Aug 31 19:36:36 2014 @@ -86,7 +86,7 @@ public class MatParser extends AbstractP } // Get endian indicator from header file - String endianBytes = new String(hdr.getEndianIndicator()); // Retrieve endian bytes and convert to string + String endianBytes = new String(hdr.getEndianIndicator(), "UTF-8"); // Retrieve endian bytes and convert to string String endianCode = String.valueOf(endianBytes.toCharArray()); // Convert bytes to characters to string metadata.set("endian", endianCode); Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java (original) +++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java Sun Aug 31 19:36:36 2014 @@ -167,7 +167,7 @@ public class MboxParser extends Abstract return; // ignore malformed header lines } - String headerTag = headerMatcher.group(1).toLowerCase(); + String headerTag = headerMatcher.group(1).toLowerCase(Locale.ROOT); String headerContent = headerMatcher.group(2); if (headerTag.equalsIgnoreCase("From")) { Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java (original) +++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java Sun Aug 31 19:36:36 2014 @@ -140,7 +140,7 @@ public class OutlookPSTParser extends Ab mailMetadata.set("priority", valueOf(pstMail.getPriority())); mailMetadata.set("flagged", valueOf(pstMail.isFlagged())); - byte[] mailContent = pstMail.getBody().getBytes(); + byte[] mailContent = pstMail.getBody().getBytes("UTF-8"); embeddedExtractor.parseEmbedded(new ByteArrayInputStream(mailContent), handler, mailMetadata, true); } Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (original) +++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Sun Aug 31 19:36:36 2014 @@ -20,7 +20,18 @@ import java.io.IOException; import java.util.HashSet; import org.apache.poi.hslf.HSLFSlideShow; -import org.apache.poi.hslf.model.*; +import org.apache.poi.hslf.model.Comment; +import org.apache.poi.hslf.model.HeadersFooters; +import org.apache.poi.hslf.model.MasterSheet; +import org.apache.poi.hslf.model.Notes; +import org.apache.poi.hslf.model.OLEShape; +import org.apache.poi.hslf.model.Picture; +import org.apache.poi.hslf.model.Shape; +import org.apache.poi.hslf.model.Slide; +import org.apache.poi.hslf.model.Table; +import org.apache.poi.hslf.model.TableCell; +import org.apache.poi.hslf.model.TextRun; +import org.apache.poi.hslf.model.TextShape; import org.apache.poi.hslf.usermodel.ObjectData; import org.apache.poi.hslf.usermodel.PictureData; import org.apache.poi.hslf.usermodel.SlideShow; Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original) +++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Sun Aug 31 19:36:36 2014 @@ -20,6 +20,7 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.text.ParseException; import java.util.Date; +import java.util.Locale; import org.apache.poi.hmef.attribute.MAPIRtfAttribute; import org.apache.poi.hsmf.MAPIMessage; @@ -126,7 +127,7 @@ public class OutlookExtractor extends Ab String[] headers = msg.getHeaders(); if(headers != null && headers.length > 0) { for(String header: headers) { - if(header.toLowerCase().startsWith("date:")) { + if(header.toLowerCase(Locale.ROOT).startsWith("date:")) { String date = header.substring(header.indexOf(':')+1).trim(); // See if we can parse it as a normal mail date Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original) +++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Sun Aug 31 19:36:36 2014 @@ -22,6 +22,7 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Set; @@ -233,7 +234,7 @@ public class WordExtractor extends Abstr CharacterRun cr = p.getCharacterRun(j); // FIELD_BEGIN_MARK: - if (cr.text().getBytes()[0] == 0x13) { + if (cr.text().getBytes("UTF-8")[0] == 0x13) { Field field = document.getFields().getFieldByStartOffset(docPart, cr.getStartOffset()); // 58 is an embedded document // 56 is a document link @@ -548,7 +549,7 @@ public class WordExtractor extends Abstr tag = "h" + Math.min(num, 6); } else { styleClass = styleName.replace(' ', '_'); - styleClass = styleClass.substring(0,1).toLowerCase() + + styleClass = styleClass.substring(0,1).toLowerCase(Locale.ROOT) + styleClass.substring(1); } Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java (original) +++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java Sun Aug 31 19:36:36 2014 @@ -217,11 +217,10 @@ public abstract class AbstractOOXMLExtra private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, String rel) throws IOException, SAXException { // A POIFSFileSystem needs to be at least 3 blocks big to be valid - // TODO: TIKA-1118 Upgrade to POI 4.0 then enable this block of code -// if (part.getSize() >= 0 && part.getSize() < 512*3) { -// // Too small, skip -// return; -// } + if (part.getSize() >= 0 && part.getSize() < 512*3) { + // Too small, skip + return; + } // Open the POIFS (OLE2) structure and process POIFSFileSystem fs = new POIFSFileSystem(part.getInputStream()); Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java (original) +++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java Sun Aug 31 19:36:36 2014 @@ -113,4 +113,30 @@ public class CompositeTagHandler impleme return null; } + public String getAlbumArtist() { + for (ID3Tags tag : tags) { + if (tag.getAlbumArtist() != null) { + return tag.getAlbumArtist(); + } + } + return null; + } + + public String getDisc() { + for (ID3Tags tag : tags) { + if (tag.getDisc() != null) { + return tag.getDisc(); + } + } + return null; + } + + public String getCompilation() { + for (ID3Tags tag : tags) { + if (tag.getCompilation() != null) { + return tag.getCompilation(); + } + } + return null; + } } Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java (original) +++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java Sun Aug 31 19:36:36 2014 @@ -18,7 +18,6 @@ package org.apache.tika.parser.mp3; import java.util.List; - /** * Interface that defines the common interface for ID3 tag parsers, * such as ID3v1 and ID3v2.3. @@ -172,12 +171,22 @@ public interface ID3Tags { String getTitle(); + /** + * The Artist for the track + */ String getArtist(); + /** + * The Artist for the overall album / compilation of albums + */ + String getAlbumArtist(); + String getAlbum(); String getComposer(); + String getCompilation(); + /** * Retrieves the comments, if any. * Files may have more than one comment, but normally only @@ -189,9 +198,17 @@ public interface ID3Tags { String getYear(); + /** + * The number of the track within the album / recording + */ String getTrackNumber(); /** + * The number of the disc this belongs to, within the set + */ + String getDisc(); + + /** * Represents a comments in ID3 (especially ID3 v2), where are * made up of several parts */ Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java (original) +++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java Sun Aug 31 19:36:36 2014 @@ -121,6 +121,30 @@ public class ID3v1Handler implements ID3 } /** + * ID3v1 doesn't have album-wide artists, + * so returns null; + */ + public String getAlbumArtist() { + return null; + } + + /** + * ID3v1 doesn't have disc numbers, + * so returns null; + */ + public String getDisc() { + return null; + } + + /** + * ID3v1 doesn't have compilations, + * so returns null; + */ + public String getCompilation() { + return null; + } + + /** * Returns the identified ISO-8859-1 substring from the given byte buffer. * The return value is the zero-terminated substring retrieved from * between the given start and end positions in the given byte buffer. Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java (original) +++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java Sun Aug 31 19:36:36 2014 @@ -39,6 +39,8 @@ public class ID3v22Handler implements ID private String composer; private String genre; private String trackNumber; + private String albumArtist; + private String disc; private List<ID3Comment> comments = new ArrayList<ID3Comment>(); public ID3v22Handler(ID3v2Frame frame) @@ -50,6 +52,8 @@ public class ID3v22Handler implements ID title = getTagString(tag.data, 0, tag.data.length); } else if (tag.name.equals("TP1")) { artist = getTagString(tag.data, 0, tag.data.length); + } else if (tag.name.equals("TP2")) { + albumArtist = getTagString(tag.data, 0, tag.data.length); } else if (tag.name.equals("TAL")) { album = getTagString(tag.data, 0, tag.data.length); } else if (tag.name.equals("TYE")) { @@ -60,6 +64,8 @@ public class ID3v22Handler implements ID comments.add( getComment(tag.data, 0, tag.data.length) ); } else if (tag.name.equals("TRK")) { trackNumber = getTagString(tag.data, 0, tag.data.length); + } else if (tag.name.equals("TPA")) { + disc = getTagString(tag.data, 0, tag.data.length); } else if (tag.name.equals("TCO")) { genre = extractGenre( getTagString(tag.data, 0, tag.data.length) ); } @@ -129,10 +135,25 @@ public class ID3v22Handler implements ID return trackNumber; } + public String getAlbumArtist() { + return albumArtist; + } + + public String getDisc() { + return disc; + } + + /** + * ID3v22 doesn't have compilations, + * so returns null; + */ + public String getCompilation() { + return null; + } + private class RawV22TagIterator extends RawTagIterator { private RawV22TagIterator(ID3v2Frame frame) { frame.super(3, 3, 1, 0); } } - } Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java (original) +++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java Sun Aug 31 19:36:36 2014 @@ -39,6 +39,9 @@ public class ID3v23Handler implements ID private String composer; private String genre; private String trackNumber; + private String albumArtist; + private String disc; + private String compilation; private List<ID3Comment> comments = new ArrayList<ID3Comment>(); public ID3v23Handler(ID3v2Frame frame) @@ -50,6 +53,8 @@ public class ID3v23Handler implements ID title = getTagString(tag.data, 0, tag.data.length); } else if (tag.name.equals("TPE1")) { artist = getTagString(tag.data, 0, tag.data.length); + } else if (tag.name.equals("TPE2")) { + albumArtist = getTagString(tag.data, 0, tag.data.length); } else if (tag.name.equals("TALB")) { album = getTagString(tag.data, 0, tag.data.length); } else if (tag.name.equals("TYER")) { @@ -60,6 +65,10 @@ public class ID3v23Handler implements ID comments.add( getComment(tag.data, 0, tag.data.length) ); } else if (tag.name.equals("TRCK")) { trackNumber = getTagString(tag.data, 0, tag.data.length); + } else if (tag.name.equals("TPOS")) { + disc = getTagString(tag.data, 0, tag.data.length); + } else if (tag.name.equals("TCMP")) { + compilation = getTagString(tag.data, 0, tag.data.length); } else if (tag.name.equals("TCON")) { genre = ID3v22Handler.extractGenre( getTagString(tag.data, 0, tag.data.length) ); } @@ -109,10 +118,21 @@ public class ID3v23Handler implements ID return trackNumber; } + public String getAlbumArtist() { + return albumArtist; + } + + public String getDisc() { + return disc; + } + + public String getCompilation() { + return compilation; + } + private class RawV23TagIterator extends RawTagIterator { private RawV23TagIterator(ID3v2Frame frame) { frame.super(4, 4, 1, 2); } } - } Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java (original) +++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java Sun Aug 31 19:36:36 2014 @@ -40,6 +40,9 @@ public class ID3v24Handler implements ID private String composer; private String genre; private String trackNumber; + private String albumArtist; + private String disc; + private String compilation; private List<ID3Comment> comments = new ArrayList<ID3Comment>(); public ID3v24Handler(ID3v2Frame frame) @@ -51,6 +54,8 @@ public class ID3v24Handler implements ID title = getTagString(tag.data, 0, tag.data.length); } else if (tag.name.equals("TPE1")) { artist = getTagString(tag.data, 0, tag.data.length); + } else if (tag.name.equals("TPE2")) { + albumArtist = getTagString(tag.data, 0, tag.data.length); } else if (tag.name.equals("TALB")) { album = getTagString(tag.data, 0, tag.data.length); } else if (tag.name.equals("TYER")) { @@ -65,6 +70,10 @@ public class ID3v24Handler implements ID comments.add( getComment(tag.data, 0, tag.data.length) ); } else if (tag.name.equals("TRCK")) { trackNumber = getTagString(tag.data, 0, tag.data.length); + } else if (tag.name.equals("TPOS")) { + disc = getTagString(tag.data, 0, tag.data.length); + } else if (tag.name.equals("TCMP")) { + compilation = getTagString(tag.data, 0, tag.data.length); } else if (tag.name.equals("TCON")) { genre = ID3v22Handler.extractGenre( getTagString(tag.data, 0, tag.data.length) ); } @@ -114,10 +123,21 @@ public class ID3v24Handler implements ID return trackNumber; } + public String getAlbumArtist() { + return albumArtist; + } + + public String getDisc() { + return disc; + } + + public String getCompilation() { + return compilation; + } + private class RawV24TagIterator extends RawTagIterator { private RawV24TagIterator(ID3v2Frame frame) { frame.super(4, 4, 1, 2); } } - } Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java (original) +++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java Sun Aug 31 19:36:36 2014 @@ -82,7 +82,7 @@ public class LyricsHandler { // size including the LYRICSBEGIN but excluding the // length+LYRICS200 at the end. int length = Integer.parseInt( - new String(tagData, lookat-6, 6) + new String(tagData, lookat-6, 6, "UTF-8") ); String lyrics = new String( Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java (original) +++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java Sun Aug 31 19:36:36 2014 @@ -76,8 +76,10 @@ public class Mp3Parser extends AbstractP metadata.set(TikaCoreProperties.TITLE, tag.getTitle()); metadata.set(TikaCoreProperties.CREATOR, tag.getArtist()); metadata.set(XMPDM.ARTIST, tag.getArtist()); + metadata.set(XMPDM.ALBUM_ARTIST, tag.getAlbumArtist()); metadata.set(XMPDM.COMPOSER, tag.getComposer()); metadata.set(XMPDM.ALBUM, tag.getAlbum()); + metadata.set(XMPDM.COMPILATION, tag.getCompilation()); metadata.set(XMPDM.RELEASE_DATE, tag.getYear()); metadata.set(XMPDM.GENRE, tag.getGenre()); metadata.set(XMPDM.DURATION, audioAndTags.duration); @@ -107,12 +109,18 @@ public class Mp3Parser extends AbstractP xhtml.element("p", tag.getArtist()); // ID3v1.1 Track addition + StringBuilder sb = new StringBuilder(); + sb.append(tag.getAlbum()); if (tag.getTrackNumber() != null) { - xhtml.element("p", tag.getAlbum() + ", track " + tag.getTrackNumber()); + sb.append(", track ").append(tag.getTrackNumber()); metadata.set(XMPDM.TRACK_NUMBER, tag.getTrackNumber()); - } else { - xhtml.element("p", tag.getAlbum()); } + if (tag.getDisc() != null) { + sb.append(", disc ").append(tag.getDisc()); + metadata.set(XMPDM.DISC_NUMBER, tag.getDisc()); + } + xhtml.element("p", sb.toString()); + xhtml.element("p", tag.getYear()); xhtml.element("p", tag.getGenre()); xhtml.element("p", String.valueOf(audioAndTags.duration)); Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java (original) +++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java Sun Aug 31 19:36:36 2014 @@ -31,6 +31,7 @@ import org.apache.tika.io.TikaInputStrea import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.metadata.XMP; import org.apache.tika.metadata.XMPDM; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; @@ -55,7 +56,10 @@ import com.coremedia.iso.boxes.apple.App import com.coremedia.iso.boxes.sampleentry.AudioSampleEntry; import com.googlecode.mp4parser.boxes.apple.AppleAlbumBox; import com.googlecode.mp4parser.boxes.apple.AppleArtistBox; +import com.googlecode.mp4parser.boxes.apple.AppleArtist2Box; import com.googlecode.mp4parser.boxes.apple.AppleCommentBox; +import com.googlecode.mp4parser.boxes.apple.AppleCompilationBox; +import com.googlecode.mp4parser.boxes.apple.AppleDiskNumberBox; import com.googlecode.mp4parser.boxes.apple.AppleEncoderBox; import com.googlecode.mp4parser.boxes.apple.AppleGenreBox; import com.googlecode.mp4parser.boxes.apple.AppleNameBox; @@ -217,6 +221,10 @@ public class MP4Parser extends AbstractP addMetadata(TikaCoreProperties.CREATOR, metadata, artist); addMetadata(XMPDM.ARTIST, metadata, artist); + // Album Artist + AppleArtist2Box artist2 = getOrNull(apple, AppleArtist2Box.class); + addMetadata(XMPDM.ALBUM_ARTIST, metadata, artist2); + // Album AppleAlbumBox album = getOrNull(apple, AppleAlbumBox.class); addMetadata(XMPDM.ALBUM, metadata, album); @@ -242,13 +250,27 @@ public class MP4Parser extends AbstractP //metadata.set(XMPDM.NUMBER_OF_TRACKS, trackNum.getB()); // TODO } + // Disc number + AppleDiskNumberBox discNum = getOrNull(apple, AppleDiskNumberBox.class); + if (discNum != null) { + metadata.set(XMPDM.DISC_NUMBER, discNum.getA()); + } + + // Compilation + AppleCompilationBox compilation = getOrNull(apple, AppleCompilationBox.class); + if (compilation != null) { + metadata.set(XMPDM.COMPILATION, (int)compilation.getValue()); + } + // Comment AppleCommentBox comment = getOrNull(apple, AppleCommentBox.class); addMetadata(XMPDM.LOG_COMMENT, metadata, comment); // Encoder AppleEncoderBox encoder = getOrNull(apple, AppleEncoderBox.class); - // addMetadata(XMPDM.???, metadata, encoder); // TODO + if (encoder != null) { + metadata.set(XMP.CREATOR_TOOL, encoder.getValue()); + } // As text Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java (original) +++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java Sun Aug 31 19:36:36 2014 @@ -18,6 +18,7 @@ package org.apache.tika.parser.odf; import java.io.IOException; import java.io.StringReader; +import java.util.Locale; import org.apache.tika.sax.ContentHandlerDecorator; import org.xml.sax.Attributes; @@ -87,7 +88,7 @@ public class NSNormalizerContentHandler @Override public InputSource resolveEntity(String publicId, String systemId) throws IOException, SAXException { - if ((systemId != null && systemId.toLowerCase().endsWith(".dtd")) + if ((systemId != null && systemId.toLowerCase(Locale.ROOT).endsWith(".dtd")) || DTD_PUBLIC_ID.equals(publicId)) { return new InputSource(new StringReader("")); } else { Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original) +++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Sun Aug 31 19:36:36 2014 @@ -25,6 +25,7 @@ import java.util.Calendar; import java.util.HashSet; import java.util.List; import java.util.ListIterator; +import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.TreeMap; @@ -81,9 +82,10 @@ import org.xml.sax.helpers.AttributesImp class PDF2XHTML extends PDFTextStripper { /** - * format used for signature dates + * Format used for signature dates + * TODO Make this thread-safe */ - private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ"); + private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT); /** * Maximum recursive depth during AcroForm processing. Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original) +++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Sun Aug 31 19:36:36 2014 @@ -22,6 +22,7 @@ import java.util.Arrays; import java.util.Calendar; import java.util.Collections; import java.util.List; +import java.util.Locale; import java.util.Set; import org.apache.jempbox.xmp.XMPSchema; @@ -204,7 +205,7 @@ public class PDFParser extends AbstractP // Invalid date format, just ignore } try { - Calendar modified = info.getModificationDate(); + Calendar modified = info.getModificationDate(); addMetadata(metadata, Metadata.LAST_MODIFIED, modified); addMetadata(metadata, TikaCoreProperties.MODIFIED, modified); } catch (IOException e) { @@ -214,7 +215,7 @@ public class PDFParser extends AbstractP // All remaining metadata is custom // Copy this over as-is List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", - "Keywords", "Producer", "Subject", "Title", "Trapped"); + "Keywords", "Producer", "Subject", "Title", "Trapped"); for(COSName key : info.getDictionary().keySet()) { String name = key.getName(); if(! handledMetadata.contains(name)) { @@ -241,7 +242,7 @@ public class PDFParser extends AbstractP metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart())); if (pdfaxmp.getConformance() != null) { metadata.set("pdfaid:conformance", pdfaxmp.getConformance()); - String version = "A-"+pdfaxmp.getPart()+pdfaxmp.getConformance().toLowerCase(); + String version = "A-"+pdfaxmp.getPart()+pdfaxmp.getConformance().toLowerCase(Locale.ROOT); metadata.set("pdfa:PDFVersion", version ); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString()+"; version=\""+version+"\"" ); Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java (original) +++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java Sun Aug 31 19:36:36 2014 @@ -20,6 +20,7 @@ package org.apache.tika.parser.pdf; import java.io.IOException; import java.io.InputStream; import java.io.Serializable; +import java.util.Locale; import java.util.Properties; import org.apache.pdfbox.util.PDFTextStripper; @@ -337,9 +338,9 @@ public class PDFParserConfig implements if (p == null){ return defaultMissing; } - if (p.toLowerCase().equals("true")) { + if (p.toLowerCase(Locale.ROOT).equals("true")) { return true; - } else if (p.toLowerCase().equals("false")) { + } else if (p.toLowerCase(Locale.ROOT).equals("false")) { return false; } else { return defaultMissing; Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java (original) +++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java Sun Aug 31 19:36:36 2014 @@ -67,7 +67,6 @@ public class PackageParser extends Abstr private static final MediaType CPIO = MediaType.application("x-cpio"); private static final MediaType DUMP = MediaType.application("x-tika-unix-dump"); private static final MediaType TAR = MediaType.application("x-tar"); - // Enable this when COMPRESS-267 is fixed, see TIKA-1243 private static final MediaType SEVENZ = MediaType.application("x-7z-compressed"); private static final Set<MediaType> SUPPORTED_TYPES = @@ -127,7 +126,7 @@ public class PackageParser extends Abstr stream.reset(); TikaInputStream tstream = TikaInputStream.get(stream); - // Pending a fix for COMPRESS_269, this bit is a little nasty + // Pending a fix for COMPRESS-269, this bit is a little nasty ais = new SevenZWrapper(new SevenZFile(tstream.getFile())); } else { throw new TikaException("Unknown non-streaming format " + sne.getFormat(), sne); Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java (original) +++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java Sun Aug 31 19:36:36 2014 @@ -22,6 +22,7 @@ import java.io.InputStream; import java.util.Enumeration; import java.util.HashSet; import java.util.Iterator; +import java.util.Locale; import java.util.Set; import java.util.regex.Pattern; @@ -245,11 +246,11 @@ public class ZipContainerDetector implem String docType = coreType.substring(0, coreType.lastIndexOf('.')); // The Macro Enabled formats are a little special - if(docType.toLowerCase().endsWith("macroenabled")) { - docType = docType.toLowerCase() + ".12"; + if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabled")) { + docType = docType.toLowerCase(Locale.ROOT) + ".12"; } - if(docType.toLowerCase().endsWith("macroenabledtemplate")) { + if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabledtemplate")) { docType = MACRO_TEMPLATE_PATTERN.matcher(docType).replaceAll("macroenabled.12"); } Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java (original) +++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java Sun Aug 31 19:36:36 2014 @@ -24,6 +24,7 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; +import java.util.Locale; import java.util.concurrent.atomic.AtomicInteger; import org.apache.poi.poifs.filesystem.DirectoryNode; @@ -102,9 +103,9 @@ class RTFObjDataParser { //readBytes tests for reading too many bytes byte[] embObjBytes = readBytes(is, dataSz); - if (className.toLowerCase().equals("package")){ + if (className.toLowerCase(Locale.ROOT).equals("package")){ return handlePackage(embObjBytes, metadata); - } else if (className.toLowerCase().equals("pbrush")) { + } else if (className.toLowerCase(Locale.ROOT).equals("pbrush")) { //simple bitmap bytes return embObjBytes; } else { Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java (original) +++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java Sun Aug 31 19:36:36 2014 @@ -29,7 +29,9 @@ import java.nio.charset.CodingErrorActio import java.util.Calendar; import java.util.HashMap; import java.util.LinkedList; +import java.util.Locale; import java.util.Map; +import java.util.TimeZone; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; @@ -1339,7 +1341,7 @@ final class TextExtractor { if (inHeader) { if (nextMetaData != null) { if (nextMetaData == TikaCoreProperties.CREATED) { - Calendar cal = Calendar.getInstance(); + Calendar cal = Calendar.getInstance(TimeZone.getDefault(), Locale.ROOT); cal.set(year, month-1, day, hour, minute, 0); metadata.set(nextMetaData, cal.getTime()); } else if (nextMetaData.isMultiValuePermitted()) { Modified: tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/video/FLVParser.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/video/FLVParser.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/video/FLVParser.java (original) +++ tika/branches/1.6/tika-parsers/src/main/java/org/apache/tika/parser/video/FLVParser.java Sun Aug 31 19:36:36 2014 @@ -130,7 +130,7 @@ public class FLVParser extends AbstractP int size = input.readUnsignedShort(); byte[] chars = new byte[size]; input.readFully(chars); - return new String(chars); + return new String(chars, "UTF-8"); } private Object readAMFObject(DataInputStream input) throws IOException { Modified: tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/TestParsers.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/TestParsers.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/TestParsers.java (original) +++ tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/TestParsers.java Sun Aug 31 19:36:36 2014 @@ -108,7 +108,8 @@ public class TestParsers extends TikaTes @Test public void testComment() throws Exception { - final String[] extensions = new String[] {"ppt", "pptx", "doc", "docx", "pdf", "rtf"}; + final String[] extensions = new String[] {"ppt", "pptx", "doc", + "docx", "xls", "xlsx", "pdf", "rtf"}; for(String extension : extensions) { verifyComment(extension, "testComment"); } Modified: tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/TikaTest.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/TikaTest.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/TikaTest.java (original) +++ tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/TikaTest.java Sun Aug 31 19:36:36 2014 @@ -16,6 +16,7 @@ */ package org.apache.tika; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @@ -88,6 +89,10 @@ public abstract class TikaTest { assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle)); } + public static void assertNotContained(String needle, String haystack) { + assertFalse(needle + " unexpectedly found in:\n" + haystack, haystack.contains(needle)); + } + protected static class XMLResult { public final String xml; public final Metadata metadata; Modified: tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java (original) +++ tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java Sun Aug 31 19:36:36 2014 @@ -34,6 +34,7 @@ import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.Date; import java.util.HashMap; +import java.util.Locale; import java.util.Map; import org.apache.tika.embedder.Embedder; @@ -58,7 +59,7 @@ import org.xml.sax.SAXException; public class ExternalEmbedderTest { protected static final DateFormat EXPECTED_METADATA_DATE_FORMATTER = - new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss"); + new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ROOT); protected static final String DEFAULT_CHARSET = "UTF-8"; private static final String COMMAND_METADATA_ARGUMENT_DESCRIPTION = "dc:description"; private static final String TEST_TXT_PATH = "/test-documents/testTXT.txt"; Modified: tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original) +++ tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java Sun Aug 31 19:36:36 2014 @@ -728,6 +728,12 @@ public class TestMimeTypes { } @Test + public void testAxCrypt() throws Exception { + // test-TXT.txt encrypted with a key of "tika" + assertTypeDetection("testTXT-tika.axx", "application/x-axcrypt"); + } + + @Test public void testMatroskaDetection() throws Exception { assertType("video/x-matroska", "testMKV.mkv"); // TODO: Need custom detector data detection, see TIKA-1180 Modified: tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java (original) +++ tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java Sun Aug 31 19:36:36 2014 @@ -389,7 +389,7 @@ public class AutoDetectParserTest { public void testSpecificParserList() throws Exception { AutoDetectParser parser = new AutoDetectParser(new MyDetector(), new MyParser()); - InputStream is = new ByteArrayInputStream("test".getBytes()); + InputStream is = new ByteArrayInputStream("test".getBytes("UTF-8")); Metadata metadata = new Metadata(); parser.parse(is, new BodyContentHandler(), metadata, new ParseContext()); Modified: tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmBlockInfo.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmBlockInfo.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmBlockInfo.java (original) +++ tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmBlockInfo.java Sun Aug 31 19:36:36 2014 @@ -69,7 +69,7 @@ public class TestChmBlockInfo { int indexOfControlData = chmDirListCont.getControlDataIndex(); int indexOfResetTable = ChmCommons.indexOfResetTableBlock(data, - ChmConstants.LZXC.getBytes()); + ChmConstants.LZXC.getBytes("UTF-8")); byte[] dir_chunk = null; if (indexOfResetTable > 0) { // dir_chunk = Arrays.copyOfRange( data, indexOfResetTable, Modified: tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmItspHeader.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmItspHeader.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmItspHeader.java (original) +++ tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmItspHeader.java Sun Aug 31 19:36:36 2014 @@ -27,6 +27,8 @@ import org.junit.After; import org.junit.Before; import org.junit.Test; +import java.io.UnsupportedEncodingException; + /** * Tests all public methods of the ChmItspHeader * @@ -134,9 +136,9 @@ public class TestChmItspHeader { } @Test - public void testGetSignature() { + public void testGetSignature() throws UnsupportedEncodingException { assertEquals(TestParameters.VP_ISTP_SIGNATURE, new String( - chmItspHeader.getSignature())); + chmItspHeader.getSignature(), "UTF-8")); } @Test Modified: tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmLzxState.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmLzxState.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmLzxState.java (original) +++ tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmLzxState.java Sun Aug 31 19:36:36 2014 @@ -64,7 +64,7 @@ public class TestChmLzxState { ChmConstants.CONTROL_DATA); int indexOfResetTable = ChmCommons.indexOfResetTableBlock(data, - ChmConstants.LZXC.getBytes()); + ChmConstants.LZXC.getBytes("UTF-8")); byte[] dir_chunk = null; if (indexOfResetTable > 0) { // dir_chunk = Arrays.copyOfRange( data, indexOfResetTable, Modified: tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmLzxcControlData.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmLzxcControlData.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmLzxcControlData.java (original) +++ tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmLzxcControlData.java Sun Aug 31 19:36:36 2014 @@ -29,6 +29,8 @@ import org.apache.tika.parser.chm.core.C import org.junit.Before; import org.junit.Test; +import java.io.UnsupportedEncodingException; + /** * Tests all public methods of ChmLzxcControlData block */ @@ -60,7 +62,7 @@ public class TestChmLzxcControlData { int indexOfControlData = chmDirListCont.getControlDataIndex(); int indexOfResetTable = ChmCommons.indexOfResetTableBlock(data, - ChmConstants.LZXC.getBytes()); + ChmConstants.LZXC.getBytes("UTF-8")); byte[] dir_chunk = null; if (indexOfResetTable > 0) { // dir_chunk = Arrays.copyOfRange( data, indexOfResetTable, @@ -127,16 +129,16 @@ public class TestChmLzxcControlData { } @Test - public void testGetSignature() { + public void testGetSignature() throws UnsupportedEncodingException { assertEquals( - TestParameters.VP_CONTROL_DATA_SIGNATURE.getBytes().length, + TestParameters.VP_CONTROL_DATA_SIGNATURE.getBytes("UTF-8").length, chmLzxcControlData.getSignature().length); } @Test - public void testGetSignaure() { + public void testGetSignaure() throws UnsupportedEncodingException { assertEquals( - TestParameters.VP_CONTROL_DATA_SIGNATURE.getBytes().length, + TestParameters.VP_CONTROL_DATA_SIGNATURE.getBytes("UTF-8").length, chmLzxcControlData.getSignature().length); } Modified: tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmLzxcResetTable.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmLzxcResetTable.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmLzxcResetTable.java (original) +++ tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmLzxcResetTable.java Sun Aug 31 19:36:36 2014 @@ -59,7 +59,7 @@ public class TestChmLzxcResetTable { int indexOfControlData = chmDirListCont.getControlDataIndex(); int indexOfResetTable = ChmCommons.indexOfResetTableBlock(data, - ChmConstants.LZXC.getBytes()); + ChmConstants.LZXC.getBytes("UTF-8")); byte[] dir_chunk = null; if (indexOfResetTable > 0) { // dir_chunk = Arrays.copyOfRange( data, indexOfResetTable, Modified: tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestPmglHeader.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestPmglHeader.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestPmglHeader.java (original) +++ tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestPmglHeader.java Sun Aug 31 19:36:36 2014 @@ -25,6 +25,8 @@ import org.apache.tika.parser.chm.core.C import org.junit.Before; import org.junit.Test; +import java.io.UnsupportedEncodingException; + public class TestPmglHeader { ChmPmglHeader chmPmglHeader = null; @@ -44,9 +46,9 @@ public class TestPmglHeader { } @Test - public void testChmPmglHeaderGet() { + public void testChmPmglHeaderGet() throws UnsupportedEncodingException { assertEquals(TestParameters.VP_PMGL_SIGNATURE, new String( - chmPmglHeader.getSignature())); + chmPmglHeader.getSignature(), "UTF-8")); } @Test Modified: tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java (original) +++ tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java Sun Aug 31 19:36:36 2014 @@ -62,7 +62,7 @@ public class SourceCodeParserTest extend assertTrue(textContent.length() > 0); assertTrue(textContent.indexOf("html") < 0); - textContent = getText(new ByteArrayInputStream("public class HelloWorld {}".getBytes()), sourceCodeParser, createMetadata("text/x-java-source")); + textContent = getText(new ByteArrayInputStream("public class HelloWorld {}".getBytes("UTF-8")), sourceCodeParser, createMetadata("text/x-java-source")); assertTrue(textContent.length() > 0); assertTrue(textContent.indexOf("html") < 0); } Modified: tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/font/FontParsersTest.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/font/FontParsersTest.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/font/FontParsersTest.java (original) +++ tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/font/FontParsersTest.java Sun Aug 31 19:36:36 2014 @@ -31,7 +31,13 @@ import org.xml.sax.ContentHandler; import org.apache.tika.io.TikaInputStream; import org.junit.Test; -import static org.apache.tika.parser.font.AdobeFontMetricParser.*; +import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_FONT_NAME; +import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_FONT_FULL_NAME; +import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_FONT_FAMILY_NAME; +import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_FONT_WEIGHT; +import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_FONT_VERSION; +import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_FONT_SUB_FAMILY_NAME; +import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_PS_NAME; /** * Test case for parsing various different font files. Modified: tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java (original) +++ tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java Sun Aug 31 19:36:36 2014 @@ -16,10 +16,6 @@ */ package org.apache.tika.parser.image; -import java.util.Arrays; -import java.util.GregorianCalendar; -import java.util.Iterator; -import java.util.List; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; @@ -32,11 +28,20 @@ import com.drew.metadata.exif.ExifIFD0Di import com.drew.metadata.exif.ExifSubIFDDirectory; import com.drew.metadata.jpeg.JpegCommentDirectory; +import java.util.Arrays; +import java.util.GregorianCalendar; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import java.util.TimeZone; + import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; -import static org.mockito.Mockito.*; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; public class ImageMetadataExtractorTest { @@ -57,7 +62,7 @@ public class ImageMetadataExtractorTest verify(handler1).supports(JpegCommentDirectory.class); verify(handler1).handle(directory, metadata); } - + @Test public void testExifHandlerSupports() { assertTrue(new ImageMetadataExtractor.ExifHandler().supports(ExifIFD0Directory.class)); @@ -70,8 +75,11 @@ public class ImageMetadataExtractorTest public void testExifHandlerParseDate() throws MetadataException { ExifSubIFDDirectory exif = mock(ExifSubIFDDirectory.class); when(exif.containsTag(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(true); + GregorianCalendar calendar = new GregorianCalendar(TimeZone.getDefault(), Locale.ROOT); + calendar.setTimeInMillis(0); + calendar.set(2000, 0, 1, 0, 0, 0); when(exif.getDate(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn( - new GregorianCalendar(2000, 0, 1, 0, 0, 0).getTime()); // jvm default timezone as in Metadata Extractor + calendar.getTime()); // jvm default timezone as in Metadata Extractor Metadata metadata = new Metadata(); new ImageMetadataExtractor.ExifHandler().handle(exif, metadata); @@ -83,8 +91,11 @@ public class ImageMetadataExtractorTest public void testExifHandlerParseDateFallback() throws MetadataException { ExifIFD0Directory exif = mock(ExifIFD0Directory.class); when(exif.containsTag(ExifIFD0Directory.TAG_DATETIME)).thenReturn(true); + GregorianCalendar calendar = new GregorianCalendar(TimeZone.getDefault(), Locale.ROOT); + calendar.setTimeInMillis(0); + calendar.set(1999, 0, 1, 0, 0, 0); when(exif.getDate(ExifIFD0Directory.TAG_DATETIME)).thenReturn( - new GregorianCalendar(1999, 0, 1, 0, 0, 0).getTime()); // jvm default timezone as in Metadata Extractor + calendar.getTime()); // jvm default timezone as in Metadata Extractor Metadata metadata = new Metadata(); new ImageMetadataExtractor.ExifHandler().handle(exif, metadata); Modified: tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java (original) +++ tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java Sun Aug 31 19:36:36 2014 @@ -17,8 +17,9 @@ package org.apache.tika.parser.microsoft; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; +import static org.apache.tika.TikaTest.assertContains; +import static org.apache.tika.TikaTest.assertNotContained; import java.io.InputStream; import java.util.Locale; @@ -65,13 +66,13 @@ public class ExcelParserTest { assertEquals("2007-10-01T16:31:43Z", metadata.get(Metadata.DATE)); String content = handler.toString(); - assertTrue(content.contains("Sample Excel Worksheet")); - assertTrue(content.contains("Numbers and their Squares")); - assertTrue(content.contains("\t\tNumber\tSquare")); - assertTrue(content.contains("9")); - assertFalse(content.contains("9.0")); - assertTrue(content.contains("196")); - assertFalse(content.contains("196.0")); + assertContains("Sample Excel Worksheet", content); + assertContains("Numbers and their Squares", content); + assertContains("\t\tNumber\tSquare", content); + assertContains("9", content); + assertNotContained("9.0", content); + assertContains("196", content); + assertNotContained("196.0", content); } finally { input.close(); } @@ -95,12 +96,12 @@ public class ExcelParserTest { String content = handler.toString(); // Number #,##0.00 - assertTrue(content.contains("1,599.99")); - assertTrue(content.contains("-1,599.99")); + assertContains("1,599.99", content); + assertContains("-1,599.99", content); // Currency $#,##0.00;[Red]($#,##0.00) - assertTrue(content.contains("$1,599.99")); - assertTrue(content.contains("($1,599.99)")); + assertContains("$1,599.99", content); + assertContains("($1,599.99)", content); // Scientific 0.00E+00 // poi <=3.8beta1 returns 1.98E08, newer versions return 1.98+E08 @@ -108,26 +109,29 @@ public class ExcelParserTest { assertTrue(content.contains("-1.98E08") || content.contains("-1.98E+08")); // Percentage. - assertTrue(content.contains("2.50%")); + assertContains("2.50%", content); // Excel rounds up to 3%, but that requires Java 1.6 or later if(System.getProperty("java.version").startsWith("1.5")) { - assertTrue(content.contains("2%")); + assertContains("2%", content); } else { - assertTrue(content.contains("3%")); + assertContains("3%", content); } // Time Format: h:mm - assertTrue(content.contains("6:15")); - assertTrue(content.contains("18:15")); + assertContains("6:15", content); + assertContains("18:15", content); // Date Format: d-mmm-yy - assertTrue(content.contains("17-May-07")); + assertContains("17-May-07", content); // Date Format: m/d/yy - assertTrue(content.contains("10/3/09")); + assertContains("10/3/09", content); // Date/Time Format: m/d/yy h:mm - assertTrue(content.contains("1/19/08 4:35")); + assertContains("1/19/08 4:35", content); + + // Fraction (2.5): # ?/? + assertContains("2 1/2", content); // Below assertions represent outstanding formatting issues to be addressed @@ -136,13 +140,10 @@ public class ExcelParserTest { /************************************************************************* // Custom Number (0 "dollars and" .00 "cents") - assertTrue(content.contains("19 dollars and .99 cents")); + assertContains("19 dollars and .99 cents", content); // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy) - assertTrue(content.contains("At 4:20 AM on Thursday May 17, 2007")); - - // Fraction (2.5): # ?/? (TODO Coming in POI 3.8 beta 6) - assertTrue(content.contains("2 1 / 2")); + assertContains("At 4:20 AM on Thursday May 17, 2007", content); **************************************************************************/ } finally { @@ -171,21 +172,21 @@ public class ExcelParserTest { String content = handler.toString(); // The first sheet has a pie chart - assertTrue(content.contains("charttabyodawg")); - assertTrue(content.contains("WhamPuff")); + assertContains("charttabyodawg", content); + assertContains("WhamPuff", content); // The second sheet has a bar chart and some text - assertTrue(content.contains("Sheet1")); - assertTrue(content.contains("Test Excel Spreasheet")); - assertTrue(content.contains("foo")); - assertTrue(content.contains("bar")); - assertTrue(content.contains("fizzlepuff")); - assertTrue(content.contains("whyaxis")); - assertTrue(content.contains("eksaxis")); + assertContains("Sheet1", content); + assertContains("Test Excel Spreasheet", content); + assertContains("foo", content); + assertContains("bar", content); + assertContains("fizzlepuff", content); + assertContains("whyaxis", content); + assertContains("eksaxis", content); // The third sheet has some text - assertTrue(content.contains("Sheet2")); - assertTrue(content.contains("dingdong")); + assertContains("Sheet2", content); + assertContains("dingdong", content); } finally { input.close(); } @@ -206,7 +207,7 @@ public class ExcelParserTest { "application/vnd.ms-excel", metadata.get(Metadata.CONTENT_TYPE)); String content = handler.toString(); - assertTrue(content.contains("Number Formats")); + assertContains("Number Formats", content); } finally { input.close(); } @@ -224,7 +225,7 @@ public class ExcelParserTest { new OfficeParser().parse(input, handler, metadata, context); String content = handler.toString(); - assertTrue(content.contains("Microsoft Works")); + assertContains("Microsoft Works", content); } finally { input.close(); } Modified: tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java (original) +++ tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java Sun Aug 31 19:36:36 2014 @@ -249,6 +249,6 @@ public class OutlookParserTest { // Make sure we don't have nested html docs assertEquals(2, content.split("<body>").length); - //assertEquals(2, content.split("<\\/body>").length); // TODO Fix + assertEquals(2, content.split("<\\/body>").length); } } Modified: tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=1621623&r1=1621622&r2=1621623&view=diff ============================================================================== --- tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java (original) +++ tika/branches/1.6/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java Sun Aug 31 19:36:36 2014 @@ -92,20 +92,20 @@ public class PowerPointParserTest extend for(int row=1;row<=3;row++) { //assertContains("·\tBullet " + row, content); //assertContains("\u00b7\tBullet " + row, content); + // TODO OfficeParser fails to extract the bullet symbol assertContains("Bullet " + row, content); } assertContains("Here is a numbered list:", content); for(int row=1;row<=3;row++) { //assertContains(row + ")\tNumber bullet " + row, content); //assertContains(row + ") Number bullet " + row, content); - // TODO: OOXMLExtractor fails to number the bullets: + // TODO: OfficeParser fails to number the bullets: assertContains("Number bullet " + row, content); } for(int row=1;row<=2;row++) { for(int col=1;col<=3;col++) { - // TODO Work out why the upgrade to POI 3.9 broke this test (table text) -// assertContains("Row " + row + " Col " + col, content); + assertContains("Row " + row + " Col " + col, content); } } @@ -153,7 +153,10 @@ public class PowerPointParserTest extend assertEquals(-1, content.indexOf("*")); } - // TODO: once we fix TIKA-712, re-enable this + /** + * TIKA-712 Master Slide Text from PPT and PPTX files + * should be extracted too + */ @Test public void testMasterText() throws Exception { ContentHandler handler = new BodyContentHandler(); @@ -177,7 +180,6 @@ public class PowerPointParserTest extend assertEquals(-1, content.indexOf("*")); } - // TODO: once we fix TIKA-712, re-enable this @Test public void testMasterText2() throws Exception { ContentHandler handler = new BodyContentHandler();
