Author: nick
Date: Wed Aug 13 16:52:39 2014
New Revision: 1617758
URL: http://svn.apache.org/r1617758
Log:
For places that format numbers in fixed formats, or compare ASCII strings
case-insensitively, use Locale.ROOT rather than Locale.getDefault() to ensure
predictable behaviour and avoid issues in locales like Turkish. TIKA-1387
Modified:
tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
tika/trunk/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
tika/trunk/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java
tika/trunk/tika-core/src/test/java/org/apache/tika/TypeDetectionBenchmark.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
(original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Wed Aug
13 16:52:39 2014
@@ -658,7 +658,7 @@ public class TikaCLI {
if (encoding != null) {
return new OutputStreamWriter(output, encoding);
} else if (System.getProperty("os.name")
- .toLowerCase(Locale.getDefault()).startsWith("mac os x")) {
+ .toLowerCase(Locale.ROOT).startsWith("mac os x")) {
// TIKA-324: Override the default encoding on Mac OS X
return new OutputStreamWriter(output, "UTF-8");
} else {
@@ -761,7 +761,7 @@ public class TikaCLI {
// being a CLI program messages should go to the stderr too
//
String msg = String.format(
- Locale.getDefault(),
+ Locale.ROOT,
"Ignoring unexpected exception trying to save embedded
file %s (%s)",
name,
e.getMessage()
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
---
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
(original)
+++
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
Wed Aug 13 16:52:39 2014
@@ -97,7 +97,7 @@ public class MagicDetector implements De
|| type.equals("unicodeBE")) {
decoded = decodeString(value, type);
} else if (type.equals("stringignorecase")) {
- decoded = decodeString(value.toLowerCase(Locale.getDefault()),
type);
+ decoded = decodeString(value.toLowerCase(Locale.ROOT), type);
} else if (type.equals("byte")) {
try {
decoded = tmpVal.getBytes("UTF-8");
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
Wed Aug 13 16:52:39 2014
@@ -66,7 +66,7 @@ public class FilenameUtils {
for (char c: name.toCharArray()) {
if (RESERVED.contains(c)) {
- sb.append('%').append((c<16) ? "0" :
"").append(Integer.toHexString(c).toUpperCase(Locale.getDefault()));
+ sb.append('%').append((c<16) ? "0" :
"").append(Integer.toHexString(c).toUpperCase(Locale.ROOT));
} else {
sb.append(c);
}
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java Wed
Aug 13 16:52:39 2014
@@ -89,7 +89,7 @@ public class DateUtils {
}
private static String doFormatDate(Calendar calendar) {
return String.format(
- Locale.getDefault(),
+ Locale.ROOT,
"%04d-%02d-%02dT%02d:%02d:%02dZ",
calendar.get(Calendar.YEAR),
calendar.get(Calendar.MONTH) + 1,
Modified:
tika/trunk/tika-core/src/test/java/org/apache/tika/TypeDetectionBenchmark.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/TypeDetectionBenchmark.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
---
tika/trunk/tika-core/src/test/java/org/apache/tika/TypeDetectionBenchmark.java
(original)
+++
tika/trunk/tika-core/src/test/java/org/apache/tika/TypeDetectionBenchmark.java
Wed Aug 13 16:52:39 2014
@@ -57,7 +57,7 @@ public class TypeDetectionBenchmark {
tika.detect(new ByteArrayInputStream(content));
}
System.out.printf(
- Locale.getDefault(),
+ Locale.ROOT,
"%6dns per Tika.detect(%s) = %s%n",
System.currentTimeMillis() - start, file, type);
} finally {
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
Wed Aug 13 16:52:39 2014
@@ -84,7 +84,7 @@ public class BoilerpipeContentHandler ex
@Override
public String toString() {
- return String.format(Locale.getDefault(), "<%s> of type %s",
localName, elementType);
+ return String.format(Locale.ROOT, "<%s> of type %s", localName,
elementType);
}
public String getUri() {
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
Wed Aug 13 16:52:39 2014
@@ -245,7 +245,7 @@ public class ImageMetadataExtractor {
}
static class ExifHandler implements DirectoryHandler {
- private static final SimpleDateFormat DATE_UNSPECIFIED_TZ = new
SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.getDefault());
+ private static final SimpleDateFormat DATE_UNSPECIFIED_TZ = new
SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ROOT);
public boolean supports(Class<? extends Directory> directoryType) {
return directoryType == ExifIFD0Directory.class ||
directoryType == ExifSubIFDDirectory.class;
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java
Wed Aug 13 16:52:39 2014
@@ -161,7 +161,7 @@ public class IptcAnpaParser implements P
}
int msgsize = is.read(buf); // read in at least the
full data
- String message = (new String(buf,
"UTF-8")).toLowerCase(Locale.getDefault());
+ String message = (new String(buf, "UTF-8")).toLowerCase(Locale.ROOT);
// these are not if-then-else, because we want to go from most common
// and fall through to least. this is imperfect, as these tags could
// show up in other agency stories, but i can't find a spec or any
@@ -591,7 +591,7 @@ public class IptcAnpaParser implements P
--read;
}
}
- if (tmp_line.toLowerCase(Locale.getDefault()).startsWith("by")
|| longline.equals("bdy_author")) {
+ if (tmp_line.toLowerCase(Locale.ROOT).startsWith("by") ||
longline.equals("bdy_author")) {
longkey = "bdy_author";
// prepend a space to subsequent line, so it gets parsed
consistent with the lead line
@@ -609,7 +609,7 @@ public class IptcAnpaParser implements P
}
else if (FORMAT == this.FMT_IPTC_BLM) {
String byline = " by ";
- if
(tmp_line.toLowerCase(Locale.getDefault()).contains(byline)) {
+ if (tmp_line.toLowerCase(Locale.ROOT).contains(byline)) {
longkey = "bdy_author";
int term = tmp_line.length();
@@ -618,11 +618,11 @@ public class IptcAnpaParser implements P
term = Math.min(term, (tmp_line.contains("\n") ?
tmp_line.indexOf("\n") : term));
term = (term > 0 ) ? term : tmp_line.length();
// for bloomberg, the author line sits below their
copyright statement
- bdy_author +=
tmp_line.substring(tmp_line.toLowerCase(Locale.getDefault()).indexOf(byline) +
byline.length(), term) + " ";
+ bdy_author +=
tmp_line.substring(tmp_line.toLowerCase(Locale.ROOT).indexOf(byline) +
byline.length(), term) + " ";
metastarted = true;
longline = ((tmp_line.contains("=")) &&
(!longline.equals(longkey)) ? longkey : "");
}
- else
if(tmp_line.toLowerCase(Locale.getDefault()).startsWith("c.")) {
+ else if(tmp_line.toLowerCase(Locale.ROOT).startsWith("c.")) {
// the author line for bloomberg is a multiline starting
with c.2011 Bloomberg News
// then containing the author info on the next line
if (val_next == TB) {
@@ -630,7 +630,7 @@ public class IptcAnpaParser implements P
continue;
}
}
- else
if(tmp_line.toLowerCase(Locale.getDefault()).trim().startsWith("(") &&
tmp_line.toLowerCase(Locale.getDefault()).trim().endsWith(")")) {
+ else
if(tmp_line.toLowerCase(Locale.ROOT).trim().startsWith("(") &&
tmp_line.toLowerCase(Locale.ROOT).trim().endsWith(")")) {
// the author line may have one or more comment lines
between the copyright
// statement, and the By AUTHORNAME line
if (val_next == TB) {
@@ -640,7 +640,7 @@ public class IptcAnpaParser implements P
}
}
- else if
(tmp_line.toLowerCase(Locale.getDefault()).startsWith("eds") ||
longline.equals("bdy_source")) {
+ else if (tmp_line.toLowerCase(Locale.ROOT).startsWith("eds") ||
longline.equals("bdy_source")) {
longkey = "bdy_source";
// prepend a space to subsequent line, so it gets parsed
consistent with the lead line
tmp_line = (longline.equals(longkey) ? " " : "") + tmp_line;
@@ -737,14 +737,14 @@ public class IptcAnpaParser implements P
// standard reuters format
format_in = "HH:mm MM-dd-yy";
}
- SimpleDateFormat dfi = new SimpleDateFormat(format_in,
Locale.getDefault());
+ SimpleDateFormat dfi = new SimpleDateFormat(format_in,
Locale.ROOT);
dfi.setTimeZone(TimeZone.getTimeZone("UTC"));
dateunix = dfi.parse(ftr_datetime);
}
catch (ParseException ep) {
// failed, but this will just fall through to setting the date
to now
}
- SimpleDateFormat dfo = new SimpleDateFormat(format_out,
Locale.getDefault());
+ SimpleDateFormat dfo = new SimpleDateFormat(format_out,
Locale.ROOT);
dfo.setTimeZone(TimeZone.getTimeZone("UTC"));
ftr_datetime = dfo.format(dateunix);
}
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
Wed Aug 13 16:52:39 2014
@@ -167,7 +167,7 @@ public class MboxParser extends Abstract
return; // ignore malformed header lines
}
- String headerTag = headerMatcher.group(1).toLowerCase(Locale.getDefault());
+ String headerTag = headerMatcher.group(1).toLowerCase(Locale.ROOT);
String headerContent = headerMatcher.group(2);
if (headerTag.equalsIgnoreCase("From")) {
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
Wed Aug 13 16:52:39 2014
@@ -127,7 +127,7 @@ public class OutlookExtractor extends Ab
String[] headers = msg.getHeaders();
if(headers != null && headers.length > 0) {
for(String header: headers) {
-
if(header.toLowerCase(Locale.getDefault()).startsWith("date:")) {
+
if(header.toLowerCase(Locale.ROOT).startsWith("date:")) {
String date =
header.substring(header.indexOf(':')+1).trim();
// See if we can parse it as a normal mail date
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
Wed Aug 13 16:52:39 2014
@@ -549,7 +549,7 @@ public class WordExtractor extends Abstr
tag = "h" + Math.min(num, 6);
} else {
styleClass = styleName.replace(' ', '_');
- styleClass =
styleClass.substring(0,1).toLowerCase(Locale.getDefault()) +
+ styleClass = styleClass.substring(0,1).toLowerCase(Locale.ROOT) +
styleClass.substring(1);
}
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java
Wed Aug 13 16:52:39 2014
@@ -88,7 +88,7 @@ public class NSNormalizerContentHandler
@Override
public InputSource resolveEntity(String publicId, String systemId)
throws IOException, SAXException {
- if ((systemId != null &&
systemId.toLowerCase(Locale.getDefault()).endsWith(".dtd"))
+ if ((systemId != null &&
systemId.toLowerCase(Locale.ROOT).endsWith(".dtd"))
|| DTD_PUBLIC_ID.equals(publicId)) {
return new InputSource(new StringReader(""));
} else {
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
Wed Aug 13 16:52:39 2014
@@ -246,11 +246,11 @@ public class ZipContainerDetector implem
String docType = coreType.substring(0, coreType.lastIndexOf('.'));
// The Macro Enabled formats are a little special
- if(docType.toLowerCase(Locale.getDefault()).endsWith("macroenabled")) {
- docType = docType.toLowerCase(Locale.getDefault()) + ".12";
+ if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabled")) {
+ docType = docType.toLowerCase(Locale.ROOT) + ".12";
}
-
if(docType.toLowerCase(Locale.getDefault()).endsWith("macroenabledtemplate")) {
+ if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabledtemplate")) {
docType =
MACRO_TEMPLATE_PATTERN.matcher(docType).replaceAll("macroenabled.12");
}
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
Wed Aug 13 16:52:39 2014
@@ -103,9 +103,9 @@ class RTFObjDataParser {
//readBytes tests for reading too many bytes
byte[] embObjBytes = readBytes(is, dataSz);
- if (className.toLowerCase(Locale.getDefault()).equals("package")){
+ if (className.toLowerCase(Locale.ROOT).equals("package")){
return handlePackage(embObjBytes, metadata);
- } else if
(className.toLowerCase(Locale.getDefault()).equals("pbrush")) {
+ } else if (className.toLowerCase(Locale.ROOT).equals("pbrush")) {
//simple bitmap bytes
return embObjBytes;
} else {
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
Wed Aug 13 16:52:39 2014
@@ -1341,7 +1341,7 @@ final class TextExtractor {
if (inHeader) {
if (nextMetaData != null) {
if (nextMetaData == TikaCoreProperties.CREATED) {
- Calendar cal = Calendar.getInstance(TimeZone.getDefault(),
Locale.getDefault());
+ Calendar cal = Calendar.getInstance(TimeZone.getDefault(),
Locale.ROOT);
cal.set(year, month-1, day, hour, minute, 0);
metadata.set(nextMetaData, cal.getTime());
} else if (nextMetaData.isMultiValuePermitted()) {