Author: nick
Date: Wed Aug 13 16:52:39 2014
New Revision: 1617758

URL: http://svn.apache.org/r1617758
Log:
Where numbers are formatted in fixed formats, or ASCII strings are compared
case-insensitively, use Locale.ROOT rather than Locale.getDefault() to ensure
predictable behaviour and to avoid issues in locales such as Turkish. TIKA-1387

Modified:
    tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java
    tika/trunk/tika-core/src/test/java/org/apache/tika/TypeDetectionBenchmark.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java

Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java 
(original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Wed Aug 
13 16:52:39 2014
@@ -658,7 +658,7 @@ public class TikaCLI {
         if (encoding != null) {
             return new OutputStreamWriter(output, encoding);
         } else if (System.getProperty("os.name")
-                .toLowerCase(Locale.getDefault()).startsWith("mac os x")) {
+                .toLowerCase(Locale.ROOT).startsWith("mac os x")) {
             // TIKA-324: Override the default encoding on Mac OS X
             return new OutputStreamWriter(output, "UTF-8");
         } else {
@@ -761,7 +761,7 @@ public class TikaCLI {
                 // being a CLI program messages should go to the stderr too
                 //
                 String msg = String.format(
-                    Locale.getDefault(),
+                    Locale.ROOT,
                     "Ignoring unexpected exception trying to save embedded 
file %s (%s)",
                     name,
                     e.getMessage()

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
--- 
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java 
(original)
+++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java 
Wed Aug 13 16:52:39 2014
@@ -97,7 +97,7 @@ public class MagicDetector implements De
                 || type.equals("unicodeBE")) {
             decoded = decodeString(value, type);
         } else if (type.equals("stringignorecase")) {
-            decoded = decodeString(value.toLowerCase(Locale.getDefault()), 
type);
+            decoded = decodeString(value.toLowerCase(Locale.ROOT), type);
         } else if (type.equals("byte")) {
             try {
                 decoded = tmpVal.getBytes("UTF-8");

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java 
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java 
Wed Aug 13 16:52:39 2014
@@ -66,7 +66,7 @@ public class FilenameUtils {
 
         for (char c: name.toCharArray()) {
             if (RESERVED.contains(c)) {
-                sb.append('%').append((c<16) ? "0" : 
"").append(Integer.toHexString(c).toUpperCase(Locale.getDefault()));
+                sb.append('%').append((c<16) ? "0" : 
"").append(Integer.toHexString(c).toUpperCase(Locale.ROOT));
             } else {
                 sb.append(c);
             }

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java 
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java Wed 
Aug 13 16:52:39 2014
@@ -89,7 +89,7 @@ public class DateUtils {
     }
     private static String doFormatDate(Calendar calendar) {
         return String.format(
-                Locale.getDefault(),
+                Locale.ROOT,
                 "%04d-%02d-%02dT%02d:%02d:%02dZ",
                 calendar.get(Calendar.YEAR),
                 calendar.get(Calendar.MONTH) + 1,

Modified: 
tika/trunk/tika-core/src/test/java/org/apache/tika/TypeDetectionBenchmark.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/TypeDetectionBenchmark.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
--- 
tika/trunk/tika-core/src/test/java/org/apache/tika/TypeDetectionBenchmark.java 
(original)
+++ 
tika/trunk/tika-core/src/test/java/org/apache/tika/TypeDetectionBenchmark.java 
Wed Aug 13 16:52:39 2014
@@ -57,7 +57,7 @@ public class TypeDetectionBenchmark {
                     tika.detect(new ByteArrayInputStream(content));
                 }
                 System.out.printf(
-                        Locale.getDefault(),
+                        Locale.ROOT,
                         "%6dns per Tika.detect(%s) = %s%n",
                         System.currentTimeMillis() - start, file, type);
             } finally {

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
 Wed Aug 13 16:52:39 2014
@@ -84,7 +84,7 @@ public class BoilerpipeContentHandler ex
 
         @Override
         public String toString() {
-            return String.format(Locale.getDefault(), "<%s> of type %s", 
localName, elementType);
+            return String.format(Locale.ROOT, "<%s> of type %s", localName, 
elementType);
         }
 
         public String getUri() {

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
 Wed Aug 13 16:52:39 2014
@@ -245,7 +245,7 @@ public class ImageMetadataExtractor {
     }
     
     static class ExifHandler implements DirectoryHandler {
-        private static final SimpleDateFormat DATE_UNSPECIFIED_TZ = new 
SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.getDefault());
+        private static final SimpleDateFormat DATE_UNSPECIFIED_TZ = new 
SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ROOT);
         public boolean supports(Class<? extends Directory> directoryType) {
             return directoryType == ExifIFD0Directory.class || 
                     directoryType == ExifSubIFDDirectory.class;

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iptc/IptcAnpaParser.java
 Wed Aug 13 16:52:39 2014
@@ -161,7 +161,7 @@ public class IptcAnpaParser implements P
          }
          int msgsize = is.read(buf);                // read in at least the 
full data
 
-         String message = (new String(buf, 
"UTF-8")).toLowerCase(Locale.getDefault());
+         String message = (new String(buf, "UTF-8")).toLowerCase(Locale.ROOT);
          // these are not if-then-else, because we want to go from most common
          // and fall through to least.  this is imperfect, as these tags could
          // show up in other agency stories, but i can't find a spec or any
@@ -591,7 +591,7 @@ public class IptcAnpaParser implements P
                      --read;
                   }
                }
-               if (tmp_line.toLowerCase(Locale.getDefault()).startsWith("by") 
|| longline.equals("bdy_author")) {
+               if (tmp_line.toLowerCase(Locale.ROOT).startsWith("by") || 
longline.equals("bdy_author")) {
                   longkey = "bdy_author";
 
                   // prepend a space to subsequent line, so it gets parsed 
consistent with the lead line
@@ -609,7 +609,7 @@ public class IptcAnpaParser implements P
                }
                else if (FORMAT == this.FMT_IPTC_BLM) {
                   String byline = "   by ";
-                  if 
(tmp_line.toLowerCase(Locale.getDefault()).contains(byline)) {
+                  if (tmp_line.toLowerCase(Locale.ROOT).contains(byline)) {
                      longkey = "bdy_author";
 
                      int term = tmp_line.length();
@@ -618,11 +618,11 @@ public class IptcAnpaParser implements P
                      term = Math.min(term, (tmp_line.contains("\n") ? 
tmp_line.indexOf("\n") : term));
                      term = (term > 0 ) ? term : tmp_line.length();
                      // for bloomberg, the author line sits below their 
copyright statement
-                     bdy_author += 
tmp_line.substring(tmp_line.toLowerCase(Locale.getDefault()).indexOf(byline) + 
byline.length(), term) + " ";
+                     bdy_author += 
tmp_line.substring(tmp_line.toLowerCase(Locale.ROOT).indexOf(byline) + 
byline.length(), term) + " ";
                      metastarted = true;
                      longline = ((tmp_line.contains("=")) && 
(!longline.equals(longkey)) ? longkey : "");
                   }
-                  else 
if(tmp_line.toLowerCase(Locale.getDefault()).startsWith("c.")) {
+                  else if(tmp_line.toLowerCase(Locale.ROOT).startsWith("c.")) {
                      // the author line for bloomberg is a multiline starting 
with c.2011 Bloomberg News
                      // then containing the author info on the next line
                      if (val_next == TB) {
@@ -630,7 +630,7 @@ public class IptcAnpaParser implements P
                         continue;
                      }
                   }
-                  else 
if(tmp_line.toLowerCase(Locale.getDefault()).trim().startsWith("(") && 
tmp_line.toLowerCase(Locale.getDefault()).trim().endsWith(")")) {
+                  else 
if(tmp_line.toLowerCase(Locale.ROOT).trim().startsWith("(") && 
tmp_line.toLowerCase(Locale.ROOT).trim().endsWith(")")) {
                      // the author line may have one or more comment lines 
between the copyright
                      // statement, and the By AUTHORNAME line
                      if (val_next == TB) {
@@ -640,7 +640,7 @@ public class IptcAnpaParser implements P
                   }
                }
 
-               else if 
(tmp_line.toLowerCase(Locale.getDefault()).startsWith("eds") || 
longline.equals("bdy_source")) {
+               else if (tmp_line.toLowerCase(Locale.ROOT).startsWith("eds") || 
longline.equals("bdy_source")) {
                   longkey = "bdy_source";
                   // prepend a space to subsequent line, so it gets parsed 
consistent with the lead line
                   tmp_line = (longline.equals(longkey) ? " " : "") + tmp_line;
@@ -737,14 +737,14 @@ public class IptcAnpaParser implements P
                   // standard reuters format
                   format_in = "HH:mm MM-dd-yy";
                }
-               SimpleDateFormat dfi =   new SimpleDateFormat(format_in, 
Locale.getDefault());
+               SimpleDateFormat dfi = new SimpleDateFormat(format_in, 
Locale.ROOT);
                dfi.setTimeZone(TimeZone.getTimeZone("UTC"));
                dateunix = dfi.parse(ftr_datetime);
             }
             catch (ParseException ep) {
                // failed, but this will just fall through to setting the date 
to now
             }
-            SimpleDateFormat dfo =   new SimpleDateFormat(format_out, 
Locale.getDefault());
+            SimpleDateFormat dfo =   new SimpleDateFormat(format_out, 
Locale.ROOT);
             dfo.setTimeZone(TimeZone.getTimeZone("UTC"));
             ftr_datetime = dfo.format(dateunix);
          }

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
 Wed Aug 13 16:52:39 2014
@@ -167,7 +167,7 @@ public class MboxParser extends Abstract
       return; // ignore malformed header lines
     }
 
-    String headerTag = headerMatcher.group(1).toLowerCase(Locale.getDefault());
+    String headerTag = headerMatcher.group(1).toLowerCase(Locale.ROOT);
     String headerContent = headerMatcher.group(2);
 
     if (headerTag.equalsIgnoreCase("From")) {

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
 Wed Aug 13 16:52:39 2014
@@ -127,7 +127,7 @@ public class OutlookExtractor extends Ab
                  String[] headers = msg.getHeaders();
                  if(headers != null && headers.length > 0) {
                      for(String header: headers) {
-                        
if(header.toLowerCase(Locale.getDefault()).startsWith("date:")) {
+                        
if(header.toLowerCase(Locale.ROOT).startsWith("date:")) {
                             String date = 
header.substring(header.indexOf(':')+1).trim();
                             
                             // See if we can parse it as a normal mail date

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
 Wed Aug 13 16:52:39 2014
@@ -549,7 +549,7 @@ public class WordExtractor extends Abstr
            tag = "h" + Math.min(num, 6);
        } else {
            styleClass = styleName.replace(' ', '_');
-           styleClass = 
styleClass.substring(0,1).toLowerCase(Locale.getDefault()) +
+           styleClass = styleClass.substring(0,1).toLowerCase(Locale.ROOT) +
                styleClass.substring(1);
        }
 

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java
 Wed Aug 13 16:52:39 2014
@@ -88,7 +88,7 @@ public class NSNormalizerContentHandler 
     @Override
     public InputSource resolveEntity(String publicId, String systemId)
             throws IOException, SAXException {
-        if ((systemId != null && 
systemId.toLowerCase(Locale.getDefault()).endsWith(".dtd"))
+        if ((systemId != null && 
systemId.toLowerCase(Locale.ROOT).endsWith(".dtd"))
                 || DTD_PUBLIC_ID.equals(publicId)) {
             return new InputSource(new StringReader(""));
         } else {

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
 Wed Aug 13 16:52:39 2014
@@ -246,11 +246,11 @@ public class ZipContainerDetector implem
         String docType = coreType.substring(0, coreType.lastIndexOf('.'));
 
         // The Macro Enabled formats are a little special
-        if(docType.toLowerCase(Locale.getDefault()).endsWith("macroenabled")) {
-            docType = docType.toLowerCase(Locale.getDefault()) + ".12";
+        if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabled")) {
+            docType = docType.toLowerCase(Locale.ROOT) + ".12";
         }
 
-        
if(docType.toLowerCase(Locale.getDefault()).endsWith("macroenabledtemplate")) {
+        if(docType.toLowerCase(Locale.ROOT).endsWith("macroenabledtemplate")) {
             docType = 
MACRO_TEMPLATE_PATTERN.matcher(docType).replaceAll("macroenabled.12");
         }
 

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
 Wed Aug 13 16:52:39 2014
@@ -103,9 +103,9 @@ class RTFObjDataParser {
         //readBytes tests for reading too many bytes
         byte[] embObjBytes = readBytes(is, dataSz);
 
-        if (className.toLowerCase(Locale.getDefault()).equals("package")){
+        if (className.toLowerCase(Locale.ROOT).equals("package")){
             return handlePackage(embObjBytes, metadata);
-        } else if 
(className.toLowerCase(Locale.getDefault()).equals("pbrush")) {
+        } else if (className.toLowerCase(Locale.ROOT).equals("pbrush")) {
             //simple bitmap bytes
             return embObjBytes;
         } else {

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java?rev=1617758&r1=1617757&r2=1617758&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
 Wed Aug 13 16:52:39 2014
@@ -1341,7 +1341,7 @@ final class TextExtractor {
         if (inHeader) {
             if (nextMetaData != null) {
                 if (nextMetaData == TikaCoreProperties.CREATED) {
-                    Calendar cal = Calendar.getInstance(TimeZone.getDefault(), 
Locale.getDefault());
+                    Calendar cal = Calendar.getInstance(TimeZone.getDefault(), 
Locale.ROOT);
                     cal.set(year, month-1, day, hour, minute, 0);
                     metadata.set(nextMetaData, cal.getTime());
                 } else if (nextMetaData.isMultiValuePermitted()) {


Reply via email to