Author: mattmann
Date: Wed Feb 25 06:47:13 2015
New Revision: 1662171

URL: http://svn.apache.org/r1662171
Log:
Updated tests for TIKA-1541 simple strings parser from Giuseppe Totaro.

Modified:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsConfig.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsEncoding.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsParser.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsConfigTest.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsConfig.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsConfig.java?rev=1662171&r1=1662170&r2=1662171&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsConfig.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsConfig.java
 Wed Feb 25 06:47:13 2015
@@ -15,6 +15,9 @@ package org.apache.tika.parser.strings;
 
 import java.io.File;
 import java.io.Serializable;
+import java.util.Properties;
+import java.io.InputStream;
+import java.io.IOException;
 
 /**
  * Configuration for the "strings" (or strings-alternative) command.
@@ -27,10 +30,10 @@ public class StringsConfig implements Se
        private static final long serialVersionUID = -1465227101645003594L;
 
        private String stringsPath = "";
-       
+
        // Minimum sequence length (characters) to print
        private int minLength = 4;
-       
+
        // Character encoding of the strings that are to be found
        private StringsEncoding encoding = StringsEncoding.SINGLE_7_BIT;
 
@@ -38,10 +41,57 @@ public class StringsConfig implements Se
        private int timeout = 120;
 
        /**
-        * Default constructor.
+        * Default constructor.
         */
        public StringsConfig() {
-               // TODO Loads properties from InputStream.
+               init(this.getClass().getResourceAsStream("Strings.properties"));
+       }
+
+       /**
+        * Loads properties from InputStream and then tries to close 
InputStream. If
+        * there is an IOException, this silently swallows the exception and 
goes
+        * back to the default.
+        *
+        * @param is
+        */
+       public StringsConfig(InputStream is) {
+               init(is);
+       }
+
+       /**
+        * Initializes attributes.
+        *
+        * @param is
+        */
+       private void init(InputStream is) {
+               if (is == null) {
+                       return;
+               }
+               Properties props = new Properties();
+               try {
+                       props.load(is);
+               } catch (IOException e) {
+                       // swallow
+               } finally {
+                       if (is != null) {
+                               try {
+                                       is.close();
+                               } catch (IOException e) {
+                                       // swallow
+                               }
+                       }
+               }
+
+               setStringsPath(props.getProperty("stringsPath", "" + 
getStringsPath()));
+               
+               setMinLength(Integer.parseInt(props.getProperty("minLength", ""
+                               + getMinLength())));
+
+               
setEncoding(StringsEncoding.valueOf(props.getProperty("encoding", ""
+                               + getEncoding().get())));
+
+               setTimeout(Integer.parseInt(props.getProperty("timeout", ""
+                               + getTimeout())));
        }
 
        /**
@@ -52,7 +102,7 @@ public class StringsConfig implements Se
        public String getStringsPath() {
                return this.stringsPath;
        }
-       
+
        /**
         * Returns the minimum sequence length (characters) to print.
         * 
@@ -61,11 +111,12 @@ public class StringsConfig implements Se
        public int getMinLength() {
                return this.minLength;
        }
-       
+
        /**
         * Returns the character encoding of the strings that are to be found.
         * 
-        * @return {@see StringsEncoding} enum that represents the character 
encoding of the strings that are to be found.
+        * @return {@link StringsEncoding} enum that represents the character
+        *         encoding of the strings that are to be found.
         */
        public StringsEncoding getEncoding() {
                return this.encoding;
@@ -85,40 +136,52 @@ public class StringsConfig implements Se
        /**
         * Sets the "strings" installation folder.
         * 
-        * @param path the "strings" installation folder.
+        * @param path
+        *            the "strings" installation folder.
         */
        public void setStringsPath(String path) {
-               char lastChar = path.charAt(path.length() - 1);
-
-               if (lastChar != File.separatorChar) {
+               if (!path.isEmpty() && !path.endsWith(File.separator)) {
                        path += File.separatorChar;
                }
                this.stringsPath = path;
        }
-       
+
        /**
         * Sets the minimum sequence length (characters) to print.
         * 
-        * @param minLength the minimum sequence length (characters) to print.
+        * @param minLength
+        *            the minimum sequence length (characters) to print.
         */
        public void setMinLength(int minLength) {
+               if (minLength < 1) {
+                       throw new IllegalArgumentException("Invalid minimum 
length");
+               }
                this.minLength = minLength;
        }
-       
+
        /**
         * Sets the character encoding of the strings that are to be found.
         * 
-        * @param encoding {@see StringsEncoding} enum that represents the 
character encoding of the strings that are to be found.
+        * @param encoding
+        *            {@link StringsEncoding} enum that represents the character
+        *            encoding of the strings that are to be found.
         */
-       public void setEncodings(StringsEncoding encoding) {
+       public void setEncoding(StringsEncoding encoding) {
                this.encoding = encoding;
        }
 
        /**
-        * Sets the maximum time (in seconds) to wait for the "strings" command 
to terminate.
-        * @param timeout the maximum time (in seconds) to wait for the 
"strings" command to terminate.
+        * Sets the maximum time (in seconds) to wait for the "strings" command 
to
+        * terminate.
+        * 
+        * @param timeout
+        *            the maximum time (in seconds) to wait for the "strings"
+        *            command to terminate.
         */
        public void setTimeout(int timeout) {
+               if (timeout < 1) {
+                       throw new IllegalArgumentException("Invalid timeout");
+               }
                this.timeout = timeout;
        }
 }

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsEncoding.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsEncoding.java?rev=1662171&r1=1662170&r2=1662171&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsEncoding.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsEncoding.java
 Wed Feb 25 06:47:13 2015
@@ -18,7 +18,7 @@ package org.apache.tika.parser.strings;
  *
  */
 public enum StringsEncoding {
-       SINGLE_7_BIT('s', "single-7-bit-byte"),
+       SINGLE_7_BIT('s', "single-7-bit-byte"), // default
        SINGLE_8_BIT('S', "single-8-bit-byte"),
        BIGENDIAN_16_BIT('b', "16-bit bigendian"),
        LITTLEENDIAN_16_BIT('l', "16-bit littleendian"),

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsParser.java?rev=1662171&r1=1662170&r2=1662171&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/strings/StringsParser.java
 Wed Feb 25 06:47:13 2015
@@ -63,7 +63,11 @@ public class StringsParser extends Abstr
        
        private static final FileConfig DEFAULT_FILE_CONFIG = new FileConfig();
        
-       // String -> Boolean[2] (0 -> is_present. 1 -> supports_encoding)
+       /*
+        * This map is organized as follows:
+        * command's pathname (String) -> is it present? (Boolean), does it 
support -e option? (Boolean)
+        * It stores check results for command and, if present, -e (encoding) 
option.
+        */
        private static Map<String,Boolean[]> STRINGS_PRESENT = new 
HashMap<String, Boolean[]>();
 
        @Override
@@ -121,24 +125,32 @@ public class StringsParser extends Abstr
                }
 
                String[] checkCmd = { stringsProg, "--version" };
+               try {
+                       boolean hasStrings = ExternalParser.check(checkCmd);
 
-               boolean hasStrings = ExternalParser.check(checkCmd);
-               
-               boolean encodingOpt = false;
-               
-               // Check if the -e option (encoding) is supported
-               if (!System.getProperty("os.name").startsWith("Windows")) {
-                       String[] checkOpt = {stringsProg, "-e", "" + 
config.getEncoding().get(), "/dev/null"};
-                       int[] errorValues = {1, 2}; // 1: General error. 2: 
Incorrect usage.
-                       encodingOpt = ExternalParser.check(checkOpt, 
errorValues);
-               }
+                       boolean encodingOpt = false;
+
+                       // Check if the -e option (encoding) is supported
+                       if 
(!System.getProperty("os.name").startsWith("Windows")) {
+                               String[] checkOpt = {stringsProg, "-e", "" + 
config.getEncoding().get(), "/dev/null"};
+                               int[] errorValues = {1, 2}; // Exit status 
code: 1 = general error; 2 = incorrect usage.
+                               encodingOpt = ExternalParser.check(checkOpt, 
errorValues);
+                       }
                
-               Boolean[] values = {hasStrings, encodingOpt};
-               STRINGS_PRESENT.put(stringsProg, values);
+                       Boolean[] values = {hasStrings, encodingOpt};
+                       STRINGS_PRESENT.put(stringsProg, values);
 
-               return hasStrings;
+                       return hasStrings;
+               } catch (NoClassDefFoundError ncdfe) {
+                       // This happens under OSGi + Fork Parser - see TIKA-1507
+                       // As a workaround for now, just say we can't use 
strings
+                       // TODO Resolve it so we don't need this try/catch block
+                       Boolean[] values = {false, false};
+                       STRINGS_PRESENT.put(stringsProg, values);
+                       return false;
+               }
        }
-       
+
        /**
         * Checks if the "file" command is supported.
         * 
@@ -183,7 +195,7 @@ public class StringsParser extends Abstr
                cmdList.add(stringsProg);
                cmdList.add("-n");
                cmdList.add("" + config.getMinLength());;
-               // encoding option is not supported by windows version
+               // Currently, encoding option is not supported by Windows (and 
other) versions
                if (STRINGS_PRESENT.get(stringsProg)[1]) {
                        cmdList.add("-e");
                        cmdList.add("" + config.getEncoding().get());
@@ -191,7 +203,7 @@ public class StringsParser extends Abstr
                cmdList.add(input.getPath());
                
                String[] cmd = cmdList.toArray(new String[cmdList.size()]);
-
+               
                ProcessBuilder pb = new ProcessBuilder(cmd);
                final Process process = pb.start();
 
@@ -312,10 +324,8 @@ public class StringsParser extends Abstr
                        fileOutput = reader.readLine();
 
                } catch (IOException ioe) {
-                       // TODO
-                       System.err
-                                       .println("An error occurred in reading 
output of the file command: "
-                                                       + ioe.getMessage());
+                       // file output not available!
+                       fileOutput = "";
                } finally {
                        reader.close();
                }

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsConfigTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsConfigTest.java?rev=1662171&r1=1662170&r2=1662171&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsConfigTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsConfigTest.java
 Wed Feb 25 06:47:13 2015
@@ -15,6 +15,9 @@ package org.apache.tika.parser.strings;
 
 import static org.junit.Assert.*;
 
+import java.io.File;
+import java.io.InputStream;
+
 import org.junit.Test;
 
 public class StringsConfigTest {
@@ -27,4 +30,32 @@ public class StringsConfigTest {
                assertEquals("Invalid default min-len value", 4, 
config.getMinLength());
                assertEquals("Invalid default timeout value", 120, 
config.getTimeout());
        }
+       
+       @Test
+       public void testPartialConfig() {
+               InputStream stream = 
StringsConfigTest.class.getResourceAsStream("/test-properties/StringsConfig-partial.properties");
+               
+               StringsConfig config = new StringsConfig(stream);
+               assertEquals("Invalid default stringsPath value", "", 
config.getStringsPath());
+               assertEquals("Invalid overridden encoding value", 
StringsEncoding.BIGENDIAN_16_BIT, config.getEncoding());
+               assertEquals("Invalid default min-len value", 4, 
config.getMinLength());
+               assertEquals("Invalid overridden timeout value", 60, 
config.getTimeout());
+       }
+       
+       @Test
+       public void testFullConfig() {
+               InputStream stream = 
StringsConfigTest.class.getResourceAsStream("/test-properties/StringsConfig-full.properties");
+               
+               StringsConfig config = new StringsConfig(stream);
+               assertEquals("Invalid overridden stringsPath value", 
"/opt/strings" + File.separator, config.getStringsPath());
+               assertEquals("Invalid overridden encoding value", 
StringsEncoding.BIGENDIAN_16_BIT, config.getEncoding());
+               assertEquals("Invalid overridden min-len value", 3, 
config.getMinLength());
+               assertEquals("Invalid overridden timeout value", 60, 
config.getTimeout());
+       }
+       
+       @Test(expected=IllegalArgumentException.class)
+       public void testValidateEconding() {
+               StringsConfig config = new StringsConfig();
+               config.setMinLength(0);
+       }
 }

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java?rev=1662171&r1=1662170&r2=1662171&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java
 Wed Feb 25 06:47:13 2015
@@ -18,6 +18,7 @@ import static org.junit.Assert.*;
 import static org.junit.Assume.assumeTrue;
 
 import java.io.InputStream;
+import java.util.Arrays;
 
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
@@ -42,12 +43,15 @@ public class StringsParserTest {
                String resource = "/test-documents/testOCTET_header.dbase3";
 
                String[] content = { "CLASSNO", "TITLE", "ITEMNO", "LISTNO", 
"LISTDATE" };
+               
+               String[] met_attributes = {"min-len", "encoding", 
"strings:file_output"};
 
                StringsConfig stringsConfig = new StringsConfig();
                FileConfig fileConfig = new FileConfig();
 
                Parser parser = new StringsParser();
                ContentHandler handler = new BodyContentHandler();
+               Metadata metadata = new Metadata();
 
                ParseContext context = new ParseContext();
                context.set(StringsConfig.class, stringsConfig);
@@ -56,15 +60,19 @@ public class StringsParserTest {
                InputStream stream = 
StringsParserTest.class.getResourceAsStream(resource);
 
                try {
-                       parser.parse(stream, handler, new Metadata(), context);
+                       parser.parse(stream, handler, metadata, context);
                } catch (Exception e) {
                        e.printStackTrace();
                } finally {
                        stream.close();
                }
-               
+
+               // Content
                for (String word : content) {
                        assertTrue(handler.toString().contains(word));
                }
+               
+               // Metadata
+               Arrays.equals(met_attributes, metadata.names());
        }
 }


Reply via email to