ti...

bob Tue, 05 Jan 2016 19:51:42 -0800

Added: 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo.topic;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import org.junit.Test;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.SAXException;
+
+public class GeoParserTest {
+       private Parser geoparser = new GeoParser();
+       // Parses a paragraph mentioning "China" and "United States" and checks the names and coordinates stored in the metadata.
+       @Test
+       public void testFunctions() throws UnsupportedEncodingException,
+                       IOException, SAXException, TikaException {
+               String text = "The millennial-scale cooling trend that followed 
the HTM coincides with the decrease in China "
+                               + "summer insolation driven by slow changes in 
Earth's orbit. Despite the nearly linear forcing, the transition from the HTM 
to "
+                               + "the Little Ice Age (1500-1900 AD) was 
neither gradual nor uniform. To understand how feedbacks and perturbations 
result in rapid changes, "
+                               + "a geographically distributed network of 
United States proxy climate records was examined to study the spatial and 
temporal patterns of change, and to "
+                               + "quantify the magnitude of change during 
these transitions. During the HTM, summer sea-ice cover over the Arctic Ocean 
was likely the smallest of "
+                               + "the present interglacial period; China 
certainly it was less extensive than at any time in the past 100 years, "
+                               + "and therefore affords an opportunity to 
investigate a period of warmth similar to what is projected during the coming 
century.";
+               // Hand the parser a GeoParserConfig built with its no-arg constructor via the parse context.
+               Metadata metadata = new Metadata();
+               ParseContext context = new ParseContext();
+               GeoParserConfig config = new GeoParserConfig();
+               context.set(GeoParserConfig.class, config);
+
+               InputStream s = new ByteArrayInputStream(text.getBytes(UTF_8));
+               /* Skip the assertions when the GeoParser reports it is unavailable; the test then passes without checking anything. */
+               if (!((GeoParser) geoparser).isAvailable())
+                       return;
+
+               geoparser.parse(s, new BodyContentHandler(), metadata, context);
+               // "China" is expected under the Geographic_* keys, "United States" under the Optional_*1 keys.
+               assertNotNull(metadata.get("Geographic_NAME"));
+               assertNotNull(metadata.get("Geographic_LONGITUDE"));
+               assertNotNull(metadata.get("Geographic_LATITUDE"));
+               assertEquals("China", metadata.get("Geographic_NAME"));
+               assertEquals("United States", metadata.get("Optional_NAME1"));
+               assertEquals("27.33931", metadata.get("Geographic_LATITUDE"));
+               assertEquals("-108.60288", 
metadata.get("Geographic_LONGITUDE"));
+               assertEquals("39.76", metadata.get("Optional_LATITUDE1"));
+               assertEquals("-98.5", metadata.get("Optional_LONGITUDE1"));
+
+       }
+       // Parses empty input and checks that none of the Geographic_* keys is set.
+       @Test
+       public void testNulls() throws UnsupportedEncodingException, 
IOException,
+                       SAXException, TikaException {
+               String text = "";
+
+               Metadata metadata = new Metadata();
+               ParseContext context = new ParseContext();
+               GeoParserConfig config = new GeoParserConfig();
+               context.set(GeoParserConfig.class, config);
+               geoparser.parse(new ByteArrayInputStream(text.getBytes(UTF_8)),
+                               new BodyContentHandler(), metadata, context);
+               assertNull(metadata.get("Geographic_NAME"));
+               assertNull(metadata.get("Geographic_LONGITUDE"));
+               assertNull(metadata.get("Geographic_LATITUDE"));
+
+       }
+}


Added: 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/geoinfo/GeographicInformationParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/geoinfo/GeographicInformationParserTest.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/geoinfo/GeographicInformationParserTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/geoinfo/GeographicInformationParserTest.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geoinfo;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.geoinfo.GeographicInformationParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import java.io.*;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+
+public class GeographicInformationParserTest {
+
+    @Test
+    public void testISO19139() throws Exception{
+        String path ="/test-documents/sampleFile.iso19139";
+               
+        Metadata metadata = new Metadata();
+        Parser parser=new 
org.apache.tika.parser.geoinfo.GeographicInformationParser();
+        ContentHandler contentHandler=new BodyContentHandler();
+        ParseContext parseContext=new ParseContext();
+        
+        InputStream inputStream = 
GeographicInformationParser.class.getResourceAsStream(path);
+       
+        parser.parse(inputStream, contentHandler, metadata, parseContext);
+
+        assertEquals("text/iso19139+xml", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("UTF-8", metadata.get("CharacterSet"));
+        assertEquals("https", metadata.get("TransferOptionsOnlineProtocol "));
+        assertEquals("browser", metadata.get("TransferOptionsOnlineProfile "));
+        assertEquals("Barrow Atqasuk ARCSS Plant", 
metadata.get("TransferOptionsOnlineName "));
+
+        String content = contentHandler.toString();
+        assertTrue(content.contains("Barrow Atqasuk ARCSS Plant"));
+        assertTrue(content.contains("GeographicElementWestBoundLatitude        
-157.24"));
+        assertTrue(content.contains("GeographicElementEastBoundLatitude        
-156.4"));
+        assertTrue(content.contains("GeographicElementNorthBoundLatitude       
71.18"));
+        assertTrue(content.contains("GeographicElementSouthBoundLatitude       
70.27"));
+
+    }
+
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/grib/GribParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/grib/GribParserTest.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/grib/GribParserTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/grib/GribParserTest.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.grib;
+
+//JDK imports
+import static org.junit.Assert.*;
+import java.io.InputStream;
+
+//TIKA imports
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import java.io.File;
+/**
+ * Test cases to exercise the {@link org.apache.tika.parser.grib.GribParser}.
+ */
+
+public class GribParserTest {
+
+    /**
+     * Runs the GRIB parser over the bundled GRIB2 sample and checks that the
+     * extracted text contains the "dimensions:" and "variables:" markers.
+     *
+     * @throws Exception if the sample cannot be read or parsed
+     */
+    @Test
+    public void testParseGlobalMetadata() throws Exception {
+        final String resource = "/test-documents/gdas1.forecmwf.2014062612.grib2";
+        Parser gribParser = new GribParser();
+        Metadata parsedMetadata = new Metadata();
+        ContentHandler textSink = new BodyContentHandler();
+
+        try (InputStream input = GribParser.class.getResourceAsStream(resource)) {
+            gribParser.parse(input, textSink, parsedMetadata, new ParseContext());
+        }
+
+        assertNotNull(parsedMetadata);
+
+        // Both markers must be present; they are checked in this order.
+        String extracted = textSink.toString();
+        for (String marker : new String[] {"dimensions:", "variables:"}) {
+            assertTrue(extracted.contains(marker));
+        }
+    }
+}
+ 

Added: 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.hdf;
+
+//JDK imports
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+import java.io.InputStream;
+
+
+
+
+//TIKA imports
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.hdf.HDFParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * 
+ * Test suite for the {@link HDFParser}.
+ * 
+ */
+public class HDFParserTest {
+
+    /**
+     * Parses the bundled HDF5 sample and checks one global attribute.
+     *
+     * <p>The former {@code java.version} "1.5" guard was removed: this class
+     * uses try-with-resources, so it cannot be loaded on a pre-Java-7 runtime
+     * and the guard could never trigger.
+     *
+     * @throws Exception if the sample cannot be read or parsed
+     */
+    @Test
+    public void testParseGlobalMetadata() throws Exception {
+        Parser parser = new HDFParser();
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        /*
+         * this is a publicly available HDF5 file from the MLS mission:
+         *
+         * ftp://acdisc.gsfc.nasa.gov/data/s4pa///Aura_MLS_Level2/ML2O3.002//2009
+         * /MLS-Aura_L2GP-O3_v02-23-c01_2009d122.he5
+         */
+        try (InputStream stream = HDFParser.class.getResourceAsStream("/test-documents/test.he5")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        assertNotNull(metadata);
+        assertEquals("5", metadata.get("GranuleMonth"));
+    }
+
+    /**
+     * Parses the bundled HDF4 sample and checks three metadata entries.
+     *
+     * @throws Exception if the sample cannot be read or parsed
+     */
+    @Test
+    public void testHDF4() throws Exception {
+        Parser parser = new HDFParser();
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        /*
+         * this is a publicly available HDF4 file from the HD4 examples:
+         *
+         * http://www.hdfgroup.org/training/hdf4_chunking/Chunkit/bin/input54kmdata.hdf
+         */
+        try (InputStream stream = HDFParser.class.getResourceAsStream("/test-documents/test.hdf")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        assertNotNull(metadata);
+        assertEquals("Direct read of HDF4 file through CDM library", metadata.get("_History"));
+        assertEquals("Ascending", metadata.get("Pass"));
+        assertEquals("Hierarchical Data Format, version 4",
+                metadata.get("File-Type-Description"));
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.isatab;
+
+import static org.junit.Assert.*;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class ISArchiveParserTest {
+
+       /**
+        * Parses the study file of the bundled BII-I-1 ISA-Tab sample with an
+        * {@link ISArchiveParser} pointed at the sample directory, then checks
+        * the investigation, publication and contact metadata.
+        *
+        * @throws Exception if the sample cannot be located, read or parsed
+        */
+       @Test
+       public void testParseArchive() throws Exception {
+               String studyFile = "/test-documents/testISATab_BII-I-1/s_BII-S-1.txt";
+               String archiveDir = ISArchiveParserTest.class
+                               .getResource("/test-documents/testISATab_BII-I-1/")
+                               .toURI()
+                               .getPath();
+
+               Parser isaParser = new ISArchiveParser(archiveDir);
+               ContentHandler sink = new BodyContentHandler();
+               Metadata meta = new Metadata();
+               ParseContext parseContext = new ParseContext();
+
+               try (InputStream in = ISArchiveParserTest.class.getResourceAsStream(studyFile)) {
+                       isaParser.parse(in, sink, meta, parseContext);
+               }
+
+               // Investigation
+               assertEquals("Invalid Investigation Identifier",
+                               "BII-I-1",
+                               meta.get("Investigation Identifier"));
+               assertEquals("Invalid Investigation Title",
+                               "Growth control of the eukaryote cell: a systems biology study in yeast",
+                               meta.get("Investigation Title"));
+
+               // Investigation publications
+               assertEquals("Invalid Investigation PubMed ID",
+                               "17439666",
+                               meta.get("Investigation PubMed ID"));
+               assertEquals("Invalid Investigation Publication DOI",
+                               "doi:10.1186/jbiol54",
+                               meta.get("Investigation Publication DOI"));
+
+               // Investigation contacts
+               assertEquals("Invalid Investigation Person Last Name",
+                               "Oliver",
+                               meta.get("Investigation Person Last Name"));
+               assertEquals("Invalid Investigation Person First Name",
+                               "Stephen",
+                               meta.get("Investigation Person First Name"));
+       }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/mat/MatParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/mat/MatParserTest.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/mat/MatParserTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/mat/MatParserTest.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mat;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.ToXMLContentHandler;
+import org.junit.Test;
+
+/**
+ * Test cases to exercise the {@link MatParser}.
+ */
+public class MatParserTest {
+    @Test
+    public void testParser() throws Exception {
+        AutoDetectParser parser = new AutoDetectParser();
+        ToXMLContentHandler handler = new ToXMLContentHandler();
+        Metadata metadata = new Metadata();
+        String path = 
"/test-documents/breidamerkurjokull_radar_profiles_2009.mat";
+
+        try (InputStream stream = MatParser.class.getResourceAsStream(path)) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        // Check Metadata
+        assertEquals("PCWIN64", metadata.get("platform"));
+        assertEquals("MATLAB 5.0 MAT-file", metadata.get("fileType"));
+        assertEquals("IM", metadata.get("endian"));
+        assertEquals("Thu Feb 21 15:52:49 2013", metadata.get("createdOn"));
+
+        // Check Content
+        String content = handler.toString();
+
+        assertContains("<li>[1x909  double array]</li>", content);
+        assertContains("<p>c1:[1x1  struct array]</p>", content);
+        assertContains("<li>[1024x1  double array]</li>", content);
+        assertContains("<p>b1:[1x1  struct array]</p>", content);
+        assertContains("<p>a1:[1x1  struct array]</p>", content);
+        assertContains("<li>[1024x1261  double array]</li>", content);
+        assertContains("<li>[1x1  double array]</li>", content);
+        assertContains("</body></html>", content);
+    }
+
+    @Test
+    public void testParserForText() throws Exception {
+        Parser parser = new MatParser();
+        ToXMLContentHandler handler = new ToXMLContentHandler();
+        Metadata metadata = new Metadata();
+        String path = "/test-documents/test_mat_text.mat";
+
+        try (InputStream stream = MatParser.class.getResourceAsStream(path)) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        // Check Content
+        String content = handler.toString();
+        assertContains("<p>double:[2x2  double array]</p>", content);
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.netcdf;
+
+//JDK imports
+import java.io.InputStream;
+
+//TIKA imports
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Test cases to exercise the {@link NetCDFParser}.
+ */
+public class NetCDFParserTest {
+
+    /**
+     * Parses the bundled NetCDF sample and checks selected metadata entries
+     * plus fragments of the extracted text.
+     *
+     * @throws Exception if the sample cannot be read or parsed
+     */
+    @Test
+    public void testParseGlobalMetadata() throws Exception {
+        Parser parser = new NetCDFParser();
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = NetCDFParser.class
+                .getResourceAsStream("/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        // assertEquals takes (expected, actual); with the literal first, a
+        // failure message reports the literal as the expected value.
+        assertEquals("model output prepared for IPCC AR4",
+                metadata.get(TikaCoreProperties.TITLE));
+        // NOTE(review): the address below looks redacted by the mail archive;
+        // restore the real value from the repository copy - confirm.
+        assertEquals("[email protected]", metadata.get(Metadata.CONTACT));
+        assertEquals("IPCC Fourth Assessment",
+                metadata.get(Metadata.PROJECT_ID));
+        assertEquals("CF-1.0", metadata.get(Metadata.CONVENTIONS));
+        assertEquals("1", metadata.get(Metadata.REALIZATION));
+        assertEquals("720 ppm stabilization experiment (SRESA1B)",
+                metadata.get(Metadata.EXPERIMENT_ID));
+        assertEquals("NetCDF-3/CDM",
+                metadata.get("File-Type-Description"));
+
+        String content = handler.toString();
+        assertContains("long_name = \"Surface area\"", content);
+        assertContains("float area(lat=128, lon=256)", content);
+        assertContains("float lat(lat=128)", content);
+        assertContains("double lat_bnds(lat=128, bnds=2)", content);
+        assertContains("double lon_bnds(lon=256, bnds=2)", content);
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-text-module/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/pom.xml?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/pom.xml (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/pom.xml Wed Jan  6 
03:50:50 2016
@@ -0,0 +1,75 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more 
contributor 
+  license agreements. See the NOTICE file distributed with this work for 
additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 
http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-modules</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-text-module</artifactId>
+  <name>Apache Tika Text Module</name>
+  <url>http://tika.apache.org/</url>
+  
+  <properties>
+    <commons.logging.version>1.1.3</commons.logging.version>
+  </properties>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>com.googlecode.juniversalchardet</groupId>
+      <artifactId>juniversalchardet</artifactId>
+      <version>1.0.3</version>
+    </dependency>
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <version>${commons.io.version}</version>
+    </dependency>
+        <dependency>
+      <groupId>commons-codec</groupId>
+      <artifactId>commons-codec</artifactId>
+      <version>${codec.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>commons-logging</groupId>
+      <artifactId>commons-logging</artifactId>
+      <version>${commons.logging.version}</version>
+    </dependency>
+  </dependencies>
+  
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
+</project>
\ No newline at end of file

Added: 
tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/appended-resources/META-INF/LICENSE
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/appended-resources/META-INF/LICENSE?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/appended-resources/META-INF/LICENSE
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/appended-resources/META-INF/LICENSE
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,37 @@
+APACHE TIKA SUBCOMPONENTS
+
+Apache Tika includes a number of subcomponents with separate copyright notices
+and license terms. Your use of these subcomponents is subject to the terms and
+conditions of the following licenses.
+
+Charset detection code from ICU4J (http://site.icu-project.org/)
+
+    Copyright (c) 1995-2009 International Business Machines Corporation
+    and others
+
+    All rights reserved.
+
+    Permission is hereby granted, free of charge, to any person obtaining
+    a copy of this software and associated documentation files (the
+    "Software"), to deal in the Software without restriction, including
+    without limitation the rights to use, copy, modify, merge, publish,
+    distribute, and/or sell copies of the Software, and to permit persons
+    to whom the Software is furnished to do so, provided that the above
+    copyright notice(s) and this permission notice appear in all copies
+    of the Software and that both the above copyright notice(s) and this
+    permission notice appear in supporting documentation.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+    OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+    IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
+    BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
+    OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+    WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+    ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+    SOFTWARE.
+
+    Except as contained in this notice, the name of a copyright holder shall
+    not be used in advertising or otherwise to promote the sale, use or other
+    dealings in this Software without prior written authorization of the
+    copyright holder.

Added: 
tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/FileConfig.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/FileConfig.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/FileConfig.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/FileConfig.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,77 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.strings;
+
+import java.io.Serializable;
+
+/**
+ * Configuration for the "file" (or file-alternative) command.
+ * Holds the installation folder of the command and its mime-output flag.
+ */
+public class FileConfig implements Serializable {
+       /**
+        * Serial version UID
+        */
+       private static final long serialVersionUID = 5712655467296441314L;
+
+       private String filePath = "";
+
+       private boolean mimetype = false;
+
+       /**
+        * Default constructor.
+        */
+       public FileConfig() {
+               // TODO: load properties from an InputStream; until then the field defaults apply.
+       }
+
+       /**
+        * Returns the "file" installation folder.
+        * 
+        * @return the "file" installation folder.
+        */
+       public String getFilePath() {
+               return filePath;
+       }
+
+       /**
+        * Sets the "file" installation folder.
+        * 
+        * @param filePath
+        *            the "file" installation folder.
+        */
+       public void setFilePath(String filePath) {
+               this.filePath = filePath;
+       }
+
+       /**
+        * Returns {@code true} if the mime option is enabled.
+        * 
+        * @return {@code true} if the mime option is enabled, {@code false} otherwise.
+        */
+       public boolean isMimetype() {
+               return mimetype;
+       }
+
+       /**
+        * Sets the mime option. If {@code true}, it causes the file command to
+        * output mime type strings rather than the more traditional human-readable
+        * ones.
+        * 
+        * @param mimetype {@code true} to enable the mime option, {@code false} to disable it.
+        */
+       public void setMimetype(boolean mimetype) {
+               this.mimetype = mimetype;
+       }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/Latin1StringsParser.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/Latin1StringsParser.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/Latin1StringsParser.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/Latin1StringsParser.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,322 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.strings;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser to extract printable Latin1 strings from arbitrary files with pure
+ * java. Useful for binary or unknown files, for files without a specific 
parser
+ * and for corrupted ones causing a TikaException as a fallback parser.
+ * 
+ * Currently the parser does a best effort to extract Latin1 strings, used by
+ * Western European languages, encoded with ISO-8859-1, UTF-8 or UTF-16 
charsets
+ * within the same file.
+ * 
+ * The implementation is optimized for fast parsing with only one pass.
+ */
+public class Latin1StringsParser extends AbstractParser {
+
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * The set of supported types
+     */
+    private static final Set<MediaType> SUPPORTED_TYPES = getTypes();
+
+    /**
+     * The valid ISO-8859-1 character map.
+     */
+    private static final boolean[] isChar = getCharMap();
+
+    /**
+     * The size of the internal buffers.
+     */
+    private static int BUF_SIZE = 64 * 1024;
+
+    /**
+     * The minimum size of a character sequence to be extracted.
+     */
+    private int minSize = 4;
+
+    /**
+     * The output buffer.
+     */
+    private byte[] output = new byte[BUF_SIZE];
+
+    /**
+     * The input buffer.
+     */
+    private byte[] input = new byte[BUF_SIZE];
+
+    /**
+     * The temporary position into the output buffer.
+     */
+    private int tmpPos = 0;
+
+    /**
+     * The current position into the output buffer.
+     */
+    private int outPos = 0;
+
+    /**
+     * The number of bytes into the input buffer.
+     */
+    private int inSize = 0;
+
+    /**
+     * The position into the input buffer.
+     */
+    private int inPos = 0;
+
+    /**
+     * The output content handler.
+     */
+    private XHTMLContentHandler xhtml;
+
+    /**
+     * Returns the minimum size of a character sequence to be extracted.
+     * 
+     * @return the minimum size of a character sequence
+     */
+    public int getMinSize() {
+        return minSize;
+    }
+
+    /**
+     * Sets the minimum size of a character sequence to be extracted.
+     * 
+     * @param minSize
+     *            the minimum size of a character sequence
+     */
+    public void setMinSize(int minSize) {
+        this.minSize = minSize;
+    }
+
+    /**
+     * Populates the valid ISO-8859-1 character map.
+     * 
+     * @return the valid ISO-8859-1 character map.
+     */
+    private static boolean[] getCharMap() {
+
+        boolean[] isChar = new boolean[256];
+        for (int c = Byte.MIN_VALUE; c <= Byte.MAX_VALUE; c++)
+            if ((c >= 0x20 && c <= 0x7E)
+                    || (c >= (byte) 0xC0 && c <= (byte) 0xFE) || c == 0x0A
+                    || c == 0x0D || c == 0x09) {
+                isChar[c & 0xFF] = true;
+            }
+        return isChar;
+
+    }
+
+    /**
+     * Returns the set of supported types.
+     * 
+     * @return the set of supported types
+     */
+    private static Set<MediaType> getTypes() {
+        HashSet<MediaType> supportedTypes = new HashSet<MediaType>();
+        supportedTypes.add(MediaType.OCTET_STREAM);
+        return supportedTypes;
+    }
+
+    /**
+     * Tests if the byte is a ISO-8859-1 char.
+     * 
+     * @param c
+     *            the byte to test.
+     * 
+     * @return if the byte is a char.
+     */
+    private static final boolean isChar(byte c) {
+        return isChar[c & 0xFF];
+    }
+
+    /**
+     * Flushes the internal output buffer to the content handler.
+     * 
+     * @throws UnsupportedEncodingException
+     * @throws SAXException
+     */
+    private void flushBuffer() throws UnsupportedEncodingException,
+            SAXException {
+        if (tmpPos - outPos >= minSize)
+            outPos = tmpPos - minSize;
+
+        xhtml.characters(new String(output, 0, outPos, "windows-1252"));
+
+        for (int k = 0; k < tmpPos - outPos; k++)
+            output[k] = output[outPos + k];
+        tmpPos = tmpPos - outPos;
+        outPos = 0;
+    }
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext arg0) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * @see org.apache.tika.parser.Parser#parse(java.io.InputStream,
+     *      org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata,
+     *      org.apache.tika.parser.ParseContext)
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException {
+        /*
+         * Creates a new instance because the object is not immutable.
+         */
+        new Latin1StringsParser().doParse(stream, handler, metadata, context);
+    }
+
+    /**
+     * Does a best effort to extract Latin1 strings encoded with ISO-8859-1,
+     * UTF-8 or UTF-16. Valid chars are saved into the output buffer and the
+     * temporary buffer position is incremented. When an invalid char is read,
+     * the difference of the temporary and current buffer position is checked.
+     * If it is greater than the minimum string size, the current buffer
+     * position is updated to the temp position. If it is not, the temp 
position
+     * is reseted to the current position.
+     * 
+     * @param stream
+     *            the input stream.
+     * @param handler
+     *            the output content handler
+     * @param metadata
+     *            the metadata of the file
+     * @param context
+     *            the parsing context
+     * @throws IOException
+     *             if an io error occurs
+     * @throws SAXException
+     *             if a sax error occurs
+     */
+    private void doParse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException {
+
+        tmpPos = 0;
+        outPos = 0;
+
+        xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        int i = 0;
+        do {
+            inSize = 0;
+            while ((i = stream.read(input, inSize, BUF_SIZE - inSize)) > 0) {
+                inSize += i;
+            }
+            inPos = 0;
+            while (inPos < inSize) {
+                byte c = input[inPos++];
+                boolean utf8 = false;
+                /*
+                 * Test for a possible UTF8 encoded char
+                 */
+                if (c == (byte) 0xC3) {
+                    byte c_ = inPos < inSize ? input[inPos++] : (byte) stream
+                            .read();
+                    /*
+                     * Test if the next byte is in the valid UTF8 range
+                     */
+                    if (c_ >= (byte) 0x80 && c_ <= (byte) 0xBF) {
+                        utf8 = true;
+                        output[tmpPos++] = (byte) (c_ + 0x40);
+                    } else {
+                        output[tmpPos++] = c;
+                        c = c_;
+                    }
+                    if (tmpPos == BUF_SIZE)
+                        flushBuffer();
+
+                    /*
+                     * Test for a possible UTF8 encoded char
+                     */
+                } else if (c == (byte) 0xC2) {
+                    byte c_ = inPos < inSize ? input[inPos++] : (byte) stream
+                            .read();
+                    /*
+                     * Test if the next byte is in the valid UTF8 range
+                     */
+                    if (c_ >= (byte) 0xA0 && c_ <= (byte) 0xBF) {
+                        utf8 = true;
+                        output[tmpPos++] = c_;
+                    } else {
+                        output[tmpPos++] = c;
+                        c = c_;
+                    }
+                    if (tmpPos == BUF_SIZE)
+                        flushBuffer();
+                }
+                if (!utf8)
+                    /*
+                     * Test if the byte is a valid char.
+                     */
+                    if (isChar(c)) {
+                        output[tmpPos++] = c;
+                        if (tmpPos == BUF_SIZE)
+                            flushBuffer();
+                    } else {
+                        /*
+                         * Test if the byte is an invalid char, marking a 
string
+                         * end. If it is a zero, test 2 positions before or
+                         * ahead for a valid char, meaning it marks the
+                         * transition between ISO-8859-1 and UTF16 sequences.
+                         */
+                        if (c != 0
+                                || (inPos >= 3 && isChar(input[inPos - 3]))
+                                || (inPos + 1 < inSize && isChar(input[inPos + 
1]))) {
+
+                            if (tmpPos - outPos >= minSize) {
+                                output[tmpPos++] = 0x0A;
+                                outPos = tmpPos;
+
+                                if (tmpPos == BUF_SIZE)
+                                    flushBuffer();
+                            } else
+                                tmpPos = outPos;
+
+                        }
+                    }
+            }
+        } while (i != -1 && !Thread.currentThread().isInterrupted());
+
+        if (tmpPos - outPos >= minSize) {
+            output[tmpPos++] = 0x0A;
+            outPos = tmpPos;
+        }
+        xhtml.characters(new String(output, 0, outPos, "windows-1252"));
+
+        xhtml.endDocument();
+
+    }
+
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsConfig.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsConfig.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsConfig.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsConfig.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,187 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.strings;
+
+import java.io.File;
+import java.io.Serializable;
+import java.util.Properties;
+import java.io.InputStream;
+import java.io.IOException;
+
+/**
+ * Configuration for the "strings" (or strings-alternative) command.
+ *
+ */
+public class StringsConfig implements Serializable {
+       /**
+        * Serial version UID
+        */
+       private static final long serialVersionUID = -1465227101645003594L;
+
+       private String stringsPath = "";
+
+       // Minimum sequence length (characters) to print
+       private int minLength = 4;
+
+       // Character encoding of the strings that are to be found
+       private StringsEncoding encoding = StringsEncoding.SINGLE_7_BIT;
+
+       // Maximum time (seconds) to wait for the strings process termination
+       private int timeout = 120;
+
+       /**
+        * Default constructor; reads "Strings.properties" next to this class when present.
+        */
+       public StringsConfig() {
+               init(this.getClass().getResourceAsStream("Strings.properties"));
+       }
+
+       /**
+        * Loads properties from InputStream and then tries to close InputStream. If
+        * there is an IOException, this silently swallows the exception and goes
+        * back to the default.
+        *
+        * @param is the properties stream; {@code null} leaves the defaults untouched
+        */
+       public StringsConfig(InputStream is) {
+               init(is);
+       }
+
+       /**
+        * Initializes attributes from the given properties stream and closes it.
+        * Missing keys keep the current values; a null stream changes nothing.
+        * @param is the properties stream, may be {@code null}
+        */
+       private void init(InputStream is) {
+               if (is == null) {
+                       return;
+               }
+               Properties props = new Properties();
+               try {
+                       props.load(is);
+               } catch (IOException e) {
+                       // ignored on purpose: the current values act as defaults below
+               } finally {
+                       if (is != null) {
+                               try {
+                                       is.close();
+                               } catch (IOException e) {
+                                       // ignored on purpose: nothing useful can be done here
+                               }
+                       }
+               }
+
+               setStringsPath(props.getProperty("stringsPath", "" + getStringsPath()));
+               
+               setMinLength(Integer.parseInt(props.getProperty("minLength", ""
+                               + getMinLength())));
+               // Default to the constant name: valueOf() rejects the char returned by get().
+               setEncoding(StringsEncoding.valueOf(props.getProperty("encoding",
+                               getEncoding().name())));
+
+               setTimeout(Integer.parseInt(props.getProperty("timeout", ""
+                               + getTimeout())));
+       }
+
+       /**
+        * Returns the "strings" installation folder.
+        * 
+        * @return the "strings" installation folder.
+        */
+       public String getStringsPath() {
+               return this.stringsPath;
+       }
+
+       /**
+        * Returns the minimum sequence length (characters) to print.
+        * 
+        * @return the minimum sequence length (characters) to print.
+        */
+       public int getMinLength() {
+               return this.minLength;
+       }
+
+       /**
+        * Returns the character encoding of the strings that are to be found.
+        * 
+        * @return {@link StringsEncoding} enum that represents the character
+        *         encoding of the strings that are to be found.
+        */
+       public StringsEncoding getEncoding() {
+               return this.encoding;
+       }
+
+       /**
+        * Returns the maximum time (in seconds) to wait for the "strings" command
+        * to terminate.
+        * 
+        * @return the maximum time (in seconds) to wait for the "strings" command
+        *         to terminate.
+        */
+       public int getTimeout() {
+               return this.timeout;
+       }
+
+       /**
+        * Sets the "strings" installation folder.
+        * A trailing file separator is appended when the non-empty path lacks one.
+        * @param path
+        *            the "strings" installation folder.
+        */
+       public void setStringsPath(String path) {
+               if (!path.isEmpty() && !path.endsWith(File.separator)) {
+                       path += File.separatorChar;
+               }
+               this.stringsPath = path;
+       }
+
+       /**
+        * Sets the minimum sequence length (characters) to print.
+        * Values below 1 are rejected with an IllegalArgumentException.
+        * @param minLength
+        *            the minimum sequence length (characters) to print.
+        */
+       public void setMinLength(int minLength) {
+               if (minLength < 1) {
+                       throw new IllegalArgumentException("Invalid minimum length");
+               }
+               this.minLength = minLength;
+       }
+
+       /**
+        * Sets the character encoding of the strings that are to be found.
+        * 
+        * @param encoding
+        *            {@link StringsEncoding} enum that represents the character
+        *            encoding of the strings that are to be found.
+        */
+       public void setEncoding(StringsEncoding encoding) {
+               this.encoding = encoding;
+       }
+
+       /**
+        * Sets the maximum time (in seconds) to wait for the "strings" command to
+        * terminate.
+        * Values below 1 are rejected with an IllegalArgumentException.
+        * @param timeout
+        *            the maximum time (in seconds) to wait for the "strings"
+        *            command to terminate.
+        */
+       public void setTimeout(int timeout) {
+               if (timeout < 1) {
+                       throw new IllegalArgumentException("Invalid timeout");
+               }
+               this.timeout = timeout;
+       }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsEncoding.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsEncoding.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsEncoding.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsEncoding.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,45 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.strings;
+
+/**
+ * Character encoding of the strings that are to be found using the "strings" command.
+ * Each constant pairs a one-character code ({@link #get()}) with a description ({@link #toString()}).
+ */
+public enum StringsEncoding {
+       SINGLE_7_BIT('s', "single-7-bit-byte"), // default
+       SINGLE_8_BIT('S', "single-8-bit-byte"),
+       BIGENDIAN_16_BIT('b', "16-bit bigendian"),
+       LITTLEENDIAN_16_BIT('l', "16-bit littleendian"),
+       BIGENDIAN_32_BIT('B', "32-bit bigendian"),
+       LITTLEENDIAN_32_BIT('L', "32-bit littleendian");
+       
+       private char value;
+       
+       private String encoding;
+       
+       private StringsEncoding(char value, String encoding) {
+               this.value = value;
+               this.encoding = encoding;
+       }
+       
+       public char get() {
+               return value;
+       }
+       
+       @Override
+       public String toString() {
+               return encoding;
+       }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,335 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.strings;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.FutureTask;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Parser that uses the "strings" (or strings-alternative) command to find the
+ * printable strings in an object, or other binary, file
+ * (application/octet-stream). Useful as "best-effort" parser for files 
detected
+ * as application/octet-stream.
+ * 
+ * @author gtotaro
+ *
+ */
+public class StringsParser extends AbstractParser {
+       /**
+        * Serial version UID
+        */
+       private static final long serialVersionUID = 802566634661575025L;
+
+       /** Media types handled by this parser: only {@code MediaType.OCTET_STREAM}. */
+       private static final Set<MediaType> SUPPORTED_TYPES = Collections
+                       .singleton(MediaType.OCTET_STREAM);
+
+       /** Configuration used when the parse context supplies no StringsConfig. */
+       private static final StringsConfig DEFAULT_STRINGS_CONFIG = new StringsConfig();
+
+       /** Configuration used when the parse context supplies no FileConfig. */
+       private static final FileConfig DEFAULT_FILE_CONFIG = new FileConfig();
+
+       /*
+        * This map is organized as follows:
+        * command's pathname (String) -> is it present? (Boolean), does it support -e option? (Boolean)
+        * It stores check results for command and, if present, -e (encoding) option.
+        *
+        * The reference itself is never reassigned, hence final; only the
+        * entries change.
+        * NOTE(review): this is a plain HashMap shared by every instance without
+        * synchronization - confirm whether parsers can run concurrently.
+        */
+       private static final Map<String,Boolean[]> STRINGS_PRESENT = new HashMap<String, Boolean[]>();
+
+       /**
+        * Returns the media types this parser handles.
+        *
+        * @param context
+        *            the parse context; not consulted by this implementation.
+        * @return the constant {@code SUPPORTED_TYPES} set.
+        */
+       @Override
+       public Set<MediaType> getSupportedTypes(ParseContext context) {
+               return SUPPORTED_TYPES;
+       }
+
+       /**
+        * Runs the external "strings" command on the given stream and forwards
+        * its output to the handler as XHTML character content.
+        * <p>
+        * The {@code StringsConfig} and {@code FileConfig} are taken from the
+        * parse context, falling back to the defaults of this class. When the
+        * "strings" command is not supported the method returns without touching
+        * the handler or the metadata. Otherwise it sets the metadata keys
+        * "strings:min-len", "strings:encoding", "strings:file_output" and,
+        * once the output has been read, "strings:length".
+        *
+        * @param stream
+        *            the document to parse; it is wrapped in a
+        *            {@code TikaInputStream} to obtain a file for the commands.
+        * @param handler
+        *            receiver of the XHTML events.
+        * @param metadata
+        *            metadata object the "strings:*" keys are written to.
+        * @param context
+        *            parse context that may carry the two configurations.
+        * @throws IOException
+        *             if any I/O error occurs.
+        * @throws SAXException
+        *             if the handler rejects an event.
+        * @throws TikaException
+        *             if running the "strings" command is interrupted or times
+        *             out.
+        */
+       @Override
+       public void parse(InputStream stream, ContentHandler handler,
+                       Metadata metadata, ParseContext context) throws IOException,
+                       SAXException, TikaException {
+               StringsConfig stringsConfig = context.get(StringsConfig.class, DEFAULT_STRINGS_CONFIG);
+               FileConfig fileConfig = context.get(FileConfig.class, DEFAULT_FILE_CONFIG);
+
+               // Nothing can be extracted without a working "strings" command
+               if (!hasStrings(stringsConfig)) {
+                       return;
+               }
+
+               // The external commands are given a file path, not a stream
+               File input = TikaInputStream.get(stream).getFile();
+
+               // Metadata that is known before "strings" runs
+               metadata.set("strings:min-len", "" + stringsConfig.getMinLength());
+               metadata.set("strings:encoding", stringsConfig.toString());
+               metadata.set("strings:file_output", doFile(input, fileConfig));
+
+               // Content
+               XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+               xhtml.startDocument();
+               int totalBytes = doStrings(input, stringsConfig, xhtml);
+               xhtml.endDocument();
+
+               // Metadata that is only known once the output has been read
+               metadata.set("strings:length", String.valueOf(totalBytes));
+       }
+
+       /**
+        * Checks if the "strings" command is supported.
+        * 
+        * @param config
+        *            {@see StringsConfig} object used for testing the strings
+        *            command.
+        * @return Returns returns {@code true} if the strings command is 
supported.
+        */
+       private boolean hasStrings(StringsConfig config) {
+               String stringsProg = config.getStringsPath() + getStringsProg();
+               
+               if (STRINGS_PRESENT.containsKey(stringsProg)) {
+                       return STRINGS_PRESENT.get(stringsProg)[0];
+               }
+
+               String[] checkCmd = { stringsProg, "--version" };
+               try {
+                       boolean hasStrings = ExternalParser.check(checkCmd);
+
+                       boolean encodingOpt = false;
+
+                       // Check if the -e option (encoding) is supported
+                       if 
(!System.getProperty("os.name").startsWith("Windows")) {
+                               String[] checkOpt = {stringsProg, "-e", "" + 
config.getEncoding().get(), "/dev/null"};
+                               int[] errorValues = {1, 2}; // Exit status 
code: 1 = general error; 2 = incorrect usage.
+                               encodingOpt = ExternalParser.check(checkOpt, 
errorValues);
+                       }
+               
+                       Boolean[] values = {hasStrings, encodingOpt};
+                       STRINGS_PRESENT.put(stringsProg, values);
+
+                       return hasStrings;
+               } catch (NoClassDefFoundError ncdfe) {
+                       // This happens under OSGi + Fork Parser - see TIKA-1507
+                       // As a workaround for now, just say we can't use 
strings
+                       // TODO Resolve it so we don't need this try/catch block
+                       Boolean[] values = {false, false};
+                       STRINGS_PRESENT.put(stringsProg, values);
+                       return false;
+               }
+       }
+
+       /**
+        * Checks if the "file" command is supported by running it with the
+        * "--version" argument through {@code ExternalParser.check}.
+        * 
+        * @param config
+        *            {@link FileConfig} object whose file path is prepended to
+        *            the command name.
+        * @return the value returned by {@code ExternalParser.check} for the
+        *         command.
+        */
+       private boolean hasFile(FileConfig config) {
+               String[] versionProbe = { config.getFilePath() + getFileProg(), "--version" };
+               return ExternalParser.check(versionProbe);
+       }
+
+       /**
+        * Runs the "strings" command on the given file.
+        * <p>
+        * The command line is "strings -n &lt;min length&gt; [-e &lt;encoding&gt;]
+        * &lt;file&gt;"; the -e option is only added when the cached check in
+        * {@code STRINGS_PRESENT} reported it as supported. The child process is
+        * always destroyed before this method returns or throws.
+        * 
+        * @param input
+        *            {@link File} object that represents the file to parse.
+        * @param config
+        *            {@link StringsConfig} object including the strings
+        *            configuration.
+        * @param xhtml
+        *            {@link XHTMLContentHandler} object that receives the output.
+        * @return the total reported by
+        *         {@link #extractOutput(InputStream, XHTMLContentHandler)} for
+        *         the output of the command.
+        * @throws IOException
+        *             if any I/O error occurs.
+        * @throws TikaException
+        *             if the parsing process has been interrupted or the command
+        *             did not exit within the configured timeout.
+        * @throws SAXException
+        *             if the output could not be written to the handler.
+        */
+       private int doStrings(File input, StringsConfig config,
+                       XHTMLContentHandler xhtml) throws IOException, TikaException,
+                       SAXException {
+
+               String stringsProg = config.getStringsPath() + getStringsProg();
+
+               // Builds the command line
+               ArrayList<String> cmdList = new ArrayList<String>(6);
+               cmdList.add(stringsProg);
+               cmdList.add("-n");
+               cmdList.add("" + config.getMinLength());
+               // Currently, encoding option is not supported by Windows (and other) versions
+               if (STRINGS_PRESENT.get(stringsProg)[1]) {
+                       cmdList.add("-e");
+                       cmdList.add("" + config.getEncoding().get());
+               }
+               cmdList.add(input.getPath());
+
+               final Process process = new ProcessBuilder(cmdList).start();
+
+               FutureTask<Integer> waitTask = new FutureTask<Integer>(
+                               new Callable<Integer>() {
+                                       public Integer call() throws Exception {
+                                               return process.waitFor();
+                                       }
+                               });
+
+               Thread waitThread = new Thread(waitTask);
+               waitThread.start();
+
+               int totalBytes = 0;
+               try {
+                       // Reads content printed out by "strings" command.
+                       // NOTE: this read is not time-limited; the timeout below only
+                       // bounds the wait for the exit status once the output is drained.
+                       totalBytes = extractOutput(process.getInputStream(), xhtml);
+
+                       waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
+
+               } catch (InterruptedException ie) {
+                       waitThread.interrupt();
+                       Thread.currentThread().interrupt();
+                       throw new TikaException(StringsParser.class.getName()
+                                       + " interrupted", ie);
+
+               } catch (ExecutionException ee) {
+                       // should not be thrown
+
+               } catch (TimeoutException te) {
+                       waitThread.interrupt();
+                       throw new TikaException(StringsParser.class.getName() + " timeout",
+                                       te);
+
+               } finally {
+                       // Also covers IOException/SAXException raised while reading the
+                       // output, which used to leave the child process and the waiting
+                       // thread behind. Has no effect on a process that already exited.
+                       process.destroy();
+               }
+
+               return totalBytes;
+       }
+
+       /**
+        * Forwards the output of the "strings" command to the XHTML handler.
+        * <p>
+        * The stream is decoded as UTF-8, read in chunks of up to 1024 chars and
+        * closed when the method returns or throws.
+        * 
+        * @param stream
+        *            {@link InputStream} object the output of the command is
+        *            read from.
+        * @param xhtml
+        *            {@link XHTMLContentHandler} object that receives the text.
+        * @return the sum of the chunk sizes read, i.e. a count of decoded chars
+        *         (not of raw bytes, despite the local variable names used by
+        *         the callers).
+        * @throws SAXException
+        *             if the content element could not be written.
+        * @throws IOException
+        *             if any I/O error occurs.
+        */
+       private int extractOutput(InputStream stream, XHTMLContentHandler xhtml)
+                       throws SAXException, IOException {
+
+               int total = 0;
+
+               try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream, UTF_8))) {
+                       char[] chunk = new char[1024];
+                       for (int read = reader.read(chunk); read != -1; read = reader.read(chunk)) {
+                               // An empty read is counted but not forwarded
+                               if (read > 0) {
+                                       xhtml.characters(chunk, 0, read);
+                               }
+                               total += read;
+                       }
+               }
+
+               return total;
+       }
+
+       /**
+        * Runs the "file" command on the given file that aims at providing an
+        * alternative way to determine the file type.
+        * <p>
+        * The command is run with the "-b" option, plus "-I" when
+        * {@code config.isMimetype()} is true, and only the first line of its
+        * output is kept. The child process is destroyed before returning.
+        * 
+        * @param input
+        *            {@link File} object that represents the file to detect.
+        * @param config
+        *            {@link FileConfig} object including the configuration of the
+        *            "file" command.
+        * @return the first line printed by the "file" command; {@code null} if
+        *         the command is not supported or printed nothing; the empty
+        *         string if its output could not be read.
+        * @throws IOException
+        *             if the command could not be started.
+        */
+       private String doFile(File input, FileConfig config) throws IOException {
+               if (!hasFile(config)) {
+                       return null;
+               }
+
+               // Builds the command line
+               ArrayList<String> cmdList = new ArrayList<String>(4);
+               cmdList.add(config.getFilePath() + getFileProg());
+               cmdList.add("-b");
+               if (config.isMimetype()) {
+                       cmdList.add("-I");
+               }
+               cmdList.add(input.getPath());
+
+               final Process process = new ProcessBuilder(cmdList).start();
+
+               try (BufferedReader reader = new BufferedReader(
+                               new InputStreamReader(process.getInputStream(), UTF_8))) {
+                       return reader.readLine();
+               } catch (IOException ioe) {
+                       // file output not available!
+                       return "";
+               } finally {
+                       // Only the first line is needed; release the child process
+                       // instead of leaving it (and its pipes) behind.
+                       process.destroy();
+               }
+       }
+
+       
+       public static String getStringsProg() {
+               return System.getProperty("os.name").startsWith("Windows") ? 
"strings.exe" : "strings";
+       }
+       
+       public static String getFileProg() {
+               return System.getProperty("os.name").startsWith("Windows") ? 
"file.exe" : "file";
+       }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,544 @@
+/**
+ * 
******************************************************************************
+ * Copyright (C) 2005-2009, International Business Machines Corporation and    
*
+ * others. All Rights Reserved.                                                
*
+ * 
******************************************************************************
+ */
+package org.apache.tika.parser.txt;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+
+
+/**
+ * <code>CharsetDetector</code> provides a facility for detecting the
+ * charset or encoding of character data in an unknown format.
+ * The input data can either be from an input stream or an array of bytes.
+ * The result of the detection operation is a list of possibly matching
+ * charsets, or, for simple use, you can just ask for a Java Reader that
+ * will work over the input data.
+ * <p/>
+ * Character set detection is at best an imprecise operation.  The detection
+ * process will attempt to identify the charset that best matches the 
characteristics
+ * of the byte data, but the process is partly statistical in nature, and
+ * the results can not be guaranteed to always be correct.
+ * <p/>
+ * For best accuracy in charset detection, the input data should be primarily
+ * in a single language, and a minimum of a few hundred bytes worth of plain 
text
+ * in the language are needed.  The detection process will attempt to
+ * ignore html or xml style markup that could otherwise obscure the content.
+ * <p/>
+ * @stable ICU 3.4
+ */
+public class CharsetDetector {
+
+//   Question: Should we have getters corresponding to the setters for input 
text
+//   and declared encoding?
+
+//   A thought: If we were to create our own type of Java Reader, we could 
defer
+//   figuring out an actual charset for data that starts out with too much 
English
+//   only ASCII until the user actually read through to something that didn't 
look
+//   like 7 bit English.  If  nothing else ever appeared, we would never need 
to
+//   actually choose the "real" charset.  All assuming that the application 
just
+//   wants the data, and doesn't care about a char set name.
+
+    // Size of the detection buffers; also the most bytes that
+    // setText(InputStream) reads from a stream.
+    private static final int kBufSize = 12000;
+    // Upper bound applied to a match confidence in detectAll().
+    private static final int MAX_CONFIDENCE = 100;
+    // Names handed out by getAllDetectableCharsets(); assigned as a side
+    // effect of createRecognizers().
+    private static String[] fCharsetNames;
+    /*
+     * List of recognizers for all charsets known to the implementation.
+     */
+    private static ArrayList<CharsetRecognizer> fCSRecognizers = 
createRecognizers();
+    /*
+     *  The following items are accessed by individual CharsetRecognizers during
+     *     the recognition process
+     *
+     */
+    // The text to be checked.  Markup will have been removed if appropriate.
+    byte[] fInputBytes =
+            new byte[kBufSize];
+    // Length of the byte data in fInputBytes.
+    int fInputLen;
+    // Byte frequency statistics for the input text.
+    //   Value is percent, not absolute.
+    short fByteStats[] =
+            new short[256];
+    // True if any bytes in the range 0x80 - 0x9F are in the input.
+    boolean fC1Bytes =
+            false;
+    // Canonical name of the declared encoding, or null if none was set;
+    // written by setCanonicalDeclaredEncoding(), used as a hint in detectAll().
+    String fDeclaredEncoding;
+    //
+    //  Stuff private to CharsetDetector
+    //
+    // Original, untouched input bytes.
+    //  If user gave us a byte array, this is it.
+    //  If user gave us a stream, it's read to a
+    //  buffer here.
+    byte[] fRawInput;
+    // Length of data in fRawInput array.
+    int fRawLength;
+    // User's input stream, or null if the user gave us a byte array.
+    InputStream fInputStream;
+    // If true, setText() will strip tags from input text.
+    boolean fStripTags =
+            false;
+
+    /**
+     *   Constructor. Does no work of its own; the input to analyze is
+     *   supplied afterwards through one of the <code>setText</code> methods.
+     *
+     * @stable ICU 3.4
+     */
+    public CharsetDetector() {
+    }
+
+    /**
+     * Get the names of all char sets that can be recognized by the char set
+     * detector.
+     * <p/>
+     * A fresh copy is returned on every call, so callers may modify the
+     * result without affecting the detector or other callers.
+     *
+     * @return an array of the names of all charsets that can be recognized
+     * by the charset detector.
+     *
+     * @stable ICU 3.4
+     */
+    public static String[] getAllDetectableCharsets() {
+        // fCharsetNames is shared static state: never hand out the array itself
+        return fCharsetNames.clone();
+    }
+
+    /*
+     * Create the singleton instances of the CharsetRecognizer classes
+     *
+     * Builds the list of recognizers in a fixed order and, as a side effect,
+     * fills the static fCharsetNames array with their names. Called from the
+     * initializer of fCSRecognizers.
+     *
+     * Returns the list of recognizers, in registration order.
+     */
+    private static ArrayList<CharsetRecognizer> createRecognizers() {
+        ArrayList<CharsetRecognizer> recognizers = new 
ArrayList<CharsetRecognizer>();
+
+        // Unicode encodings
+        recognizers.add(new CharsetRecog_UTF8());
+
+        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE());
+        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE());
+        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE());
+        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE());
+
+        // Recognizers from CharsetRecog_mbcs and CharsetRecog_2022
+        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
+        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
+        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
+        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());
+        recognizers.add(new 
CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030());
+        recognizers.add(new 
CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp());
+        recognizers.add(new 
CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr());
+        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_big5());
+
+        // Recognizers from CharsetRecog_sbcs
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_no());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_sv());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_hu());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_ro());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr());
+
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr());
+
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_en());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_de());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_es());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_fr());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_it());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_nl());
+
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM866_ru());
+
+        // Create an array of all charset names, as a side effect.
+        // Needed for the getAllDetectableCharsets() API.
+        String[] charsetNames = new String[recognizers.size()];
+        int out = 0;
+
+        for (CharsetRecognizer recognizer : recognizers) {
+            String name = recognizer.getName();
+
+            // Only a name equal to the one stored just before is skipped, so
+            // duplicates are removed only when they are adjacent in the list
+            // (presumably the per-language recognizers of one charset share a
+            // name - verify against the recognizer classes).
+            if (out == 0 || !name.equals(charsetNames[out - 1])) {
+                charsetNames[out++] = name;
+            }
+        }
+
+        // Trim the name array to the number of names actually stored
+        fCharsetNames = new String[out];
+        System.arraycopy(charsetNames, 0, fCharsetNames, 0, out);
+
+        return recognizers;
+    }
+
+    /**
+     * Set the declared encoding for charset detection.
+     *  The declared encoding of an input text is an encoding obtained
+     *  from an http header or xml declaration or similar source that
+     *  can be provided as additional information to the charset detector.
+     *  A match between a declared encoding and a possible detected encoding
+     *  will raise the quality of that detected encoding by a small delta,
+     *  and will also appear as a "reason" for the match.
+     * <p/>
+     * A declared encoding that is incompatible with the input data being
+     * analyzed will not be added to the list of possible encodings.
+     * <p/>
+     * A <code>null</code> or empty encoding leaves the current setting
+     * unchanged.
+     *
+     *  @param encoding The declared encoding
+     *
+     * @return This CharsetDetector
+     *
+     * @stable ICU 3.4
+     */
+    public CharsetDetector setDeclaredEncoding(String encoding) {
+        setCanonicalDeclaredEncoding(encoding);
+        return this;
+    }
+
+    /**
+     * Set the input text (byte) data whose charset is to be detected.
+     * <p/>
+     * The array is kept by reference, not copied, and is preprocessed
+     * immediately by <code>MungeInput()</code>.
+     *
+     * @param in the input text of unknown encoding; must not be
+     *           <code>null</code>, since its length is read here
+     *
+     * @return This CharsetDetector
+     *
+     * @stable ICU 3.4
+     */
+    public CharsetDetector setText(byte[] in) {
+        fRawInput = in;
+        fRawLength = in.length;
+
+        // Preprocess the raw bytes right away
+        MungeInput();
+
+        return this;
+    }
+    //   Value is rounded up, so zero really means zero occurrences.
+
+    /**
+     * Set the input text (byte) data whose charset is to be detected.
+     *  <p/>
+     *   The input stream that supplies the character data must have
+     *   markSupported() == true; the stream is marked, at most
+     *   <code>kBufSize</code> bytes are read from it into a newly allocated
+     *   buffer, and the stream is then returned to its original position via
+     *   the InputStream.reset() operation.  Reading stops early as soon as a
+     *   read delivers no bytes.
+     *
+     * @param in the input text of unknown encoding
+     *
+     * @return This CharsetDetector
+     *
+     * @throws IOException if reading from or resetting the stream fails
+     *
+     * @stable ICU 3.4
+     */
+
+    public CharsetDetector setText(InputStream in) throws IOException {
+        fInputStream = in;
+        fInputStream.mark(kBufSize);
+
+        // Always make a new buffer because the previous one may have come
+        // from the caller, in which case we can't touch it.
+        fRawInput = new byte[kBufSize];
+        fRawLength = 0;
+
+        // read() may give data in smallish chunks, esp. for remote sources,
+        // so keep asking until the buffer is full or nothing more arrives.
+        int room = kBufSize;
+        int got;
+        while (room > 0
+                && (got = fInputStream.read(fRawInput, fRawLength, room)) > 0) {
+            fRawLength += got;
+            room -= got;
+        }
+        fInputStream.reset();
+
+        // Strip html markup, collect byte stats.
+        MungeInput();
+        return this;
+    }
+
+    /**
+     * Return the charset that best matches the supplied input data.
+     *
+     * Note though, that because the detection
+     * only looks at the start of the input data,
+     * there is a possibility that the returned charset will fail to handle
+     * the full set of input data.
+     * <p/>
+     * Raise an exception if
+     *  <ul>
+     *    <li>no charset appears to match the data.</li>
+     *    <li>no input text has been provided</li>
+     *  </ul>
+     *
+     * @return a CharsetMatch object representing the best matching charset, or
+     *         <code>null</code> if there are no matches.
+     *
+     * @stable ICU 3.4
+     */
+    public CharsetMatch detect() {
+//   TODO:  A better implementation would be to copy the detect loop from
+//          detectAll(), and cut it short as soon as a match with a high 
confidence
+//          is found.  This is something to be done later, after things are 
otherwise
+//          working.
+        CharsetMatch matches[] = detectAll();
+
+        if (matches == null || matches.length == 0) {
+            return null;
+        }
+
+        return matches[0];
+    }
+
+    /**
+     *  Return an array of all charsets that appear to be plausible
+     *  matches with the input data.  The array is ordered with the
+     *  best quality match first.
+     * <p/>
+     * Every known recognizer is run against this detector; those reporting a
+     * confidence above zero are kept.  The confidence is capped at
+     * <code>MAX_CONFIDENCE</code>, and a recognizer whose name equals the
+     * declared encoding (ignoring case) has half of its remaining distance
+     * to <code>MAX_CONFIDENCE</code> added.
+     *
+     * @return An array of CharsetMatch objects representing possibly matching
+     *         charsets; empty if no recognizer matched.
+     *
+     * @stable ICU 3.4
+     */
+    public CharsetMatch[] detectAll() {
+        ArrayList<CharsetMatch> found = new ArrayList<CharsetMatch>();
+
+        //  Iterate over all possible charsets, remember all that
+        //    give a match quality > 0.
+        for (CharsetRecognizer recognizer : fCSRecognizers) {
+            // Only the low byte of the result is the confidence
+            int confidence = recognizer.match(this) & 0x000000ff;
+            if (confidence <= 0) {
+                continue;
+            }
+
+            // Just to be safe, constrain
+            confidence = Math.min(confidence, MAX_CONFIDENCE);
+
+            // Apply charset hint.
+            boolean declared = (fDeclaredEncoding != null)
+                    && fDeclaredEncoding.equalsIgnoreCase(recognizer.getName());
+            if (declared) {
+                // Reduce lack of confidence (delta between "sure" and current) by 50%.
+                confidence += (MAX_CONFIDENCE - confidence) / 2;
+            }
+
+            found.add(new CharsetMatch(this, recognizer, confidence));
+        }
+
+        Collections.sort(found);      // CharsetMatch compares on confidence
+        Collections.reverse(found);   //  Put best match first.
+        return found.toArray(new CharsetMatch[found.size()]);
+    }
+
+    /**
+     * Autodetect the charset of an inputStream, and return a Java Reader
+     * to access the converted input data.
+     * <p/>
+     * This is a convenience method that sets the declared encoding, calls
+     * <code>setText(in)</code> and <code>detect()</code>, and asks the
+     * resulting match for its reader.
+     * <p/>
+     *   For the input stream that supplies the character data, markSupported()
+     *   must be true; the  charset detection will read a small amount of data,
+     *   then return the stream to its original position via
+     *   the InputStream.reset() operation.  The exact amount that will
+     *    be read depends on the characteristics of the data itself.
+     *
+     * @param in The source of the byte data in the unknown charset.
+     *
+     * @param declaredEncoding  A declared encoding for the data, if available,
+     *           or null or an empty string if none is available.
+     *
+     * @return the reader of the best match, or <code>null</code> if no
+     *         charset matched or an IOException occurred.
+     *
+     * @stable ICU 3.4
+     */
+    public Reader getReader(InputStream in, String declaredEncoding) {
+        setCanonicalDeclaredEncoding(declaredEncoding);
+
+        try {
+            setText(in);
+            CharsetMatch best = detect();
+            return (best == null) ? null : best.getReader();
+        } catch (IOException e) {
+            // An I/O failure is reported the same way as "no match"
+            return null;
+        }
+    }
+
+    /**
+     * Autodetect the charset of a byte array, and return a String
+     * containing the converted input data.
+     * <p/>
+     * This is a convenience method that sets the declared encoding, calls
+     * <code>setText(in)</code> and <code>detect()</code>, and asks the
+     * resulting match for its string via <code>getString(-1)</code>.
+     *
+     * @param in The source of the byte data in the unknown charset.
+     *
+     * @param declaredEncoding  A declared encoding for the data, if available,
+     *           or null or an empty string if none is available.
+     *
+     * @return the string of the best match, or <code>null</code> if no
+     *         charset matched or an IOException occurred.
+     *
+     * @stable ICU 3.4
+     */
+    public String getString(byte[] in, String declaredEncoding) {
+        setCanonicalDeclaredEncoding(declaredEncoding);
+
+        try {
+            setText(in);
+            CharsetMatch best = detect();
+            return (best == null) ? null : best.getString(-1);
+        } catch (IOException e) {
+            // An I/O failure is reported the same way as "no match"
+            return null;
+        }
+    }
+    //   gave us a byte array.
+
+    /**
+     * Test whether or not input filtering is enabled.
+     *
+     * @return <code>true</code> if input text will be filtered, i.e. the
+     *         current value of the flag set by {@link #enableInputFilter}.
+     *
+     * @see #enableInputFilter
+     *
+     * @stable ICU 3.4
+     */
+    public boolean inputFilterEnabled() {
+        return fStripTags;
+    }
+
+    /**
+     * Enable filtering of input text. If filtering is enabled,
+     * text within angle brackets ("<" and ">") will be removed
+     * before detection.
+     * <p/>
+     * Only the flag is changed here; the stripping itself happens when the
+     * input is preprocessed by one of the <code>setText</code> methods, so
+     * text that was already set is not filtered again.
+     *
+     * @param filter <code>true</code> to enable input text filtering.
+     *
+     * @return The previous setting.
+     *
+     * @stable ICU 3.4
+     */
+    public boolean enableInputFilter(boolean filter) {
+        // Remember the old value so it can be handed back to the caller
+        boolean previous = fStripTags;
+
+        fStripTags = filter;
+
+        return previous;
+    }
+
+    /**
+     * Try to set fDeclaredEncoding to the canonical name for the given
+     * encoding, if such a charset exists.
+     * <p/>
+     * fDeclaredEncoding is left unchanged when the name is null or empty,
+     * is not a legal charset name, or names a charset that is not
+     * available in this JVM. A declared encoding is only a hint, so a bad
+     * one must not abort detection.
+     *
+     * @param encoding - name of character encoding
+     */
+    private void setCanonicalDeclaredEncoding(String encoding) {
+        if ((encoding == null) || encoding.isEmpty()) {
+            return;
+        }
+
+        try {
+            fDeclaredEncoding = Charset.forName(encoding).name();
+        } catch (IllegalArgumentException ignored) {
+            // Charset.forName never returns null; it throws
+            // IllegalCharsetNameException or UnsupportedCharsetException
+            // (both subclasses of IllegalArgumentException) for a name it
+            // cannot resolve. Keep the previous value in that case.
+        }
+    }
+
+    /*
+     *  MungeInput - after getting a set of raw input data to be analyzed, 
preprocess
+     *               it by removing what appears to be html markup.
+     */
+    private void MungeInput() {
+        int srci = 0;
+        int dsti = 0;
+        byte b;
+        boolean inMarkup = false;
+        int openTags = 0;
+        int badTags = 0;
+
+        //
+        //  html / xml markup stripping.
+        //     quick and dirty, not 100% accurate, but hopefully good enough, 
statistically.
+        //     discard everything within < brackets >
+        //     Count how many total '<' and illegal (nested) '<' occur, so we 
can make some
+        //     guess as to whether the input was actually marked up at all.
+        if (fStripTags) {
+            for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; 
srci++) {
+                b = fRawInput[srci];
+                if (b == (byte) '<') {
+                    if (inMarkup) {
+                        badTags++;
+                    }
+                    inMarkup = true;
+                    openTags++;
+                }
+
+                if (!inMarkup) {
+                    fInputBytes[dsti++] = b;
+                }
+
+                if (b == (byte) '>') {
+                    inMarkup = false;
+                }
+            }
+
+            fInputLen = dsti;
+        }
+
+        //
+        //  If it looks like this input wasn't marked up, or if it looks like 
it's
+        //    essentially nothing but markup abandon the markup stripping.
+        //    Detection will have to work on the unstripped input.
+        //
+        if (openTags < 5 || openTags / 5 < badTags ||
+                (fInputLen < 100 && fRawLength > 600)) {
+            int limit = fRawLength;
+
+            if (limit > kBufSize) {
+                limit = kBufSize;
+            }
+
+            for (srci = 0; srci < limit; srci++) {
+                fInputBytes[srci] = fRawInput[srci];
+            }
+            fInputLen = srci;
+        }
+
+        //
+        // Tally up the byte occurence statistics.
+        //   These are available for use by the various detectors.
+        //
+        Arrays.fill(fByteStats, (short) 0);
+        for (srci = 0; srci < fInputLen; srci++) {
+            int val = fInputBytes[srci] & 0x00ff;
+            fByteStats[val]++;
+        }
+
+        fC1Bytes = false;
+        for (int i = 0x80; i <= 0x9F; i += 1) {
+            if (fByteStats[i] != 0) {
+                fC1Bytes = true;
+                break;
+            }
+        }
+    }
+}

svn commit: r1723223 [25/32] - in /tika/branches/2.x: tika-core/src/test/resources/META-INF/ tika-core/src/test/resources/META-INF/services/ tika-parser-modules/ tika-parser-modules/tika-advanced-module/ tika-parser-modules/tika-advanced-module/src/ ti...

Reply via email to