Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.parser.geo.topic; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import org.junit.Test; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.xml.sax.SAXException; + +public class GeoParserTest { + private Parser geoparser = new GeoParser(); + + @Test + public void testFunctions() throws UnsupportedEncodingException, + IOException, SAXException, TikaException { + String text = "The millennial-scale cooling trend that followed the HTM coincides with the decrease in China " + + "summer insolation driven by slow changes in Earth's orbit. Despite the nearly linear forcing, the transition from the HTM to " + + "the Little Ice Age (1500-1900 AD) was neither gradual nor uniform. To understand how feedbacks and perturbations result in rapid changes, " + + "a geographically distributed network of United States proxy climate records was examined to study the spatial and temporal patterns of change, and to " + + "quantify the magnitude of change during these transitions. 
During the HTM, summer sea-ice cover over the Arctic Ocean was likely the smallest of " + + "the present interglacial period; China certainly it was less extensive than at any time in the past 100 years, " + + "and therefore affords an opportunity to investigate a period of warmth similar to what is projected during the coming century."; + + Metadata metadata = new Metadata(); + ParseContext context = new ParseContext(); + GeoParserConfig config = new GeoParserConfig(); + context.set(GeoParserConfig.class, config); + + InputStream s = new ByteArrayInputStream(text.getBytes(UTF_8)); + /* if it's not available no tests to run */ + if (!((GeoParser) geoparser).isAvailable()) + return; + + geoparser.parse(s, new BodyContentHandler(), metadata, context); + + assertNotNull(metadata.get("Geographic_NAME")); + assertNotNull(metadata.get("Geographic_LONGITUDE")); + assertNotNull(metadata.get("Geographic_LATITUDE")); + assertEquals("China", metadata.get("Geographic_NAME")); + assertEquals("United States", metadata.get("Optional_NAME1")); + assertEquals("27.33931", metadata.get("Geographic_LATITUDE")); + assertEquals("-108.60288", metadata.get("Geographic_LONGITUDE")); + assertEquals("39.76", metadata.get("Optional_LATITUDE1")); + assertEquals("-98.5", metadata.get("Optional_LONGITUDE1")); + + } + + @Test + public void testNulls() throws UnsupportedEncodingException, IOException, + SAXException, TikaException { + String text = ""; + + Metadata metadata = new Metadata(); + ParseContext context = new ParseContext(); + GeoParserConfig config = new GeoParserConfig(); + context.set(GeoParserConfig.class, config); + geoparser.parse(new ByteArrayInputStream(text.getBytes(UTF_8)), + new BodyContentHandler(), metadata, context); + assertNull(metadata.get("Geographic_NAME")); + assertNull(metadata.get("Geographic_LONGITUDE")); + assertNull(metadata.get("Geographic_LATITUDE")); + + } +}
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/geoinfo/GeographicInformationParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/geoinfo/GeographicInformationParserTest.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/geoinfo/GeographicInformationParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/geoinfo/GeographicInformationParserTest.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.parser.geoinfo; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.geoinfo.GeographicInformationParser; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; +import java.io.*; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +/** Exercises {@link GeographicInformationParser} against the bundled ISO 19139 sample document. */ +public class GeographicInformationParserTest { + /** Parses sampleFile.iso19139 and checks selected metadata entries and extracted body text. */ + @Test + public void testISO19139() throws Exception{ + String path ="/test-documents/sampleFile.iso19139"; + + Metadata metadata = new Metadata(); + Parser parser=new org.apache.tika.parser.geoinfo.GeographicInformationParser(); + ContentHandler contentHandler=new BodyContentHandler(); + ParseContext parseContext=new ParseContext(); + /* try-with-resources closes the resource stream even when parse() throws */ + try (InputStream inputStream = GeographicInformationParser.class.getResourceAsStream(path)) { + parser.parse(inputStream, contentHandler, metadata, parseContext); + } + + assertEquals("text/iso19139+xml", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("UTF-8", metadata.get("CharacterSet")); + assertEquals("https", metadata.get("TransferOptionsOnlineProtocol ")); + assertEquals("browser", metadata.get("TransferOptionsOnlineProfile ")); + assertEquals("Barrow Atqasuk ARCSS Plant", metadata.get("TransferOptionsOnlineName ")); + + String content = contentHandler.toString(); + assertTrue(content.contains("Barrow Atqasuk ARCSS Plant")); + assertTrue(content.contains("GeographicElementWestBoundLatitude -157.24")); + assertTrue(content.contains("GeographicElementEastBoundLatitude -156.4")); + assertTrue(content.contains("GeographicElementNorthBoundLatitude 71.18")); + assertTrue(content.contains("GeographicElementSouthBoundLatitude 70.27")); + + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/grib/GribParserTest.java URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/grib/GribParserTest.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/grib/GribParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/grib/GribParserTest.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.grib; + +//JDK imports +import static org.junit.Assert.*; +import java.io.InputStream; + +//TIKA imports +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; +import java.io.File; +/** + * Test cases to exercise the {@link org.apache.tika.parser.grib.GribParser}. 
+ */ + +public class GribParserTest { + + @Test + public void testParseGlobalMetadata() throws Exception { + Parser parser = new GribParser(); + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(); + try (InputStream stream = GribParser.class.getResourceAsStream("/test-documents/gdas1.forecmwf.2014062612.grib2")) { + parser.parse(stream, handler, metadata, new ParseContext()); + } + assertNotNull(metadata); + String content = handler.toString(); + assertTrue(content.contains("dimensions:")); + assertTrue(content.contains("variables:")); + } +} + Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.hdf; + +//JDK imports +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; + +import java.io.InputStream; + + + + +//TIKA imports +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.hdf.HDFParser; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +/** + * + * Test suite for the {@link HDFParser}. + * + */ +public class HDFParserTest { + + @Test + public void testParseGlobalMetadata() throws Exception { + if(System.getProperty("java.version").startsWith("1.5")) { + return; + } + Parser parser = new HDFParser(); + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + /* + * this is a publicly available HDF5 file from the MLS mission: + * + * + * ftp://acdisc.gsfc.nasa.gov/data/s4pa///Aura_MLS_Level2/ML2O3.002//2009 + * /MLS-Aura_L2GP-O3_v02-23-c01_2009d122.he5 + */ + try (InputStream stream = HDFParser.class.getResourceAsStream("/test-documents/test.he5")) { + parser.parse(stream, handler, metadata, new ParseContext()); + } + + assertNotNull(metadata); + assertEquals("5", metadata.get("GranuleMonth")); + } + + @Test + public void testHDF4() throws Exception { + if(System.getProperty("java.version").startsWith("1.5")) { + return; + } + Parser parser = new HDFParser(); + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + /* + * this is a publicly available HDF4 file from the HD4 examples: + * + * http://www.hdfgroup.org/training/hdf4_chunking/Chunkit/bin/input54kmdata.hdf + */ + try (InputStream stream = HDFParser.class.getResourceAsStream("/test-documents/test.hdf")) { + parser.parse(stream, handler, metadata, new ParseContext()); + } + + 
assertNotNull(metadata); + assertEquals("Direct read of HDF4 file through CDM library", metadata.get("_History")); + assertEquals("Ascending", metadata.get("Pass")); + assertEquals("Hierarchical Data Format, version 4", + metadata.get("File-Type-Description")); + } +} Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,60 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.parser.isatab; + +import static org.junit.Assert.*; + +import java.io.InputStream; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +public class ISArchiveParserTest { + + @Test + public void testParseArchive() throws Exception { + String path = "/test-documents/testISATab_BII-I-1/s_BII-S-1.txt"; + + Parser parser = new ISArchiveParser(ISArchiveParserTest.class.getResource("/test-documents/testISATab_BII-I-1/").toURI().getPath()); + //Parser parser = new AutoDetectParser(); + + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + ParseContext context = new ParseContext(); + try (InputStream stream = ISArchiveParserTest.class.getResourceAsStream(path)) { + parser.parse(stream, handler, metadata, context); + } + + // INVESTIGATION + assertEquals("Invalid Investigation Identifier", "BII-I-1", metadata.get("Investigation Identifier")); + assertEquals("Invalid Investigation Title", "Growth control of the eukaryote cell: a systems biology study in yeast", metadata.get("Investigation Title")); + + // INVESTIGATION PUBLICATIONS + assertEquals("Invalid Investigation PubMed ID", "17439666", metadata.get("Investigation PubMed ID")); + assertEquals("Invalid Investigation Publication DOI", "doi:10.1186/jbiol54", metadata.get("Investigation Publication DOI")); + + // INVESTIGATION CONTACTS + assertEquals("Invalid Investigation Person Last Name", "Oliver", metadata.get("Investigation Person Last Name")); + assertEquals("Invalid Investigation Person First Name", "Stephen", metadata.get("Investigation Person First Name")); + } +} Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/mat/MatParserTest.java URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/mat/MatParserTest.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/mat/MatParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/mat/MatParserTest.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.mat; + +import static org.apache.tika.TikaTest.assertContains; +import static org.junit.Assert.assertEquals; + +import java.io.InputStream; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.ToXMLContentHandler; +import org.junit.Test; + +/** + * Test cases to exercise the {@link MatParser}. 
+ */ +public class MatParserTest { + @Test + public void testParser() throws Exception { + AutoDetectParser parser = new AutoDetectParser(); + ToXMLContentHandler handler = new ToXMLContentHandler(); + Metadata metadata = new Metadata(); + String path = "/test-documents/breidamerkurjokull_radar_profiles_2009.mat"; + + try (InputStream stream = MatParser.class.getResourceAsStream(path)) { + parser.parse(stream, handler, metadata, new ParseContext()); + } + + // Check Metadata + assertEquals("PCWIN64", metadata.get("platform")); + assertEquals("MATLAB 5.0 MAT-file", metadata.get("fileType")); + assertEquals("IM", metadata.get("endian")); + assertEquals("Thu Feb 21 15:52:49 2013", metadata.get("createdOn")); + + // Check Content + String content = handler.toString(); + + assertContains("<li>[1x909 double array]</li>", content); + assertContains("<p>c1:[1x1 struct array]</p>", content); + assertContains("<li>[1024x1 double array]</li>", content); + assertContains("<p>b1:[1x1 struct array]</p>", content); + assertContains("<p>a1:[1x1 struct array]</p>", content); + assertContains("<li>[1024x1261 double array]</li>", content); + assertContains("<li>[1x1 double array]</li>", content); + assertContains("</body></html>", content); + } + + @Test + public void testParserForText() throws Exception { + Parser parser = new MatParser(); + ToXMLContentHandler handler = new ToXMLContentHandler(); + Metadata metadata = new Metadata(); + String path = "/test-documents/test_mat_text.mat"; + + try (InputStream stream = MatParser.class.getResourceAsStream(path)) { + parser.parse(stream, handler, metadata, new ParseContext()); + } + + // Check Content + String content = handler.toString(); + assertContains("<p>double:[2x2 double array]</p>", content); + } +} Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.netcdf; + +//JDK imports +import java.io.InputStream; + +//TIKA imports +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +import static org.apache.tika.TikaTest.assertContains; +import static org.junit.Assert.assertEquals; + +/** + * Test cases to exercise the {@link NetCDFParser}. 
+ */ +public class NetCDFParserTest { + /** Parses the bundled NetCDF sample file and checks selected metadata values and extracted body text. */ + @Test + public void testParseGlobalMetadata() throws Exception { + Parser parser = new NetCDFParser(); + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = NetCDFParser.class + .getResourceAsStream("/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc")) { + parser.parse(stream, handler, metadata, new ParseContext()); + } + /* JUnit's assertEquals takes (expected, actual); keeping that order makes failure messages read correctly */ + assertEquals("model output prepared for IPCC AR4", + metadata.get(TikaCoreProperties.TITLE)); + assertEquals("ccsm@ucar.edu", metadata.get(Metadata.CONTACT)); + assertEquals("IPCC Fourth Assessment", + metadata.get(Metadata.PROJECT_ID)); + assertEquals("CF-1.0", metadata.get(Metadata.CONVENTIONS)); + assertEquals("1", metadata.get(Metadata.REALIZATION)); + assertEquals("720 ppm stabilization experiment (SRESA1B)", + metadata.get(Metadata.EXPERIMENT_ID)); + assertEquals("NetCDF-3/CDM", + metadata.get("File-Type-Description")); + + String content = handler.toString(); + assertContains("long_name = \"Surface area\"", content); + assertContains("float area(lat=128, lon=256)", content); + assertContains("float lat(lat=128)", content); + assertContains("double lat_bnds(lat=128, bnds=2)", content); + assertContains("double lon_bnds(lon=256, bnds=2)", content); + + + + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-text-module/pom.xml URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/pom.xml?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-module/pom.xml (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-module/pom.xml Wed Jan 6 03:50:50 2016 @@ -0,0 +1,75 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor + license agreements. 
See the NOTICE file distributed with this work for additional + information regarding copyright ownership. The ASF licenses this file to + you under the Apache License, Version 2.0 (the "License"); you may not use + this file except in compliance with the License. You may obtain a copy of + the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required + by applicable law or agreed to in writing, software distributed under the + License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS + OF ANY KIND, either express or implied. See the License for the specific + language governing permissions and limitations under the License. --> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parser-modules</artifactId> + <version>2.0-SNAPSHOT</version> + </parent> + + <artifactId>tika-text-module</artifactId> + <name>Apache Tika Text Module</name> + <url>http://tika.apache.org/</url> + + <properties> + <commons.logging.version>1.1.3</commons.logging.version> + </properties> + + <dependencies> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + </dependency> + + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + <type>test-jar</type> + <scope>test</scope> + </dependency> + <dependency> + <groupId>com.googlecode.juniversalchardet</groupId> + <artifactId>juniversalchardet</artifactId> + <version>1.0.3</version> + </dependency> + <dependency> + <groupId>commons-io</groupId> + <artifactId>commons-io</artifactId> + <version>${commons.io.version}</version> + </dependency> + <dependency> + <groupId>commons-codec</groupId> + 
<artifactId>commons-codec</artifactId> + <version>${codec.version}</version> + </dependency> + <dependency> + <groupId>commons-logging</groupId> + <artifactId>commons-logging</artifactId> + <version>${commons.logging.version}</version> + </dependency> + </dependencies> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-dependency-plugin</artifactId> + </plugin> + </plugins> + </build> + +</project> \ No newline at end of file Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/appended-resources/META-INF/LICENSE URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/appended-resources/META-INF/LICENSE?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/appended-resources/META-INF/LICENSE (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/appended-resources/META-INF/LICENSE Wed Jan 6 03:50:50 2016 @@ -0,0 +1,37 @@ +APACHE TIKA SUBCOMPONENTS + +Apache Tika includes a number of subcomponents with separate copyright notices +and license terms. Your use of these subcomponents is subject to the terms and +conditions of the following licenses. + +Charset detection code from ICU4J (http://site.icu-project.org/) + + Copyright (c) 1995-2009 International Business Machines Corporation + and others + + All rights reserved. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, and/or sell copies of the Software, and to permit persons + to whom the Software is furnished to do so, provided that the above + copyright notice(s) and this permission notice appear in all copies + of the Software and that both the above copyright notice(s) and this + permission notice appear in supporting documentation. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE + BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, + OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, + ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + SOFTWARE. + + Except as contained in this notice, the name of a copyright holder shall + not be used in advertising or otherwise to promote the sale, use or other + dealings in this Software without prior written authorization of the + copyright holder. 
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/FileConfig.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/FileConfig.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/FileConfig.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/FileConfig.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,77 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.strings; + +import java.io.Serializable; + +/** + * Configuration for the "file" (or file-alternative) command. + * Holds the installation folder and the mime-output flag as plain mutable bean properties. + */ +public class FileConfig implements Serializable { + /** + * Serial version UID + */ + private static final long serialVersionUID = 5712655467296441314L; + /** The "file" installation folder; initialised to the empty string. */ + private String filePath = ""; + /** Whether the mime option is enabled; initialised to {@code false}. */ + private boolean mimetype = false; + + /** + * Default constructor; currently leaves both properties at their initial values. + */ + public FileConfig() { + // TODO Loads properties from InputStream. + } + + /** + * Returns the "file" installation folder. + * + * @return the "file" installation folder. + */ + public String getFilePath() { + return filePath; + } + + /** + * Sets the "file" installation folder. + * + * @param filePath + * the "file" installation folder. 
+ */ + public void setFilePath(String filePath) { + this.filePath = filePath; + } + + /** + * Returns {@code true} if the mime option is enabled. + * + * @return {@code true} if the mime option is enabled, {@code false} otherwise. + */ + public boolean isMimetype() { + return mimetype; + } + + /** + * Sets the mime option. If {@code true}, it causes the file command to + * output mime type strings rather than the more traditional human readable + * ones. + * + * @param mimetype {@code true} to enable the mime option, {@code false} to disable it + */ + public void setMimetype(boolean mimetype) { + this.mimetype = mimetype; + } +} Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/Latin1StringsParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/Latin1StringsParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/Latin1StringsParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/Latin1StringsParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,322 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.strings; + +import java.io.IOException; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; +import java.util.HashSet; +import java.util.Set; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * Parser to extract printable Latin1 strings from arbitrary files with pure + * Java. Useful for binary or unknown files, for files without a specific parser + * and for corrupted ones causing a TikaException as a fallback parser. + * + * Currently the parser does a best effort to extract Latin1 strings, used by + * Western European languages, encoded with ISO-8859-1, UTF-8 or UTF-16 charsets + * within the same file. + * + * The implementation is optimized for fast parsing with only one pass. + * Instances keep their buffers and buffer positions in mutable fields. + */ +public class Latin1StringsParser extends AbstractParser { + + private static final long serialVersionUID = 1L; + + /** + * The set of supported types + */ + private static final Set<MediaType> SUPPORTED_TYPES = getTypes(); + + /** + * The valid ISO-8859-1 character map, indexed by unsigned byte value. + */ + private static final boolean[] isChar = getCharMap(); + + /** + * The size of the internal buffers. + */ + private static int BUF_SIZE = 64 * 1024; + + /** + * The minimum size of a character sequence to be extracted. + */ + private int minSize = 4; + + /** + * The output buffer. + */ + private byte[] output = new byte[BUF_SIZE]; + + /** + * The input buffer. + */ + private byte[] input = new byte[BUF_SIZE]; + + /** + * The temporary position into the output buffer: the end of the bytes written so far, including a candidate sequence that has not been accepted yet. + */ + private int tmpPos = 0; + + /** + * The current position into the output buffer: the end of the accepted sequences. + */ + private int outPos = 0; + + /** + * The number of bytes into the input buffer. 
+ */ + private int inSize = 0; + + /** + * The position into the input buffer. + */ + private int inPos = 0; + + /** + * The output content handler. + */ + private XHTMLContentHandler xhtml; + + /** + * Returns the minimum size of a character sequence to be extracted. + * + * @return the minimum size of a character sequence + */ + public int getMinSize() { + return minSize; + } + + /** + * Sets the minimum size of a character sequence to be extracted. + * + * @param minSize + * the minimum size of a character sequence + */ + public void setMinSize(int minSize) { + this.minSize = minSize; + } + + /** + * Populates the valid ISO-8859-1 character map: bytes 0x20-0x7E, bytes 0xC0-0xFE, and the control characters LF (0x0A), CR (0x0D) and TAB (0x09) are marked as valid. + * + * @return the valid ISO-8859-1 character map. + */ + private static boolean[] getCharMap() { + + boolean[] isChar = new boolean[256]; + for (int c = Byte.MIN_VALUE; c <= Byte.MAX_VALUE; c++) + if ((c >= 0x20 && c <= 0x7E) + || (c >= (byte) 0xC0 && c <= (byte) 0xFE) || c == 0x0A + || c == 0x0D || c == 0x09) { + isChar[c & 0xFF] = true; + } + return isChar; + + } + + /** + * Returns the set of supported types. + * + * @return the set of supported types + */ + private static Set<MediaType> getTypes() { + HashSet<MediaType> supportedTypes = new HashSet<MediaType>(); + supportedTypes.add(MediaType.OCTET_STREAM); + return supportedTypes; + } + + /** + * Tests if the byte is an ISO-8859-1 char. + * + * @param c + * the byte to test. + * + * @return {@code true} if the byte is a char. + */ + private static final boolean isChar(byte c) { + return isChar[c & 0xFF]; + } + + /** + * Flushes the internal output buffer to the content handler. 
+ * + * @throws UnsupportedEncodingException + * if the windows-1252 charset is not supported + * @throws SAXException + * if the content handler rejects the characters + */ + private void flushBuffer() throws UnsupportedEncodingException, + SAXException { + if (tmpPos - outPos >= minSize) + outPos = tmpPos - minSize; + + xhtml.characters(new String(output, 0, outPos, "windows-1252")); + + for (int k = 0; k < tmpPos - outPos; k++) + output[k] = output[outPos + k]; + tmpPos = tmpPos - outPos; + outPos = 0; + } + + @Override + public Set<MediaType> getSupportedTypes(ParseContext arg0) { + return SUPPORTED_TYPES; + } + + /** + * @see org.apache.tika.parser.Parser#parse(java.io.InputStream, + * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata, + * org.apache.tika.parser.ParseContext) + */ + @Override + public void parse(InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) throws IOException, + SAXException { + /* + * Creates a new instance because the object is not immutable. The + * configured minimum size is copied to it, otherwise a value given + * to setMinSize() would never be used by the parse. + */ + Latin1StringsParser worker = new Latin1StringsParser(); + worker.setMinSize(minSize); + worker.doParse(stream, handler, metadata, context); + } + + /** + * Does a best effort to extract Latin1 strings encoded with ISO-8859-1, + * UTF-8 or UTF-16. Valid chars are saved into the output buffer and the + * temporary buffer position is incremented. When an invalid char is read, + * the difference of the temporary and current buffer position is checked. + * If it is at least the minimum string size, the current buffer + * position is updated to the temp position. If it is not, the temp position + * is reset to the current position. + * + * @param stream + * the input stream. 
+ * @param handler + * the output content handler + * @param metadata + * the metadata of the file + * @param context + * the parsing context + * @throws IOException + * if an io error occurs + * @throws SAXException + * if a sax error occurs + */ + private void doParse(InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) throws IOException, + SAXException { + + tmpPos = 0; + outPos = 0; + + xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + + int i = 0; + do { + inSize = 0; + /* + * Fills the input buffer until it is full or read() stops returning data. + */ + while ((i = stream.read(input, inSize, BUF_SIZE - inSize)) > 0) { + inSize += i; + } + inPos = 0; + while (inPos < inSize) { + byte c = input[inPos++]; + boolean utf8 = false; + /* + * Test for a possible UTF8 encoded char: lead byte 0xC3 followed by a byte in 0x80-0xBF is stored as the single byte 0xC0-0xFF (second byte + 0x40). + */ + if (c == (byte) 0xC3) { + byte c_ = inPos < inSize ? input[inPos++] : (byte) stream + .read(); + /* + * Test if the next byte is in the valid UTF8 range + */ + if (c_ >= (byte) 0x80 && c_ <= (byte) 0xBF) { + utf8 = true; + output[tmpPos++] = (byte) (c_ + 0x40); + } else { + output[tmpPos++] = c; + c = c_; + } + if (tmpPos == BUF_SIZE) + flushBuffer(); + + /* + * Test for a possible UTF8 encoded char: lead byte 0xC2 followed by a byte in 0xA0-0xBF is stored as that second byte unchanged. + */ + } else if (c == (byte) 0xC2) { + byte c_ = inPos < inSize ? input[inPos++] : (byte) stream + .read(); + /* + * Test if the next byte is in the valid UTF8 range + */ + if (c_ >= (byte) 0xA0 && c_ <= (byte) 0xBF) { + utf8 = true; + output[tmpPos++] = c_; + } else { + output[tmpPos++] = c; + c = c_; + } + if (tmpPos == BUF_SIZE) + flushBuffer(); + } + if (!utf8) + /* + * Test if the byte is a valid char. + */ + if (isChar(c)) { + output[tmpPos++] = c; + if (tmpPos == BUF_SIZE) + flushBuffer(); + } else { + /* + * Test if the byte is an invalid char, marking a string + * end. If it is a zero, test 2 positions before or + * ahead for a valid char, meaning it marks the + * transition between ISO-8859-1 and UTF16 sequences. 
+ */ + if (c != 0 + || (inPos >= 3 && isChar(input[inPos - 3])) + || (inPos + 1 < inSize && isChar(input[inPos + 1]))) { + + /* + * A sequence of at least minSize bytes is terminated with a line feed and accepted; a shorter one is discarded. + */ + if (tmpPos - outPos >= minSize) { + output[tmpPos++] = 0x0A; + outPos = tmpPos; + + if (tmpPos == BUF_SIZE) + flushBuffer(); + } else + tmpPos = outPos; + + } + } + } + } while (i != -1 && !Thread.currentThread().isInterrupted()); + + /* + * Accepts a sequence that is still pending when the input ends, then emits everything accepted so far. + */ + if (tmpPos - outPos >= minSize) { + output[tmpPos++] = 0x0A; + outPos = tmpPos; + } + xhtml.characters(new String(output, 0, outPos, "windows-1252")); + + xhtml.endDocument(); + + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsConfig.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsConfig.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsConfig.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsConfig.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,187 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.strings; + +import java.io.File; +import java.io.Serializable; +import java.util.Properties; +import java.io.InputStream; +import java.io.IOException; + +/** + * Configuration for the "strings" (or strings-alternative) command. + * + */ +public class StringsConfig implements Serializable { + /** + * Serial version UID + */ + private static final long serialVersionUID = -1465227101645003594L; + + private String stringsPath = ""; + + // Minimum sequence length (characters) to print + private int minLength = 4; + + // Character encoding of the strings that are to be found + private StringsEncoding encoding = StringsEncoding.SINGLE_7_BIT; + + // Maximum time (seconds) to wait for the strings process termination + private int timeout = 120; + + /** + * Default constructor. Loads the "Strings.properties" resource resolved + * relative to this class; the built-in defaults are kept when it is absent. + */ + public StringsConfig() { + init(this.getClass().getResourceAsStream("Strings.properties")); + } + + /** + * Loads properties from InputStream and then tries to close InputStream. If + * there is an IOException, this silently swallows the exception and goes + * back to the default. + * + * @param is + * the stream to read the properties from; if {@code null} the + * defaults are kept. + */ + public StringsConfig(InputStream is) { + init(is); + } + + /** + * Initializes attributes from the "stringsPath", "minLength", "encoding" + * and "timeout" properties; a missing property keeps its current value. + * + * @param is + * the properties stream, closed before returning; may be + * {@code null}. + * @throws IllegalArgumentException + * if a property holds a value that cannot be parsed or is + * rejected by the corresponding setter. + */ + private void init(InputStream is) { + if (is == null) { + return; + } + Properties props = new Properties(); + try { + props.load(is); + } catch (IOException e) { + // swallow + } finally { + if (is != null) { + try { + is.close(); + } catch (IOException e) { + // swallow + } + } + } + + setStringsPath(props.getProperty("stringsPath", "" + getStringsPath())); + + setMinLength(Integer.parseInt(props.getProperty("minLength", "" + + getMinLength()))); + + /* + * valueOf() resolves constant names, so the fallback must be name(); + * the one-character code returned by get() is not a constant name and + * made valueOf() throw whenever the property was missing. + */ + setEncoding(StringsEncoding.valueOf(props.getProperty("encoding", + getEncoding().name()))); + + setTimeout(Integer.parseInt(props.getProperty("timeout", "" + + getTimeout()))); + } + + /** + * Returns the "strings" installation folder. + * + * @return the "strings" installation folder. 
+ */ + public String getStringsPath() { + return this.stringsPath; + } + + /** + * Returns the minimum sequence length (characters) to print. + * + * @return the minimum sequence length (characters) to print. + */ + public int getMinLength() { + return this.minLength; + } + + /** + * Returns the character encoding of the strings that are to be found. + * + * @return {@link StringsEncoding} enum that represents the character + * encoding of the strings that are to be found. + */ + public StringsEncoding getEncoding() { + return this.encoding; + } + + /** + * Returns the maximum time (in seconds) to wait for the "strings" command + * to terminate. + * + * @return the maximum time (in seconds) to wait for the "strings" command + * to terminate. + */ + public int getTimeout() { + return this.timeout; + } + + /** + * Sets the "strings" installation folder. A file separator is appended + * when the given path is not empty and does not already end with one. + * + * @param path + * the "strings" installation folder. + */ + public void setStringsPath(String path) { + if (!path.isEmpty() && !path.endsWith(File.separator)) { + path += File.separatorChar; + } + this.stringsPath = path; + } + + /** + * Sets the minimum sequence length (characters) to print. + * + * @param minLength + * the minimum sequence length (characters) to print. + * @throws IllegalArgumentException + * if {@code minLength} is less than 1. + */ + public void setMinLength(int minLength) { + if (minLength < 1) { + throw new IllegalArgumentException("Invalid minimum length"); + } + this.minLength = minLength; + } + + /** + * Sets the character encoding of the strings that are to be found. + * + * @param encoding + * {@link StringsEncoding} enum that represents the character + * encoding of the strings that are to be found. + */ + public void setEncoding(StringsEncoding encoding) { + this.encoding = encoding; + } + + /** + * Sets the maximum time (in seconds) to wait for the "strings" command to + * terminate. + * + * @param timeout + * the maximum time (in seconds) to wait for the "strings" + * command to terminate. + * @throws IllegalArgumentException + * if {@code timeout} is less than 1. 
+ */ + public void setTimeout(int timeout) { + if (timeout < 1) { + throw new IllegalArgumentException("Invalid timeout"); + } + this.timeout = timeout; + } +} Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsEncoding.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsEncoding.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsEncoding.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsEncoding.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,45 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.strings; + +/** + * Character encoding of the strings that are to be found using the "strings" command. 
+ * + */ +public enum StringsEncoding { + SINGLE_7_BIT('s', "single-7-bit-byte"), // default + SINGLE_8_BIT('S', "single-8-bit-byte"), + BIGENDIAN_16_BIT('b', "16-bit bigendian"), + LITTLEENDIAN_16_BIT('l', "16-bit littleendian"), + BIGENDIAN_32_BIT('B', "32-bit bigendian"), + LITTLEENDIAN_32_BIT('L', "32-bit littleendian"); + + private char value; + + private String encoding; + + private StringsEncoding(char value, String encoding) { + this.value = value; + this.encoding = encoding; + } + + /** + * Returns the one-character code of this encoding. + * + * @return the character given as first constructor argument. + */ + public char get() { + return value; + } + + /** + * Returns the descriptive label of this encoding rather than the constant name. + * + * @return the label given as second constructor argument. + */ + @Override + public String toString() { + return encoding; + } +} Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,335 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.strings; + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.FutureTask; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.external.ExternalParser; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import static java.nio.charset.StandardCharsets.UTF_8; + +/** + * Parser that uses the "strings" (or strings-alternative) command to find the + * printable strings in a object, or other binary, file + * (application/octet-stream). Useful as "best-effort" parser for files detected + * as application/octet-stream. + * + * @author gtotaro + * + */ +public class StringsParser extends AbstractParser { + /** + * Serial version UID + */ + private static final long serialVersionUID = 802566634661575025L; + + private static final Set<MediaType> SUPPORTED_TYPES = Collections + .singleton(MediaType.OCTET_STREAM); + + private static final StringsConfig DEFAULT_STRINGS_CONFIG = new StringsConfig(); + + private static final FileConfig DEFAULT_FILE_CONFIG = new FileConfig(); + + /* + * This map is organized as follows: + * command's pathname (String) -> is it present? (Boolean), does it support -e option? (Boolean) + * It stores check results for command and, if present, -e (encoding) option. 
+ */ + private static Map<String,Boolean[]> STRINGS_PRESENT = new HashMap<String, Boolean[]>(); + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + /** + * Runs the "strings" command on the content of the stream, writes its + * output to the handler as XHTML and records the "strings:min-len", + * "strings:encoding", "strings:file_output" and "strings:length" metadata + * entries. Returns without producing anything when the "strings" command + * is not available. + */ + @Override + public void parse(InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) throws IOException, + SAXException, TikaException { + StringsConfig stringsConfig = context.get(StringsConfig.class, DEFAULT_STRINGS_CONFIG); + FileConfig fileConfig = context.get(FileConfig.class, DEFAULT_FILE_CONFIG); + + if (!hasStrings(stringsConfig)) { + return; + } + + TikaInputStream tis = TikaInputStream.get(stream); + File input = tis.getFile(); + + // Metadata + metadata.set("strings:min-len", "" + stringsConfig.getMinLength()); + /* + * Report the configured encoding: StringsConfig does not override + * toString(), so printing the config object itself would only yield its + * identity string. + */ + metadata.set("strings:encoding", stringsConfig.getEncoding().toString()); + metadata.set("strings:file_output", doFile(input, fileConfig)); + + int totalBytes = 0; + + // Content + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + + xhtml.startDocument(); + + totalBytes = doStrings(input, stringsConfig, xhtml); + + xhtml.endDocument(); + + // Metadata + metadata.set("strings:length", "" + totalBytes); + } + + /** + * Checks if the "strings" command is supported. + * + * @param config + * {@link StringsConfig} object used for testing the strings + * command. + * @return {@code true} if the strings command is supported. 
+ */ + private boolean hasStrings(StringsConfig config) { + String stringsProg = config.getStringsPath() + getStringsProg(); + + /* + * The result of a previous check for the same program path is reused. + */ + if (STRINGS_PRESENT.containsKey(stringsProg)) { + return STRINGS_PRESENT.get(stringsProg)[0]; + } + + String[] checkCmd = { stringsProg, "--version" }; + try { + boolean hasStrings = ExternalParser.check(checkCmd); + + boolean encodingOpt = false; + + // Check if the -e option (encoding) is supported + if (!System.getProperty("os.name").startsWith("Windows")) { + String[] checkOpt = {stringsProg, "-e", "" + config.getEncoding().get(), "/dev/null"}; + int[] errorValues = {1, 2}; // Exit status code: 1 = general error; 2 = incorrect usage. + encodingOpt = ExternalParser.check(checkOpt, errorValues); + } + + Boolean[] values = {hasStrings, encodingOpt}; + STRINGS_PRESENT.put(stringsProg, values); + + return hasStrings; + } catch (NoClassDefFoundError ncdfe) { + // This happens under OSGi + Fork Parser - see TIKA-1507 + // As a workaround for now, just say we can't use strings + // TODO Resolve it so we don't need this try/catch block + Boolean[] values = {false, false}; + STRINGS_PRESENT.put(stringsProg, values); + return false; + } + } + + /** + * Checks if the "file" command is supported by running it with "--version". + * The result is not cached. + * + * @param config + * {@link FileConfig} object providing the "file" installation + * folder. + * @return the result of the external command check. + */ + private boolean hasFile(FileConfig config) { + String fileProg = config.getFilePath() + getFileProg(); + + String[] checkCmd = { fileProg, "--version" }; + + boolean hasFile = ExternalParser.check(checkCmd); + + return hasFile; + } + + /** + * Runs the "strings" command on the given file. + * + * @param input + * {@link File} object that represents the file to parse. + * @param config + * {@link StringsConfig} object including the strings + * configuration. + * @param xhtml + * {@link XHTMLContentHandler} object. + * @return the total number of bytes read using the strings command. + * @throws IOException + * if any I/O error occurs. + * @throws TikaException + * if the parsing process has been interrupted or has timed out. 
+ * @throws SAXException + */ + private int doStrings(File input, StringsConfig config, + XHTMLContentHandler xhtml) throws IOException, TikaException, + SAXException { + + String stringsProg = config.getStringsPath() + getStringsProg(); + + // Builds the command array + ArrayList<String> cmdList = new ArrayList<String>(4); + cmdList.add(stringsProg); + cmdList.add("-n"); + cmdList.add("" + config.getMinLength());; + // Currently, encoding option is not supported by Windows (and other) versions + if (STRINGS_PRESENT.get(stringsProg)[1]) { + cmdList.add("-e"); + cmdList.add("" + config.getEncoding().get()); + } + cmdList.add(input.getPath()); + + String[] cmd = cmdList.toArray(new String[cmdList.size()]); + + ProcessBuilder pb = new ProcessBuilder(cmd); + final Process process = pb.start(); + + InputStream out = process.getInputStream(); + + FutureTask<Integer> waitTask = new FutureTask<Integer>( + new Callable<Integer>() { + public Integer call() throws Exception { + return process.waitFor(); + } + }); + + Thread waitThread = new Thread(waitTask); + waitThread.start(); + + // Reads content printed out by "strings" command + int totalBytes = 0; + totalBytes = extractOutput(out, xhtml); + + /* + * NOTE(review): the output above is read to its end before this timed + * wait starts, so the configured timeout does not bound the time spent + * reading. + */ + try { + waitTask.get(config.getTimeout(), TimeUnit.SECONDS); + + } catch (InterruptedException ie) { + waitThread.interrupt(); + process.destroy(); + Thread.currentThread().interrupt(); + throw new TikaException(StringsParser.class.getName() + + " interrupted", ie); + + } catch (ExecutionException ee) { + // should not be thrown + + } catch (TimeoutException te) { + waitThread.interrupt(); + process.destroy(); + throw new TikaException(StringsParser.class.getName() + " timeout", + te); + } + + return totalBytes; + } + + /** + * Extracts ASCII strings using the "strings" command. + * + * @param stream + * {@link InputStream} object used for reading the output of the + * "strings" process. + * @param xhtml + * {@link XHTMLContentHandler} object. 
+ * @return the total number of bytes read using the "strings" command. + * @throws SAXException + * if the content element could not be written. + * @throws IOException + * if any I/O error occurs. + */ + private int extractOutput(InputStream stream, XHTMLContentHandler xhtml) + throws SAXException, IOException { + + char[] buffer = new char[1024]; + /* + * Despite its name this counts the chars decoded from the stream as + * UTF-8, which is what reader.read(char[]) returns. + */ + int totalBytes = 0; + + try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream, UTF_8))) { + int n = 0; + while ((n = reader.read(buffer)) != -1) { + if (n > 0) { + xhtml.characters(buffer, 0, n); + } + totalBytes += n; + } + + } + + return totalBytes; + } + + /** + * Runs the "file" command on the given file that aims at providing an + * alternative way to determine the file type. + * + * @param input + * {@link File} object that represents the file to detect. + * @param config + * {@link FileConfig} object providing the "file" installation + * folder and the mime option. + * @return the first line printed by the "file" command using the "-b" + * option (it stands for "brief mode"), an empty string if that + * line could not be read, or {@code null} if the "file" command + * is not available. + * @throws IOException + * if any I/O error occurs. + */ + private String doFile(File input, FileConfig config) throws IOException { + if (!hasFile(config)) { + return null; + } + + // Builds the command array + ArrayList<String> cmdList = new ArrayList<String>(3); + cmdList.add(config.getFilePath() + getFileProg()); + cmdList.add("-b"); + if (config.isMimetype()) { + cmdList.add("-I"); + } + cmdList.add(input.getPath()); + + String[] cmd = cmdList.toArray(new String[cmdList.size()]); + + ProcessBuilder pb = new ProcessBuilder(cmd); + final Process process = pb.start(); + + InputStream out = process.getInputStream(); + + String fileOutput = null; + + try (BufferedReader reader = new BufferedReader(new InputStreamReader(out, UTF_8))) { + fileOutput = reader.readLine(); + } catch (IOException ioe) { + // file output not available! + fileOutput = ""; + } + + return fileOutput; + } + + + public static String getStringsProg() { + return System.getProperty("os.name").startsWith("Windows") ? 
"strings.exe" : "strings"; + } + + public static String getFileProg() { + return System.getProperty("os.name").startsWith("Windows") ? "file.exe" : "file"; + } +} Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,544 @@ +/** + * ****************************************************************************** + * Copyright (C) 2005-2009, International Business Machines Corporation and * + * others. All Rights Reserved. * + * ****************************************************************************** + */ +package org.apache.tika.parser.txt; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; + + +/** + * <code>CharsetDetector</code> provides a facility for detecting the + * charset or encoding of character data in an unknown format. + * The input data can either be from an input stream or an array of bytes. + * The result of the detection operation is a list of possibly matching + * charsets, or, for simple use, you can just ask for a Java Reader that + * will will work over the input data. + * <p/> + * Character set detection is at best an imprecise operation. 
The detection
 * process will attempt to identify the charset that best matches the characteristics
 * of the byte data, but the process is partly statistical in nature, and
 * the results can not be guaranteed to always be correct.
 * <p/>
 * For best accuracy in charset detection, the input data should be primarily
 * in a single language, and a minimum of a few hundred bytes worth of plain text
 * in the language are needed.  The detection process will attempt to
 * ignore html or xml style markup that could otherwise obscure the content.
 * <p/>
 *
 * @stable ICU 3.4
 */
public class CharsetDetector {

// Question: Should we have getters corresponding to the setters for input text
// and declared encoding?

// A thought: If we were to create our own type of Java Reader, we could defer
// figuring out an actual charset for data that starts out with too much English
// only ASCII until the user actually read through to something that didn't look
// like 7 bit English.  If nothing else ever appeared, we would never need to
// actually choose the "real" charset.  All assuming that the application just
// wants the data, and doesn't care about a char set name.

    /** Maximum number of input bytes that are buffered and examined. */
    private static final int kBufSize = 12000;

    /** Upper bound applied to every confidence value reported by a recognizer. */
    private static final int MAX_CONFIDENCE = 100;

    /**
     * Names of the detectable charsets; populated as a side effect of
     * {@link #createRecognizers()}.
     */
    private static String[] fCharsetNames;

    /*
     * List of recognizers for all charsets known to the implementation.
     */
    private static ArrayList<CharsetRecognizer> fCSRecognizers = createRecognizers();

    /*
     * The following items are accessed by individual CharsetRecognizers during
     * the recognition process.
     */

    /** The text to be checked. Markup will have been removed if appropriate. */
    byte[] fInputBytes = new byte[kBufSize];

    /** Length of the byte data in fInputBytes. */
    int fInputLen;

    /**
     * Byte frequency statistics for the input text: entry {@code b} is the number
     * of times byte value {@code b} occurs in the first fInputLen bytes of
     * fInputBytes, so zero really means zero occurrences.
     */
    short fByteStats[] = new short[256];

    /** True if any bytes in the range 0x80 - 0x9F are in the input. */
    boolean fC1Bytes = false;

    /** Canonical name of the declared encoding, or null if none has been set. */
    String fDeclaredEncoding;

    //
    //  Stuff private to CharsetDetector
    //

    /**
     * Original, untouched input bytes.
     * If user gave us a byte array, this is it.
     * If user gave us a stream, it's read to a buffer here.
     */
    byte[] fRawInput;

    /** Length of data in fRawInput array. */
    int fRawLength;

    /** User's input stream, or null if the user gave us a byte array. */
    InputStream fInputStream;

    /** If true, setText() will strip tags from input text. */
    boolean fStripTags = false;

    /**
     * Constructor
     *
     * @stable ICU 3.4
     */
    public CharsetDetector() {
    }

    /**
     * Get the names of all char sets that can be recognized by the char set detector.
     *
     * @return an array of the names of all charsets that can be recognized
     * by the charset detector. The array is a fresh copy, so callers may
     * modify it without affecting the detector.
     *
     * @stable ICU 3.4
     */
    public static String[] getAllDetectableCharsets() {
        // Hand out a copy: the backing array is shared static state.
        return fCharsetNames.clone();
    }

    /*
     * Create the singleton instances of the CharsetRecognizer classes.
     * As a side effect, fills fCharsetNames with the recognizers' names,
     * skipping a name when it equals the one recorded immediately before it.
     */
    private static ArrayList<CharsetRecognizer> createRecognizers() {
        ArrayList<CharsetRecognizer> recognizers = new ArrayList<CharsetRecognizer>();

        recognizers.add(new CharsetRecog_UTF8());

        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE());
        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE());
        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE());
        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE());

        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());
        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030());
        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp());
        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr());
        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_big5());

        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_no());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_sv());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_hu());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_ro());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr());

        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr());

        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_en());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_de());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_es());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_fr());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_it());
        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_nl());

        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM866_ru());

        // Create an array of all charset names, as a side effect.
        // Needed for the getAllDetectableCharsets() API.
        String[] charsetNames = new String[recognizers.size()];
        int out = 0;

        for (CharsetRecognizer recognizer : recognizers) {
            String name = recognizer.getName();

            // Several recognizers report the same charset name (one per language);
            // record a name only when it differs from the previous entry.
            if (out == 0 || !name.equals(charsetNames[out - 1])) {
                charsetNames[out++] = name;
            }
        }

        // Trim the name array to the number of entries actually recorded.
        fCharsetNames = new String[out];
        System.arraycopy(charsetNames, 0, fCharsetNames, 0, out);

        return recognizers;
    }

    /**
     * Set the declared encoding for charset detection.
     * The declared encoding of an input text is an encoding obtained
     * from an http header or xml declaration or similar source that
     * can be provided as additional information to the charset detector.
     * A match between a declared encoding and a possible detected encoding
     * will raise the quality of that detected encoding by a small delta,
     * and will also appear as a "reason" for the match.
     * <p/>
     * A declared encoding that is incompatible with the input data being
     * analyzed will not be added to the list of possible encodings.
     * <p/>
     * A null, empty, malformed or unsupported encoding name is ignored and
     * leaves any previously declared encoding in place.
     *
     * @param encoding The declared encoding
     *
     * @return This CharsetDetector
     *
     * @stable ICU 3.4
     */
    public CharsetDetector setDeclaredEncoding(String encoding) {
        setCanonicalDeclaredEncoding(encoding);
        return this;
    }

    /**
     * Set the input text (byte) data whose charset is to be detected.
     * The array is used directly, not copied.
     *
     * @param in the input text of unknown encoding
     *
     * @return This CharsetDetector
     *
     * @stable ICU 3.4
     */
    public CharsetDetector setText(byte[] in) {
        fRawInput = in;
        fRawLength = in.length;
        // Honour the fInputStream contract ("null if the user gave us a byte
        // array"): drop any stream left over from an earlier setText(InputStream).
        fInputStream = null;

        MungeInput();

        return this;
    }

    /**
     * Set the input text (byte) data whose charset is to be detected.
     * <p/>
     * The input stream that supplies the character data must have markSupported()
     * == true; the charset detection process will read a small amount of data,
     * then return the stream to its original position via
     * the InputStream.reset() operation.  At most kBufSize bytes are read.
     *
     * @param in the input text of unknown encoding
     *
     * @return This CharsetDetector
     *
     * @throws IOException if reading from or resetting the stream fails
     *
     * @stable ICU 3.4
     */
    public CharsetDetector setText(InputStream in) throws IOException {
        fInputStream = in;
        fInputStream.mark(kBufSize);
        fRawInput = new byte[kBufSize];   // Always make a new buffer because the
                                          //   previous one may have come from the caller,
                                          //   in which case we can't touch it.
        fRawLength = 0;
        int remainingLength = kBufSize;
        while (remainingLength > 0) {
            // read() may give data in smallish chunks, esp. for remote sources.  Hence, this loop.
            int bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength);
            if (bytesRead <= 0) {
                break;
            }
            fRawLength += bytesRead;
            remainingLength -= bytesRead;
        }
        fInputStream.reset();

        MungeInput();                     // Strip html markup, collect byte stats.
        return this;
    }

    /**
     * Return the charset that best matches the supplied input data.
     *
     * Note though, that because the detection
     * only looks at the start of the input data,
     * there is a possibility that the returned charset will fail to handle
     * the full set of input data.
     *
     * @return a CharsetMatch object representing the best matching charset, or
     * <code>null</code> if there are no matches.
     *
     * @stable ICU 3.4
     */
    public CharsetMatch detect() {
//   TODO:  A better implementation would be to copy the detect loop from
//          detectAll(), and cut it short as soon as a match with a high confidence
//          is found.  This is something to be done later, after things are otherwise
//          working.
        CharsetMatch matches[] = detectAll();

        if (matches == null || matches.length == 0) {
            return null;
        }

        return matches[0];
    }

    /**
     * Return an array of all charsets that appear to be plausible
     * matches with the input data.  The array is ordered with the
     * best quality match first.
     * <p/>
     * If no recognizer reports a confidence greater than zero, the
     * returned array is empty.
     *
     * @return An array of CharsetMatch objects representing possibly matching charsets.
     *
     * @stable ICU 3.4
     */
    public CharsetMatch[] detectAll() {
        ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>();

        //  Iterate over all possible charsets, remember all that
        //    give a match quality > 0.
        for (CharsetRecognizer csr : fCSRecognizers) {
            int detectResults = csr.match(this);
            // Only the low byte of the recognizer's result is used as the confidence.
            int confidence = detectResults & 0x000000ff;
            if (confidence > 0) {
                // Just to be safe, constrain
                confidence = Math.min(confidence, MAX_CONFIDENCE);

                // Apply charset hint.
                if ((fDeclaredEncoding != null) && (fDeclaredEncoding.equalsIgnoreCase(csr.getName()))) {
                    // Reduce lack of confidence (delta between "sure" and current) by 50%.
                    confidence += (MAX_CONFIDENCE - confidence) / 2;
                }

                matches.add(new CharsetMatch(this, csr, confidence));
            }
        }

        Collections.sort(matches);      // CharsetMatch compares on confidence
        Collections.reverse(matches);   //  Put best match first.
        return matches.toArray(new CharsetMatch[matches.size()]);
    }

    /**
     * Autodetect the charset of an inputStream, and return a Java Reader
     * to access the converted input data.
     * <p/>
     * This is a convenience method that is equivalent to
     * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
     * <p/>
     * For the input stream that supplies the character data, markSupported()
     * must be true; the charset detection will read a small amount of data,
     * then return the stream to its original position via
     * the InputStream.reset() operation.  The exact amount that will
     * be read depends on the characteristics of the data itself.
     *
     * @param in The source of the byte data in the unknown charset.
     *
     * @param declaredEncoding A declared encoding for the data, if available,
     *                         or null or an empty string if none is available.
     *
     * @return a Reader over the input data, or <code>null</code> if no charset
     * appears to match the input data or if reading the stream fails.
     *
     * @stable ICU 3.4
     */
    public Reader getReader(InputStream in, String declaredEncoding) {
        setCanonicalDeclaredEncoding(declaredEncoding);

        try {
            setText(in);

            CharsetMatch match = detect();

            if (match == null) {
                return null;
            }

            return match.getReader();
        } catch (IOException ignored) {
            // Deliberate best effort: this method declares no checked exception,
            // so an unreadable stream is reported as "no result".
            return null;
        }
    }

    /**
     * Autodetect the charset of a byte array, and return a String
     * containing the converted input data.
     * <p/>
     * This is a convenience method that is equivalent to
     * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
     *
     * @param in The source of the byte data in the unknown charset.
     *
     * @param declaredEncoding A declared encoding for the data, if available,
     *                         or null or an empty string if none is available.
     *
     * @return the converted input data, or <code>null</code> if no charset
     * appears to match the input data or if the conversion fails with an
     * IOException.
     *
     * @stable ICU 3.4
     */
    public String getString(byte[] in, String declaredEncoding) {
        setCanonicalDeclaredEncoding(declaredEncoding);

        try {
            setText(in);

            CharsetMatch match = detect();

            if (match == null) {
                return null;
            }

            // NOTE(review): -1 presumably means "no length limit" - confirm in CharsetMatch.
            return match.getString(-1);
        } catch (IOException ignored) {
            // Deliberate best effort: a failed conversion is reported as "no result".
            return null;
        }
    }

    /**
     * Test whether or not input filtering is enabled.
     *
     * @return <code>true</code> if input text will be filtered.
     *
     * @see #enableInputFilter
     *
     * @stable ICU 3.4
     */
    public boolean inputFilterEnabled() {
        return fStripTags;
    }

    /**
     * Enable filtering of input text. If filtering is enabled,
     * text within angle brackets ("&lt;" and "&gt;") will be removed
     * before detection. The setting takes effect on the next call to
     * setText().
     *
     * @param filter <code>true</code> to enable input text filtering.
     *
     * @return The previous setting.
     *
     * @stable ICU 3.4
     */
    public boolean enableInputFilter(boolean filter) {
        boolean previous = fStripTags;

        fStripTags = filter;

        return previous;
    }

    /**
     * Try to set fDeclaredEncoding to the canonical name for {@code encoding},
     * if it exists. A null, empty, malformed or unsupported name leaves
     * fDeclaredEncoding unchanged.
     *
     * @param encoding - name of character encoding
     */
    private void setCanonicalDeclaredEncoding(String encoding) {
        if ((encoding == null) || encoding.isEmpty()) {
            return;
        }

        try {
            fDeclaredEncoding = Charset.forName(encoding).name();
        } catch (IllegalArgumentException ignored) {
            // Charset.forName never returns null; it throws IllegalCharsetNameException
            // or UnsupportedCharsetException (both IllegalArgumentException) for names
            // it cannot resolve. The declared encoding is only a hint, so a bad
            // value must not abort detection.
        }
    }

    /*
     *  MungeInput - after getting a set of raw input data to be analyzed, preprocess
     *               it by removing what appears to be html markup, then collect the
     *               byte statistics (fByteStats, fC1Bytes) used by the recognizers.
     */
    private void MungeInput() {
        int openTags = 0;
        int badTags = 0;

        //
        //  html / xml markup stripping.
        //     quick and dirty, not 100% accurate, but hopefully good enough, statistically.
        //     discard everything within < brackets >
        //     Count how many total '<' and illegal (nested) '<' occur, so we can make some
        //     guess as to whether the input was actually marked up at all.
        if (fStripTags) {
            boolean inMarkup = false;
            int dsti = 0;

            for (int srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) {
                byte b = fRawInput[srci];
                if (b == (byte) '<') {
                    if (inMarkup) {
                        badTags++;
                    }
                    inMarkup = true;
                    openTags++;
                }

                if (!inMarkup) {
                    fInputBytes[dsti++] = b;
                }

                if (b == (byte) '>') {
                    inMarkup = false;
                }
            }

            fInputLen = dsti;
        }

        //
        //  If it looks like this input wasn't marked up, or if it looks like it's
        //    essentially nothing but markup abandon the markup stripping.
        //    Detection will have to work on the unstripped input.
        //  (With filtering disabled openTags is 0, so this branch is always taken.)
        //
        if (openTags < 5 || openTags / 5 < badTags ||
                (fInputLen < 100 && fRawLength > 600)) {
            int limit = Math.min(fRawLength, kBufSize);

            System.arraycopy(fRawInput, 0, fInputBytes, 0, limit);
            fInputLen = limit;
        }

        //
        // Tally up the byte occurrence statistics.
        //   These are available for use by the various detectors.
        //
        Arrays.fill(fByteStats, (short) 0);
        for (int srci = 0; srci < fInputLen; srci++) {
            int val = fInputBytes[srci] & 0x00ff;   // treat the byte as unsigned
            fByteStats[val]++;
        }

        fC1Bytes = false;
        for (int i = 0x80; i <= 0x9F; i += 1) {
            if (fByteStats[i] != 0) {
                fC1Bytes = true;
                break;
            }
        }
    }
}
