ti...

bob Tue, 05 Jan 2016 19:51:42 -0800

Added: 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * A {@link ParserDecorator} that wraps another {@link Parser} and routes its
+ * SAX output through a {@link CTAKESContentHandler}, so that biomedical
+ * information can be extracted from clinical text using Apache cTAKES.
+ * <p>It is normally used by supplying an instance to
+ * {@link AutoDetectParser}, such as:
+ * <pre>{@code
+ * AutoDetectParser parser = new AutoDetectParser(new CTAKESParser());
+ * }</pre>
+ * <p>It can also be used by giving a Tika Config file similar to:
+ * <pre>
+ *  &lt;properties&gt;
+ *    &lt;parsers&gt;
+ *      &lt;parser class="org.apache.tika.parser.ctakes.CTAKESParser"&gt;
+ *        &lt;parser class="org.apache.tika.parser.DefaultParser"/&gt;
+ *      &lt;/parser&gt;
+ *    &lt;/parsers&gt;
+ *  &lt;/properties&gt;
+ * </pre>
+ * <p>Because this is a Parser Decorator, and not a normal Parser in
+ * its own right, it isn't normally selected via the Parser Service Loader.
+ */
+public class CTAKESParser extends ParserDecorator {
+    /** Serial version UID. */
+    private static final long serialVersionUID = -2313482748027097961L;
+
+    /**
+     * Creates a decorator around the parser of the default Tika configuration.
+     */
+    public CTAKESParser() {
+        this(TikaConfig.getDefaultConfig());
+    }
+
+    /**
+     * Creates a decorator around the parser provided by the given
+     * configuration.
+     *
+     * @param config Tika configuration whose parser is wrapped
+     */
+    public CTAKESParser(TikaConfig config) {
+        this(config.getParser());
+    }
+
+    /**
+     * Creates a decorator around the given parser.
+     *
+     * @param parser the parser to wrap
+     */
+    public CTAKESParser(Parser parser) {
+        super(parser);
+    }
+
+    /**
+     * Delegates to the wrapped parser, substituting the supplied handler with
+     * a {@link CTAKESContentHandler} built around it. The cTAKES configuration
+     * is looked up in the parse context; a freshly constructed
+     * {@link CTAKESConfig} is passed as the fallback value.
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        CTAKESConfig fallback = new CTAKESConfig();
+        CTAKESConfig ctakesConfig = context.get(CTAKESConfig.class, fallback);
+        CTAKESContentHandler decorated =
+                new CTAKESContentHandler(handler, metadata, ctakesConfig);
+        super.parse(stream, decorated, metadata, context);
+    }
+
+    /**
+     * Returns the name of this decoration.
+     *
+     * @return the constant string {@code "CTakes"}
+     */
+    // NOTE(review): @Override is left commented out as in the original,
+    // presumably because ParserDecorator on this branch does not declare
+    // this method - confirm before enabling it.
+    //@Override
+    public String getDecorationName() {
+        return "CTakes";
+    }
+}


Added: 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import org.apache.uima.cas.impl.XCASSerializer;
+import org.apache.uima.cas.impl.XmiCasSerializer;
+import org.apache.uima.util.XmlCasSerializer;
+
+/**
+ * The kinds of UIMA CAS serializer that can be selected when using cTAKES.
+ *
+ * <p>A CAS serializer writes a CAS in the given format. Each constant keeps
+ * the fully qualified class name of the UIMA serializer it stands for.
+ */
+public enum CTAKESSerializer {
+    /** Backed by {@link XCASSerializer}. */
+    XCAS(XCASSerializer.class),
+    /** Backed by {@link XmiCasSerializer}. */
+    XMI(XmiCasSerializer.class),
+    /** Backed by {@link XmlCasSerializer}. */
+    XML(XmlCasSerializer.class);
+
+    /** Fully qualified name of the serializer class. */
+    private final String className;
+
+    CTAKESSerializer(Class<?> serializerClass) {
+        this.className = serializerClass.getName();
+    }
+
+    /**
+     * @return the fully qualified class name of the UIMA serializer.
+     */
+    public String getClassName() {
+        return this.className;
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,265 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.net.URISyntaxException;
+
+import org.apache.ctakes.typesystem.type.refsem.UmlsConcept;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.UIMAFramework;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.cas.impl.XCASSerializer;
+import org.apache.uima.cas.impl.XmiCasSerializer;
+import org.apache.uima.cas.impl.XmiSerializationSharedData;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.resource.ResourceSpecifier;
+import org.apache.uima.util.InvalidXMLException;
+import org.apache.uima.util.XMLInputSource;
+import org.apache.uima.util.XmlCasSerializer;
+import org.xml.sax.SAXException;
+
+/**
+ * This class provides methods to extract biomedical information from plain 
text
+ * using {@see CTAKESContentHandler} that relies on Apache cTAKES.
+ * 
+ * <p>
+ * Apache cTAKES is built on top of <a href="https://uima.apache.org/";>Apache
+ * UIMA</a> framework and <a href="https://opennlp.apache.org/";>OpenNLP</a>
+ * toolkit.
+ * </p>
+ */
+public class CTAKESUtils {
+       // UMLS username property
+       private final static String CTAKES_UMLS_USER = "ctakes.umlsuser";
+
+       // UMLS password property
+       private final static String CTAKES_UMLS_PASS = "ctakes.umlspw";
+
+       /**
+        * Returns a new UIMA Analysis Engine (AE). This method ensures that 
only
+        * one instance of an AE is created.
+        * 
+        * <p>
+        * An Analysis Engine is a component responsible for analyzing 
unstructured
+        * information, discovering and representing semantic content. 
Unstructured
+        * information includes, but is not restricted to, text documents.
+        * </p>
+        * 
+        * @param aeDescriptor
+        *            pathname for XML file including an 
AnalysisEngineDescription
+        *            that contains all of the information needed to 
instantiate and
+        *            use an AnalysisEngine.
+        * @param umlsUser
+        *            UMLS username for NLM database
+        * @param umlsPass
+        *            UMLS password for NLM database
+        * @return an Analysis Engine for analyzing unstructured information.
+        * @throws IOException
+        *             if any I/O error occurs.
+        * @throws InvalidXMLException
+        *             if the input XML is not valid or does not specify a valid
+        *             ResourceSpecifier.
+        * @throws ResourceInitializationException
+        *             if a failure occurred during production of the resource.
+        * @throws URISyntaxException
+        *             if URL of the resource is not formatted strictly 
according to
+        *             to RFC2396 and cannot be converted to a URI.
+        */
+       public static AnalysisEngine getAnalysisEngine(String aeDescriptor,
+                       String umlsUser, String umlsPass) throws IOException,
+                       InvalidXMLException, ResourceInitializationException,
+                       URISyntaxException {
+               // UMLS user ID and password.
+               String aeDescriptorPath = 
CTAKESUtils.class.getResource(aeDescriptor)
+                               .toURI().getPath();
+
+               // get Resource Specifier from XML
+               XMLInputSource aeIputSource = new 
XMLInputSource(aeDescriptorPath);
+               ResourceSpecifier aeSpecifier = UIMAFramework.getXMLParser()
+                               .parseResourceSpecifier(aeIputSource);
+
+               // UMLS user ID and password
+               if ((umlsUser != null) && (!umlsUser.isEmpty()) && (umlsPass != 
null)
+                               && (!umlsPass.isEmpty())) {
+                       /*
+                        * It is highly recommended that you change UMLS 
credentials in the
+                        * XML configuration file instead of giving user and 
password using
+                        * CTAKESConfig.
+                        */
+                       System.setProperty(CTAKES_UMLS_USER, umlsUser);
+                       System.setProperty(CTAKES_UMLS_PASS, umlsPass);
+               }
+
+               // create AE
+               AnalysisEngine ae = 
UIMAFramework.produceAnalysisEngine(aeSpecifier);
+
+               return ae;
+       }
+
+       /**
+        * Returns a new JCas () appropriate for the given Analysis Engine. This
+        * method ensures that only one instance of a JCas is created. A Jcas 
is a
+        * Java Cover Classes based Object-oriented CAS (Common Analysis System)
+        * API.
+        * 
+        * <p>
+        * Important: It is highly recommended that you reuse CAS objects rather
+        * than creating new CAS objects prior to each analysis. This is 
because CAS
+        * objects may be expensive to create and may consume a significant 
amount
+        * of memory.
+        * </p>
+        * 
+        * @param ae
+        *            AnalysisEngine used to create an appropriate JCas object.
+        * @return a JCas object appropriate for the given AnalysisEngine.
+        * @throws ResourceInitializationException
+        *             if a CAS could not be created because this 
AnalysisEngine's
+        *             CAS metadata (type system, type priorities, or FS 
indexes)
+        *             are invalid.
+        */
+       public static JCas getJCas(AnalysisEngine ae)
+                       throws ResourceInitializationException {
+               JCas jcas = ae.newJCas();
+               
+               return jcas;
+       }
+
+       /**
+        * Serializes a CAS in the given format.
+        * 
+        * @param jcas
+        *            CAS (Common Analysis System) to be serialized.
+        * @param type
+        *            type of cTAKES (UIMA) serializer used to write CAS.
+        * @param prettyPrint
+        *            {@code true} to do pretty printing of output.
+        * @param stream
+        *            {@see OutputStream} object used to print out information
+        *            extracted by using cTAKES.
+        * @throws SAXException
+        *             if there was a SAX exception.
+        * @throws IOException
+        *             if any I/O error occurs.
+        */
+       public static void serialize(JCas jcas, CTAKESSerializer type, boolean 
prettyPrint,
+                       OutputStream stream) throws SAXException, IOException {
+               if (type == CTAKESSerializer.XCAS) {
+                       XCASSerializer.serialize(jcas.getCas(), stream, 
prettyPrint);
+               } else if (type == CTAKESSerializer.XMI) {
+                       XmiCasSerializer.serialize(jcas.getCas(), 
jcas.getTypeSystem(),
+                                       stream, prettyPrint, new 
XmiSerializationSharedData());
+               } else {
+                       XmlCasSerializer.serialize(jcas.getCas(), 
jcas.getTypeSystem(),
+                                       stream);
+               }
+       }
+
+       /**
+        * Returns the annotation value based on the given annotation type.
+        * 
+        * @param annotation
+        *            {@see IdentifiedAnnotation} object.
+        * @param property
+        *            {@see CTAKESAnnotationProperty} enum used to identify the
+        *            annotation type.
+        * @return the annotation value.
+        */
+       public static String getAnnotationProperty(IdentifiedAnnotation 
annotation,
+                       CTAKESAnnotationProperty property) {
+               String value = null;
+               if (property == CTAKESAnnotationProperty.BEGIN) {
+                       value = Integer.toString(annotation.getBegin());
+               } else if (property == CTAKESAnnotationProperty.END) {
+                       value = Integer.toString(annotation.getEnd());
+               } else if (property == CTAKESAnnotationProperty.CONDITIONAL) {
+                       value = Boolean.toString(annotation.getConditional());
+               } else if (property == CTAKESAnnotationProperty.CONFIDENCE) {
+                       value = Float.toString(annotation.getConfidence());
+               } else if (property == 
CTAKESAnnotationProperty.DISCOVERY_TECNIQUE) {
+                       value = 
Integer.toString(annotation.getDiscoveryTechnique());
+               } else if (property == CTAKESAnnotationProperty.GENERIC) {
+                       value = Boolean.toString(annotation.getGeneric());
+               } else if (property == CTAKESAnnotationProperty.HISTORY_OF) {
+                       value = Integer.toString(annotation.getHistoryOf());
+               } else if (property == CTAKESAnnotationProperty.ID) {
+                       value = Integer.toString(annotation.getId());
+               } else if (property == 
CTAKESAnnotationProperty.ONTOLOGY_CONCEPT_ARR) {
+                       FSArray mentions = annotation.getOntologyConceptArr();
+                       StringBuilder sb = new StringBuilder();
+                       if (mentions != null) {
+                               for (int i = 0; i < mentions.size(); i++) {
+                                       if (mentions.get(i) instanceof 
UmlsConcept) {
+                                               UmlsConcept concept = 
(UmlsConcept) mentions.get(i);
+                                               sb.append(concept.getCui());
+                                               if (i < mentions.size() - 1) {
+                                                       sb.append(",");
+                                               }
+                                       }
+                               }
+                       }
+                       value = sb.toString();
+               } else if (property == CTAKESAnnotationProperty.POLARITY) {
+                       value = Integer.toString(annotation.getPolarity());
+               }
+               return value;
+       }
+
+       /**
+        * Resets cTAKES objects, if created. This method ensures that new 
cTAKES
+        * objects (a.k.a., Analysis Engine and JCas) will be created if 
getters of
+        * this class are called.
+        * 
+        * @param ae UIMA Analysis Engine
+        * @param jcas JCas object
+        */
+       public static void reset(AnalysisEngine ae, JCas jcas) {
+               // Analysis Engine
+               resetAE(ae);
+
+               // JCas
+               resetCAS(jcas);
+               jcas = null;
+       }
+
+       /**
+        * Resets the CAS (Common Analysis System), emptying it of all content.
+        * 
+        * @param jcas JCas object
+        */
+       public static void resetCAS(JCas jcas) {
+               if (jcas != null) {
+                       jcas.reset();
+               }
+       }
+
+       /**
+        * Resets the AE (AnalysisEngine), releasing all resources held by the
+        * current AE.
+        * 
+        * @param ae UIMA Analysis Engine
+        */
+       public static void resetAE(AnalysisEngine ae) {
+               if (ae != null) {
+                       ae.destroy();
+                       ae = null;
+               }
+       }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/dif/DIFContentHandler.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/dif/DIFContentHandler.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/dif/DIFContentHandler.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/dif/DIFContentHandler.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,152 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.dif;
+
+import java.util.Stack;
+
+import org.apache.tika.metadata.Metadata;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * SAX handler that turns selected parts of the parsed XML into markup on a
+ * delegate {@link ContentHandler} and records leaf element text in a
+ * {@link Metadata} object.
+ *
+ * <p>The incoming elements themselves are not forwarded. Instead:
+ * <ul>
+ * <li>text inside {@code Entry_Title} is emitted as an {@code h3} heading
+ * prefixed with "Title: ";</li>
+ * <li>{@code Spatial_Coverage} opens a "Geographic Data: " heading and a
+ * {@code table}, and the four latitude/longitude boundary elements each add
+ * a two-cell row to the output;</li>
+ * <li>when the first end tag after a start tag is seen, the most recent
+ * character data is added to the metadata under a key made of the element
+ * path joined with '-'.</li>
+ * </ul>
+ */
+public class DIFContentHandler extends DefaultHandler {
+
+    private static final char[] NEWLINE = new char[] { '\n' };
+    private static final char[] TABSPACE = new char[] { '\t' };
+    private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
+
+    // names of the currently open elements, outermost first
+    private final Stack<String> treeStack;
+    // character data chunks seen so far that have not been popped yet
+    private final Stack<String> dataStack;
+    private final ContentHandler delegate;
+    // true between a start tag and the next end tag
+    private boolean isLeaf;
+    private final Metadata metadata;
+
+    /**
+     * @param delegate handler that receives the generated markup
+     * @param metadata metadata object that receives the leaf element values
+     */
+    public DIFContentHandler(ContentHandler delegate, Metadata metadata) {
+        this.delegate = delegate;
+        this.isLeaf = false;
+        this.metadata = metadata;
+        this.treeStack = new Stack<String>();
+        this.dataStack = new Stack<String>();
+    }
+
+    @Override
+    public void setDocumentLocator(org.xml.sax.Locator locator) {
+        delegate.setDocumentLocator(locator);
+    }
+
+    /**
+     * Remembers the character data and, depending on the innermost open
+     * element, writes a title heading or a coordinate table row to the
+     * delegate.
+     */
+    @Override
+    public void characters(char[] ch, int start, int length)
+            throws SAXException {
+        String value = new String(ch, start, length);
+        this.dataStack.push(value);
+
+        String current = this.treeStack.peek();
+        if (current.equals("Entry_Title")) {
+            newLine(1);
+            this.delegate.startElement("", "h3", "h3", EMPTY_ATTRIBUTES);
+            text("Title: " + value);
+            this.delegate.endElement("", "h3", "h3");
+        }
+        if (isBoundary(current)) {
+            newLine(2);
+            this.delegate.startElement("", "tr", "tr", EMPTY_ATTRIBUTES);
+            this.delegate.startElement("", "td", "td", EMPTY_ATTRIBUTES);
+            text(current + " : ");
+            this.delegate.endElement("", "td", "td");
+            this.delegate.startElement("", "td", "td", EMPTY_ATTRIBUTES);
+            text(value);
+            this.delegate.endElement("", "td", "td");
+            this.delegate.endElement("", "tr", "tr");
+        }
+    }
+
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length)
+            throws SAXException {
+        delegate.ignorableWhitespace(ch, start, length);
+    }
+
+    /**
+     * Tracks the opened element; for {@code Spatial_Coverage} it also writes
+     * the "Geographic Data: " heading and opens the coordinate table.
+     */
+    @Override
+    public void startElement(String uri, String localName, String qName,
+            Attributes attributes) throws SAXException {
+        this.isLeaf = true;
+        if (localName.equals("Spatial_Coverage")) {
+            newLine(1);
+            this.delegate.startElement("", "h3", "h3", EMPTY_ATTRIBUTES);
+            text("Geographic Data: ");
+            this.delegate.endElement("", "h3", "h3");
+            newLine(1);
+            this.delegate.startElement("", "table", "table",
+                    EMPTY_ATTRIBUTES);
+        }
+        this.treeStack.push(localName);
+    }
+
+    /**
+     * Closes the coordinate table for {@code Spatial_Coverage}, stores the
+     * value of a leaf element in the metadata and unwinds the stacks.
+     */
+    @Override
+    public void endElement(String uri, String localName, String qName)
+            throws SAXException {
+        if (localName.equals("Spatial_Coverage")) {
+            newLine(1);
+            this.delegate.endElement("", "table", "table");
+        }
+        if (this.isLeaf) {
+            // Build "root-...-leaf" by walking the open elements from the
+            // innermost outwards; no clone of the stack is needed.
+            String key = "";
+            for (int i = this.treeStack.size() - 1; i >= 0; i--) {
+                String name = this.treeStack.get(i);
+                key = (key.length() == 0) ? name : name + "-" + key;
+            }
+            // An element may close before any character data was reported
+            // (e.g. <a><b/></a>); use an empty value instead of failing
+            // with EmptyStackException.
+            String value = this.dataStack.isEmpty()
+                    ? "" : this.dataStack.peek();
+            this.metadata.add(key, value);
+            this.isLeaf = false;
+        }
+        this.treeStack.pop();
+        // Character data is optional per element, so there may be nothing
+        // left to discard.
+        if (!this.dataStack.isEmpty()) {
+            this.dataStack.pop();
+        }
+    }
+
+    @Override
+    public void startDocument() throws SAXException {
+        delegate.startDocument();
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+        delegate.endDocument();
+    }
+
+    @Override
+    public String toString() {
+        return delegate.toString();
+    }
+
+    /** Tells whether the element name is one of the four boundary fields. */
+    private static boolean isBoundary(String name) {
+        return name.equals("Southernmost_Latitude")
+                || name.equals("Northernmost_Latitude")
+                || name.equals("Westernmost_Longitude")
+                || name.equals("Easternmost_Longitude");
+    }
+
+    /** Writes a newline followed by the given number of tabs. */
+    private void newLine(int tabs) throws SAXException {
+        this.delegate.characters(NEWLINE, 0, NEWLINE.length);
+        for (int i = 0; i < tabs; i++) {
+            this.delegate.characters(TABSPACE, 0, TABSPACE.length);
+        }
+    }
+
+    /** Writes the given string to the delegate as character data. */
+    private void text(String s) throws SAXException {
+        this.delegate.characters(s.toCharArray(), 0, s.length());
+    }
+
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/dif/DIFParser.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/dif/DIFParser.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/dif/DIFParser.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/dif/DIFParser.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.dif;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.TaggedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for the {@code application/dif+xml} media type. The input is read
+ * with the SAX parser of the parse context and handed to a
+ * {@link DIFContentHandler}.
+ */
+public class DIFParser extends AbstractParser {
+
+    /** Serial version UID. */
+    private static final long serialVersionUID = 971505521275777826L;
+
+    /** The single media type handled by this parser. */
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.application("dif+xml"));
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses the stream as XML. The XHTML document and one enclosing
+     * {@code p} element are opened before parsing and closed in a
+     * {@code finally} block afterwards. The content handler obtained from
+     * {@link #getContentHandler} is built around a
+     * {@link TaggedContentHandler} on the caller's {@code handler}.
+     *
+     * @throws TikaException
+     *             with message "XML parse error" when the SAX parser fails
+     *             for a reason that did not originate in {@code handler}.
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        final XHTMLContentHandler xhtml =
+                new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.startElement("p");
+        TaggedContentHandler tagged = new TaggedContentHandler(handler);
+        try {
+            javax.xml.parsers.SAXParser saxParser = context.getSAXParser();
+            OfflineContentHandler offline = new OfflineContentHandler(
+                    new EmbeddedContentHandler(
+                            getContentHandler(tagged, metadata, context)));
+            saxParser.parse(new CloseShieldInputStream(stream), offline);
+        } catch (SAXException e) {
+            // rethrows as-is when the failure came from the caller's handler
+            tagged.throwIfCauseOf(e);
+            throw new TikaException("XML parse error", e);
+        } finally {
+            xhtml.endElement("p");
+            xhtml.endDocument();
+        }
+    }
+
+    /**
+     * Creates the handler that receives the SAX events of the parsed input.
+     *
+     * @return a new {@link DIFContentHandler} for the given handler and
+     *         metadata; {@code context} is not used by this implementation.
+     */
+    protected ContentHandler getContentHandler(ContentHandler handler,
+            Metadata metadata, ParseContext context) {
+        return new DIFContentHandler(handler, metadata);
+    }
+
+}
\ No newline at end of file

Added: 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/envi/EnviHeaderParser.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/envi/EnviHeaderParser.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/envi/EnviHeaderParser.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/envi/EnviHeaderParser.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * 
+ */
+package org.apache.tika.parser.envi;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+import java.nio.charset.Charset;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for ENVI header files ({@code application/envi.hdr}).
+ * <p>
+ * The header is handled as plain text: the character encoding is detected
+ * automatically and every line of the header is emitted as one XHTML
+ * paragraph. The only metadata produced are the content type and the
+ * detected content encoding.
+ */
+public class EnviHeaderParser extends AbstractParser {
+
+    private static final long serialVersionUID = -1479368523072408091L;
+
+    /** MIME type reported for every document handled by this parser. */
+    public static final String ENVI_MIME_TYPE = "application/envi.hdr";
+
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections
+            .singleton(MediaType.application("envi.hdr"));
+
+    /**
+     * Returns the single media type handled by this parser.
+     *
+     * @param context parse context (not used)
+     * @return an immutable singleton set containing the ENVI header type
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Reads the header as text and writes it, line by line, to the handler.
+     * The caller's stream is shielded from being closed by the reader.
+     *
+     * @param stream   the ENVI header content
+     * @param handler  receiver of the generated XHTML events
+     * @param metadata receives the content type and detected encoding
+     * @param context  parse context (not used)
+     * @throws IOException   if the stream cannot be read
+     * @throws SAXException  if the handler rejects an event
+     * @throws TikaException if the character encoding cannot be detected
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+
+        // Only outputting the MIME type as metadata
+        metadata.set(Metadata.CONTENT_TYPE, ENVI_MIME_TYPE);
+
+        // The following code was taken from the TXTParser
+        // Automatically detect the character encoding
+        try (AutoDetectReader reader = new AutoDetectReader(
+                new CloseShieldInputStream(stream), metadata)) {
+            Charset charset = reader.getCharset();
+            // deprecated, see TIKA-431
+            metadata.set(Metadata.CONTENT_ENCODING, charset.name());
+
+            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler,
+                    metadata);
+            xhtml.startDocument();
+
+            // one paragraph per line of the header
+            String line;
+            while ((line = reader.readLine()) != null) {
+                xhtml.startElement("p");
+                xhtml.characters(line);
+                xhtml.endElement("p");
+            }
+
+            xhtml.endDocument();
+        }
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,415 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.gdal;
+
+//JDK imports
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Scanner;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.apache.tika.parser.external.ExternalParser.INPUT_FILE_TOKEN;
+
+//Tika imports
+//SAX imports
+
+/**
+ * Wraps execution of the <a href="http://gdal.org/">Geospatial Data Abstraction
+ * Library (GDAL)</a> <code>gdalinfo</code> tool used to extract geospatial
+ * information out of hundreds of geo file formats.
+ * <p/>
+ * The parser requires the installation of GDAL and for <code>gdalinfo</code> to
+ * be located on the path.
+ * <p/>
+ * Basic information (Size, Coordinate System, Bounding Box, Driver, and
+ * resource info) are extracted as metadata, and the remaining metadata patterns
+ * are extracted and added.
+ * <p/>
+ * The output of the command is available from the provided
+ * {@link ContentHandler} in the
+ * {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)} method.
+ */
+public class GDALParser extends AbstractParser {
+
+    private static final long serialVersionUID = -3869130527323941401L;
+
+    /** Subtypes advertised under the {@code application/} top-level type. */
+    private static final String[] APPLICATION_SUBTYPES = {
+        "x-netcdf", "vrt", "x-rpf-toc", "x-ecrg-toc", "jaxa-pal-sar", "gff",
+        "elas", "aig", "aaigrid", "grass-ascii-grid", "sdts-raster", "dted",
+        "jdem", "fits", "xpm", "x-rs2", "x-pcidsk", "pcisdk", "x-srtmhgt",
+        "leveller", "terragen", "x-gmt", "x-isis3", "x-isis2", "x-pds",
+        "x-til", "x-ers", "x-l1b", "x-grib", "x-rmf", "x-wcs", "x-wms",
+        "x-msgn", "x-rst", "x-ingr", "x-gsag", "x-gsbg", "x-gs7bg",
+        "x-cosar", "x-tsx", "x-coasp", "x-r", "x-map", "x-pnm", "x-doq1",
+        "x-doq2", "x-envi", "x-envi-hdr", "x-generic-bin", "x-p-aux",
+        "x-gsc", "x-fast", "x-bt", "x-lan", "x-cpg", "x-ndf", "x-dipex",
+        "x-lcp", "x-gtx", "x-los-las", "x-ntv2", "x-ctable2", "x-ace2",
+        "x-snodas", "x-kro", "x-rik", "x-usgs-dem", "x-gxf", "x-dods",
+        "x-http", "x-bag", "x-hdf", "x-nwt-grd", "x-nwt-grc", "x-blx",
+        "x-rasterlite", "x-epsilon", "x-sdat", "x-kml", "x-xyz",
+        "x-geo-pdf", "x-ctg", "x-e00-grid", "x-zmap", "x-webp",
+        "x-ngs-geoid", "x-mbtiles", "x-ppi", "x-cappi"
+    };
+
+    /** Subtypes advertised under the {@code image/} top-level type. */
+    private static final String[] IMAGE_SUBTYPES = {
+        "geotiff", "nitf", "hfa", "sar-ceos", "ceos", "png", "jpeg",
+        "raster", "gif", "big-gif", "envisat", "fits", "bsb", "bmp",
+        "x-dimap", "x-airsar", "x-pcraster", "ilwis", "sgi", "fit", "jp2",
+        "x-mff", "x-mff2", "x-fujibas", "ida", "eir", "arg",
+        "x-hdf5-image", "adrg", "x-srp", "x-ozi"
+    };
+
+    /** Command template; may contain the ExternalParser input-file token. */
+    private String command;
+
+    /** Creates a parser that runs {@code gdalinfo} on the input file. */
+    public GDALParser() {
+        setCommand("gdalinfo ${INPUT}");
+    }
+
+    /**
+     * Sets the command template to execute.
+     *
+     * @param command command line, optionally containing the input-file token
+     */
+    public void setCommand(String command) {
+        this.command = command;
+    }
+
+    /**
+     * @return the command template currently configured
+     */
+    public String getCommand() {
+        return this.command;
+    }
+
+    /**
+     * Substitutes the input-file token of the command template with the path
+     * of the file backing the given stream.
+     * <p>
+     * The stream is cast to {@link TikaInputStream} unconditionally. If the
+     * backing file cannot be obtained, the stack trace is printed and the
+     * unmodified template is returned.
+     *
+     * @param stream the input, which must be a {@link TikaInputStream}
+     * @return the command with the token replaced, or the template as-is
+     */
+    public String processCommand(InputStream stream) {
+        TikaInputStream tis = (TikaInputStream) stream;
+        String pCommand = this.command;
+        try {
+            if (this.command.contains(INPUT_FILE_TOKEN)) {
+                pCommand = this.command.replace(INPUT_FILE_TOKEN,
+                        tis.getFile().getPath());
+            }
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+
+        return pCommand;
+    }
+
+    /**
+     * Returns the media types this parser claims to handle.
+     *
+     * @param context parse context (not used)
+     * @return a fresh, mutable set built on every call
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        Set<MediaType> types = new HashSet<MediaType>();
+        for (String subtype : APPLICATION_SUBTYPES) {
+            types.add(MediaType.application(subtype));
+        }
+        for (String subtype : IMAGE_SUBTYPES) {
+            types.add(MediaType.image(subtype));
+        }
+        return types;
+    }
+
+    /**
+     * Runs the configured command on the input and publishes its output both
+     * as metadata and as XHTML text. Returns silently, without touching the
+     * handler, when {@code gdalinfo} is not available.
+     *
+     * @param stream   the document to inspect
+     * @param handler  receiver of the command output as XHTML
+     * @param metadata receives the values extracted from the output
+     * @param context  parse context (not used)
+     * @throws IOException   if the command cannot be started
+     * @throws SAXException  if the handler rejects an event
+     * @throws TikaException if temporary resources cannot be released
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+                      Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+
+        if (!ExternalParser.check("gdalinfo")) {
+            return;
+        }
+
+        // first set up and run GDAL
+        // process the command
+        TemporaryResources tmp = new TemporaryResources();
+        try {
+            TikaInputStream tis = TikaInputStream.get(stream, tmp);
+
+            String runCommand = processCommand(tis);
+            String output = execCommand(new String[]{runCommand});
+
+            // now extract the actual metadata params
+            // from the GDAL output in the content stream
+            // to do this, we need to literally process the output
+            // from the invoked command b/c we can't read metadata and
+            // output text from the handler in ExternalParser
+            // at the same time, so for now, we can't use the
+            // ExternalParser to do this and I've had to bring some of
+            // that functionality directly into this class
+            // TODO: investigate a way to do both using ExternalParser
+
+            extractMetFromOutput(output, metadata);
+            applyPatternsToOutput(output, metadata, getPatterns());
+
+            // make the content handler and provide output there
+            // now that we have metadata
+            processOutput(handler, metadata, output);
+        } finally {
+            // release the temp file the stream may have been spooled to
+            tmp.dispose();
+        }
+    }
+
+    /**
+     * Builds the regular expressions applied to every output line, mapped to
+     * the metadata name each one populates.
+     */
+    private Map<Pattern, String> getPatterns() {
+        Map<Pattern, String> patterns = new HashMap<Pattern, String>();
+        this.addPatternWithColon("Driver", patterns);
+        this.addPatternWithColon("Files", patterns);
+        this.addPatternWithIs("Size", patterns);
+        this.addPatternWithIs("Coordinate System", patterns);
+        this.addBoundingBoxPattern("Upper Left", patterns);
+        this.addBoundingBoxPattern("Lower Left", patterns);
+        this.addBoundingBoxPattern("Upper Right", patterns);
+        this.addBoundingBoxPattern("Lower Right", patterns);
+        return patterns;
+    }
+
+    /** Registers a pattern for lines of the form {@code name: value}. */
+    private void addPatternWithColon(String name, Map<Pattern, String> patterns) {
+        patterns.put(
+                Pattern.compile(name + "\\:\\s*([A-Za-z0-9/ _\\-\\.]+)\\s*"),
+                name);
+    }
+
+    /** Registers a pattern for lines of the form {@code name is value}. */
+    private void addPatternWithIs(String name, Map<Pattern, String> patterns) {
+        patterns.put(Pattern.compile(name + " is ([A-Za-z0-9\\.,\\s`']+)"),
+                name);
+    }
+
+    /** Registers a pattern for lines of the form {@code name (x.x, y.y)}. */
+    private void addBoundingBoxPattern(String name,
+                                       Map<Pattern, String> patterns) {
+        patterns.put(
+                Pattern.compile(name
+                        + "\\s*\\(\\s*([0-9]+\\.[0-9]+\\s*,\\s*[0-9]+\\.[0-9]+\\s*)\\)\\s*"),
+                name);
+    }
+
+    /**
+     * Splits the command output into key/value pairs and adds them to the
+     * metadata. A line containing {@code =} or one of the known headings
+     * starts a new key; following lines are appended to its value.
+     *
+     * @param output the raw command output
+     * @param met    metadata object to add the pairs to
+     */
+    private void extractMetFromOutput(String output, Metadata met) {
+        String[] headings = {"Subdatasets", "Corner Coordinates"};
+        String currentKey = null;
+        StringBuilder metVal = new StringBuilder();
+        try (Scanner scanner = new Scanner(output)) {
+            while (scanner.hasNextLine()) {
+                String line = scanner.nextLine();
+                if (line.contains("=") || hasHeadings(line, headings)) {
+                    if (currentKey != null) {
+                        // time to flush this key and met val
+                        met.add(currentKey, metVal.toString());
+                    }
+                    metVal.setLength(0);
+
+                    // limit 2: keep '=' inside the value and never get an
+                    // empty array for lines made only of '='
+                    String[] lineToks = line.split("=", 2);
+                    currentKey = lineToks[0].trim();
+                    if (lineToks.length == 2) {
+                        metVal.append(lineToks[1]);
+                    }
+                } else {
+                    metVal.append(line);
+                }
+            }
+        }
+        // the last pair has no following key line to trigger its flush
+        if (currentKey != null) {
+            met.add(currentKey, metVal.toString());
+        }
+    }
+
+    /**
+     * @return {@code true} if the line contains any of the given headings;
+     *         {@code false} for a null or empty heading list
+     */
+    private boolean hasHeadings(String line, String[] headings) {
+        if (headings == null) {
+            return false;
+        }
+        for (String heading : headings) {
+            if (line.contains(heading)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Applies every pattern to every output line and records the matches.
+     * A pattern mapped to a non-empty name stores group 1 under that name;
+     * otherwise group 1 is used as the name and group 2 as the value.
+     *
+     * @param output           the raw command output
+     * @param metadata         metadata object to add the matches to
+     * @param metadataPatterns patterns mapped to their metadata names
+     */
+    private void applyPatternsToOutput(String output, Metadata metadata,
+                                       Map<Pattern, String> metadataPatterns) {
+        try (Scanner scanner = new Scanner(output)) {
+            while (scanner.hasNextLine()) {
+                String line = scanner.nextLine();
+                for (Map.Entry<Pattern, String> entry : metadataPatterns.entrySet()) {
+                    Matcher m = entry.getKey().matcher(line);
+                    if (!m.find()) {
+                        continue;
+                    }
+                    String name = entry.getValue();
+                    if (name != null && !name.equals("")) {
+                        metadata.add(name, m.group(1));
+                    } else {
+                        metadata.add(m.group(1), m.group(2));
+                    }
+                }
+            }
+        }
+    }
+
+    /**
+     * Runs the command, waits for it to finish and returns its standard
+     * output. A failure while reading the output is reported on stderr and
+     * yields an empty string.
+     *
+     * @param cmd a single command line, or a command followed by its arguments
+     * @return the decoded standard output, or {@code ""} on read failure
+     * @throws IOException if the process cannot be started
+     */
+    private String execCommand(String[] cmd) throws IOException {
+        // Execute
+        Process process;
+        if (cmd.length == 1) {
+            process = Runtime.getRuntime().exec(cmd[0]);
+        } else {
+            process = Runtime.getRuntime().exec(cmd);
+        }
+
+        String output;
+        try {
+            output = extractOutput(process.getInputStream());
+        } catch (Exception e) {
+            e.printStackTrace();
+            output = "";
+        } finally {
+            try {
+                process.waitFor();
+            } catch (InterruptedException e) {
+                // keep the interrupt visible to the caller
+                Thread.currentThread().interrupt();
+            }
+        }
+        return output;
+    }
+
+    /**
+     * Reads the whole stream as UTF-8 text and closes it.
+     *
+     * @param stream the stream to drain
+     * @return everything read from the stream
+     * @throws IOException if reading fails
+     */
+    private String extractOutput(InputStream stream) throws SAXException,
+            IOException {
+        StringBuilder sb = new StringBuilder();
+        try (Reader reader = new InputStreamReader(stream, UTF_8)) {
+            char[] buffer = new char[1024];
+            int n;
+            while ((n = reader.read(buffer)) != -1) {
+                sb.append(buffer, 0, n);
+            }
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Writes the command output to the handler as one XHTML paragraph. The
+     * document is always ended, even if writing the content fails.
+     *
+     * @param handler  receiver of the XHTML events
+     * @param metadata metadata handed to the XHTML handler
+     * @param output   the raw command output
+     * @throws SAXException if the handler rejects an event
+     * @throws IOException  if the output cannot be read back
+     */
+    private void processOutput(ContentHandler handler, Metadata metadata,
+                               String output) throws SAXException, IOException {
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        InputStream stream = new ByteArrayInputStream(output.getBytes(UTF_8));
+        try (Reader reader = new InputStreamReader(stream, UTF_8)) {
+            xhtml.startDocument();
+            xhtml.startElement("p");
+            char[] buffer = new char[1024];
+            int n;
+            while ((n = reader.read(buffer)) != -1) {
+                xhtml.characters(buffer, 0, n);
+            }
+            xhtml.endElement("p");
+
+        } finally {
+            xhtml.endDocument();
+        }
+
+    }
+
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,183 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo.topic;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Set;
+import java.util.logging.Logger;
+
+import org.apache.commons.exec.CommandLine;
+import org.apache.commons.exec.DefaultExecutor;
+import org.apache.commons.exec.ExecuteException;
+import org.apache.commons.exec.ExecuteWatchdog;
+import org.apache.commons.exec.PumpStreamHandler;
+import org.apache.commons.exec.environment.EnvironmentUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.external.ExternalParser;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import org.json.simple.JSONValue;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser that tags a document with geographic information.
+ * <p>
+ * Location names are extracted from the input with a
+ * {@link NameEntityExtractor}, resolved by running the external
+ * {@code lucene-geo-gazetteer} command, and written to the metadata as
+ * {@code Geographic_*} entries plus numbered {@code Optional_*}
+ * alternatives. The parser does nothing when the NER model or the
+ * gazetteer command is unavailable.
+ */
+public class GeoParser extends AbstractParser {
+    private static final long serialVersionUID = -2241391757440215491L;
+    private static final Logger LOG = Logger.getLogger(GeoParser.class.getName());
+    private static final MediaType MEDIA_TYPE =
+                                    MediaType.application("geotopic");
+    private static final Set<MediaType> SUPPORTED_TYPES =
+                                    Collections.singleton(MEDIA_TYPE);
+
+    private GeoParserConfig config = new GeoParserConfig();
+
+    private boolean initialized;
+    private URL modelUrl;
+    private NameEntityExtractor extractor;
+    private boolean available;
+
+    /**
+     * @param parseContext parse context (not used)
+     * @return an immutable singleton set containing the geotopic type
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Initializes this parser for the given NER model. The call is a no-op
+     * when the parser was already initialized with an equal, non-null URL.
+     * The parser is marked unavailable if the URL is null, the
+     * {@code lucene-geo-gazetteer} check fails, or the extractor cannot be
+     * created.
+     *
+     * @param modelUrl the URL to NER model
+     */
+    public void initialize(URL modelUrl) {
+        if (this.modelUrl != null && this.modelUrl.equals(modelUrl)) {
+            // Previously initialized for the same URL, no initialization needed
+            return;
+        }
+
+        this.modelUrl = modelUrl;
+
+        // Check if the NER model is available, and if the
+        //  lucene-geo-gazetteer is available
+        this.available = modelUrl != null && ExternalParser.check(
+                new String[] { "lucene-geo-gazetteer", "--help" }, -1);
+        if (this.available) {
+            try {
+                this.extractor = new NameEntityExtractor(modelUrl);
+            } catch (Exception e) {
+                LOG.warning("Named Entity Extractor setup failed: " + e);
+                this.available = false;
+            }
+        }
+        initialized = true;
+    }
+
+    /**
+     * Extracts location names from the stream, resolves them and stores the
+     * result in the metadata. The content handler receives no events.
+     * Returns without doing anything if the parser is not available.
+     *
+     * @param stream   the text to analyse
+     * @param handler  content handler (not used)
+     * @param metadata receives the resolved geographic entries
+     * @param context  may carry a {@link GeoParserConfig} overriding the
+     *                 current configuration
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+
+        /*----------------configure this parser by ParseContext Object---------------------*/
+
+        this.config = context.get(GeoParserConfig.class, config);
+        initialize(this.config.getNerModelUrl());
+        if (!isAvailable()) {
+            return;
+        }
+
+        /*----------------get locationNameEntities and best nameEntity for the input stream---------------------*/
+        extractor.getAllNameEntitiesfromInput(stream);
+        extractor.getBestNameEntity();
+        ArrayList<String> locationNameEntities = extractor.locationNameEntities;
+        String bestner = extractor.bestNameEntity;
+
+        /*------------------------resolve geonames for each ner, store results in a hashmap---------------------*/
+        HashMap<String, ArrayList<String>> resolvedGeonames = searchGeoNames(locationNameEntities);
+
+        /*----------------store locationNameEntities and their geonames in a geotag, each input has one geotag---------------------*/
+        GeoTag geotag = new GeoTag();
+        geotag.toGeoTag(resolvedGeonames, bestner);
+
+        /* add resolved entities in metadata */
+
+        metadata.add("Geographic_NAME", geotag.Geographic_NAME);
+        metadata.add("Geographic_LONGITUDE", geotag.Geographic_LONGTITUDE);
+        metadata.add("Geographic_LATITUDE", geotag.Geographic_LATITUDE);
+        for (int i = 0; i < geotag.alternatives.size(); ++i) {
+            GeoTag alter = geotag.alternatives.get(i);
+            metadata.add("Optional_NAME" + (i + 1), alter.Geographic_NAME);
+            metadata.add("Optional_LONGITUDE" + (i + 1),
+                         alter.Geographic_LONGTITUDE);
+            metadata.add("Optional_LATITUDE" + (i + 1),
+                         alter.Geographic_LATITUDE);
+        }
+    }
+
+    /**
+     * Resolves the given location names by running
+     * {@code lucene-geo-gazetteer -s name...} and parsing its JSON output.
+     * The command is killed by a watchdog after 60 seconds.
+     *
+     * @param locationNameEntities the names to look up
+     * @return the values listed for each key of the returned JSON objects;
+     *         empty if the command output is not a JSON array
+     * @throws ExecuteException if the command exits with a non-zero status
+     * @throws IOException      if the command cannot be executed
+     */
+    public HashMap<String, ArrayList<String>> searchGeoNames(
+            ArrayList<String> locationNameEntities) throws ExecuteException,
+            IOException {
+        CommandLine cmdLine = new CommandLine("lucene-geo-gazetteer");
+        cmdLine.addArgument("-s");
+        for (String name : locationNameEntities) {
+            cmdLine.addArgument(name);
+        }
+
+        LOG.fine("Executing: " + cmdLine);
+        ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
+        DefaultExecutor exec = new DefaultExecutor();
+        exec.setExitValue(0);
+        exec.setWatchdog(new ExecuteWatchdog(60000));
+        exec.setStreamHandler(new PumpStreamHandler(outputStream));
+        exec.execute(cmdLine, EnvironmentUtils.getProcEnvironment());
+
+        HashMap<String, ArrayList<String>> returnHash = new HashMap<String, ArrayList<String>>();
+
+        // JSONValue.parse yields null for unparsable text; guard against
+        // that and against any non-array result instead of failing with
+        // a NullPointerException or ClassCastException
+        Object parsed = JSONValue.parse(outputStream.toString("UTF-8"));
+        if (!(parsed instanceof JSONArray)) {
+            LOG.warning("lucene-geo-gazetteer did not return a JSON array; "
+                    + "no geonames resolved");
+            return returnHash;
+        }
+
+        JSONArray json = (JSONArray) parsed;
+        for (int i = 0; i < json.size(); i++) {
+            JSONObject obj = (JSONObject) json.get(i);
+            for (Object key : obj.keySet()) {
+                String theKey = (String) key;
+                JSONArray vals = (JSONArray) obj.get(theKey);
+                ArrayList<String> stringVals = new ArrayList<String>(
+                        vals.size());
+                for (int j = 0; j < vals.size(); j++) {
+                    stringVals.add((String) vals.get(j));
+                }
+
+                returnHash.put(theKey, stringVals);
+            }
+        }
+
+        return returnHash;
+    }
+
+    /**
+     * Reports whether the parser can run, initializing it from the current
+     * configuration first if that has not happened yet.
+     *
+     * @return {@code true} if the NER model and the gazetteer are usable
+     */
+    public boolean isAvailable() {
+        if (!initialized) {
+            initialize(config.getNerModelUrl());
+        }
+        return this.available;
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo.topic;
+
+import java.io.File;
+import java.io.Serializable;
+import java.net.MalformedURLException;
+import java.net.URL;
+
+/**
+ * Configuration for the geo-topic parser: holds the location of the NER
+ * model. By default the model is looked up as the class-path resource
+ * {@code en-ner-location.bin} next to this class.
+ */
+public class GeoParserConfig implements Serializable {
+    private static final long serialVersionUID = -3167692634278575818L;
+    private URL nerModelUrl = null;
+
+    /** Creates a configuration pointing at the bundled model resource. */
+    public GeoParserConfig() {
+        this.nerModelUrl = GeoParserConfig.class.getResource("en-ner-location.bin");
+    }
+
+    /**
+     * Points the configuration at a model file on disk. The call is ignored
+     * when the path is null, names a directory, or does not exist.
+     *
+     * @param path file-system path of the NER model
+     * @throws RuntimeException if the path cannot be converted to a URL
+     */
+    public void setNERModelPath(String path) {
+        if (path != null) {
+            File modelFile = new File(path);
+            if (!modelFile.isDirectory() && modelFile.exists()) {
+                try {
+                    this.nerModelUrl = modelFile.toURI().toURL();
+                } catch (MalformedURLException e) {
+                    throw new RuntimeException(e);
+                }
+            }
+        }
+    }
+
+    /**
+     * @param url location of the NER model; stored as given
+     */
+    public void setNerModelUrl(URL url) {
+        this.nerModelUrl = url;
+    }
+
+    /**
+     * @return the currently configured NER model location
+     */
+    public URL getNerModelUrl() {
+        return nerModelUrl;
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo.topic;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+
+/**
+ * A resolved geographic location (name, longitude, latitude) together with
+ * the alternative locations collected alongside it.
+ */
+public class GeoTag {
+    String Geographic_NAME;
+    // NOTE: "LONGTITUDE" is a misspelling of "longitude". The name is kept
+    // because the field is package-visible and may be read by other classes.
+    String Geographic_LONGTITUDE;
+    String Geographic_LATITUDE;
+    ArrayList<GeoTag> alternatives = new ArrayList<GeoTag>();
+
+    /**
+     * Sets the primary location of this tag.
+     *
+     * @param name      location name
+     * @param longitude longitude, as text
+     * @param latitude  latitude, as text
+     */
+    public void setMain(String name, String longitude, String latitude) {
+        Geographic_NAME = name;
+        Geographic_LONGTITUDE = longitude;
+        Geographic_LATITUDE = latitude;
+    }
+
+    /**
+     * Appends an alternative location to this tag.
+     *
+     * @param geotag the alternative to record
+     */
+    public void addAlternative(GeoTag geotag) {
+        alternatives.add(geotag);
+    }
+
+    /**
+     * Stores resolved geoName entities in this GeoTag. The entry whose key
+     * equals {@code bestNER} becomes the main location; every other entry is
+     * added as an alternative. Each value list is read as
+     * {@code [name, longitude, latitude]}.
+     * <p>
+     * A {@code null} map is treated as empty, and entries whose value list is
+     * {@code null} or has fewer than three elements are skipped instead of
+     * raising an exception.
+     *
+     * @param resolvedGeonames resolved entities
+     * @param bestNER best name entity among all the extracted entities for
+     *        the input stream
+     */
+    public void toGeoTag(HashMap<String, ArrayList<String>> resolvedGeonames,
+            String bestNER) {
+        if (resolvedGeonames == null) {
+            return;
+        }
+
+        // Fully qualified because this file does not import java.util.Map.
+        for (java.util.Map.Entry<String, ArrayList<String>> entry
+                : resolvedGeonames.entrySet()) {
+            ArrayList<String> cur = entry.getValue();
+            if (cur == null || cur.size() < 3) {
+                continue; // incomplete record: nothing usable to store
+            }
+
+            String key = entry.getKey();
+            if (key != null && key.equals(bestNER)) {
+                setMain(cur.get(0), cur.get(1), cur.get(2));
+            } else {
+                GeoTag alter = new GeoTag();
+                alter.setMain(cur.get(0), cur.get(1), cur.get(2));
+                addAlternative(alter);
+            }
+        }
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo.topic;
+
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.util.Span;
+import org.apache.commons.io.IOUtils;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Extracts location names from text with an OpenNLP name finder and ranks
+ * them by how often they occur.
+ */
+public class NameEntityExtractor {
+    ArrayList<String> locationNameEntities;
+    String bestNameEntity;
+    // Occurrence count per entity. It is created once in the constructor and
+    // never reset, so counts accumulate across getBestNameEntity() calls.
+    private HashMap<String, Integer> tf;
+    private final NameFinderME nameFinder;
+
+    /**
+     * Loads the name finder model from the given URL.
+     *
+     * @param modelUrl location of the OpenNLP token name finder model
+     * @throws IOException if the model cannot be read
+     */
+    public NameEntityExtractor(URL modelUrl) throws IOException {
+        this.locationNameEntities = new ArrayList<String>();
+        this.bestNameEntity = null;
+        TokenNameFinderModel model = new TokenNameFinderModel(modelUrl);
+        this.nameFinder = new NameFinderME(model);
+        this.tf = new HashMap<String, Integer>();
+    }
+
+    /**
+     * Uses OpenNLP to extract the location names that appear in the stream
+     * and appends them to {@code locationNameEntities}. OpenNLP's default
+     * Name Finder accuracy is not very good, please refer to its
+     * documentation.
+     * <p>
+     * The text is decoded as UTF-8 and tokenized on single spaces. A span
+     * text that contains commas is split at the commas, and empty pieces are
+     * dropped so that no blank entity is recorded when nothing is found.
+     *
+     * @param stream stream that passed from this.parse()
+     * @throws IOException if the stream cannot be read
+     */
+    public void getAllNameEntitiesfromInput(InputStream stream) throws IOException {
+        String[] in = IOUtils.toString(stream, UTF_8).split(" ");
+        Span[] nameE;
+
+        //name finder is not thread safe https://opennlp.apache.org/documentation/1.5.2-incubating/manual/opennlp.html#tools.namefind
+        synchronized (nameFinder) {
+            nameE = nameFinder.find(in);
+            //the same name finder is reused, so clear adaptive data
+            nameFinder.clearAdaptiveData();
+        }
+
+        for (String spanText : Span.spansToStrings(nameE, in)) {
+            for (String name : spanText.split(",")) {
+                name = name.trim();
+                if (!name.isEmpty()) {
+                    this.locationNameEntities.add(name);
+                }
+            }
+        }
+    }
+
+    /**
+     * Gets the best location entity extracted from the input stream. Simply
+     * picks the most frequent entity; if several entities share the highest
+     * frequency, one of them is picked randomly. May not be the optimal
+     * solution, but works.
+     * <p>
+     * As a side effect {@code locationNameEntities} is rewritten to hold each
+     * distinct entity once, in descending order of frequency. Does nothing
+     * when no entity has been collected.
+     */
+    public void getBestNameEntity() {
+        if (this.locationNameEntities.isEmpty()) {
+            return;
+        }
+
+        for (String name : this.locationNameEntities) {
+            Integer count = tf.get(name);
+            tf.put(name, count == null ? 1 : count + 1);
+        }
+
+        List<Map.Entry<String, Integer>> list =
+                new ArrayList<Map.Entry<String, Integer>>(tf.entrySet());
+        // Shuffle before the stable sort so that ties are broken randomly.
+        Collections.shuffle(list);
+        Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
+            public int compare(Map.Entry<String, Integer> o1,
+                    Map.Entry<String, Integer> o2) {
+                // Descending Order
+                return o2.getValue().compareTo(o1.getValue());
+            }
+        });
+
+        // update so that they are in descending order
+        this.locationNameEntities.clear();
+        for (Map.Entry<String, Integer> entry : list) {
+            this.locationNameEntities.add(entry.getKey());
+        }
+        // The list is non-empty here and sorted descending, so its head is
+        // the most frequent entity.
+        this.bestNameEntity = list.get(0).getKey();
+    }
+}

svn commit: r1723223 [23/32] - in /tika/branches/2.x: tika-core/src/test/resources/META-INF/ tika-core/src/test/resources/META-INF/services/ tika-parser-modules/ tika-parser-modules/tika-advanced-module/ tika-parser-modules/tika-advanced-module/src/ ti...

Reply via email to