Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
package org.apache.tika.parser.ctakes;

import java.io.IOException;
import java.io.InputStream;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * Parser decorator that wraps another {@link Parser} and places a
 * {@link CTAKESContentHandler} in front of the caller's handler, in order to
 * extract biomedical information from clinical text using Apache cTAKES.
 *
 * <p>It is normally used by supplying an instance to
 * {@link AutoDetectParser}, such as:
 * <pre>
 * AutoDetectParser parser = new AutoDetectParser(new CTAKESParser());
 * </pre>
 *
 * <p>It can also be used by giving a Tika Config file similar to:
 * <pre>
 * &lt;properties&gt;
 *   &lt;parsers&gt;
 *     &lt;parser class="org.apache.tika.parser.ctakes.CTAKESParser"&gt;
 *       &lt;parser class="org.apache.tika.parser.DefaultParser"/&gt;
 *     &lt;/parser&gt;
 *   &lt;/parsers&gt;
 * &lt;/properties&gt;
 * </pre>
 *
 * <p>Because this is a parser decorator, and not a normal parser in its own
 * right, it isn't normally selected via the Parser Service Loader.
 */
public class CTAKESParser extends ParserDecorator {
    /** Serial version UID. */
    private static final long serialVersionUID = -2313482748027097961L;

    /**
     * Decorates the parser of the default {@link TikaConfig}.
     */
    public CTAKESParser() {
        this(TikaConfig.getDefaultConfig());
    }

    /**
     * Decorates the parser provided by the given configuration.
     *
     * @param config configuration whose parser is wrapped
     */
    public CTAKESParser(TikaConfig config) {
        this(config.getParser());
    }

    /**
     * Decorates the given parser.
     *
     * @param parser parser to wrap
     */
    public CTAKESParser(Parser parser) {
        super(parser);
    }

    /**
     * Delegates to the wrapped parser, but with the supplied handler wrapped
     * in a {@link CTAKESContentHandler}. The cTAKES settings are read from
     * the parse context; a default {@link CTAKESConfig} is used when the
     * context carries none.
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context) throws IOException,
            SAXException, TikaException {
        final CTAKESConfig fallback = new CTAKESConfig();
        final CTAKESConfig settings = context.get(CTAKESConfig.class, fallback);
        final CTAKESContentHandler wrapped =
                new CTAKESContentHandler(handler, metadata, settings);

        super.parse(stream, wrapped, metadata, context);
    }

    /**
     * Returns the name of this decoration.
     *
     * <p>NOTE(review): the override annotation is deliberately left commented
     * out, presumably because the parent class on this branch may not declare
     * this method - confirm before enabling it.
     */
    //@Override
    public String getDecorationName() {
        return "CTakes";
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.ctakes; + +import org.apache.uima.cas.impl.XCASSerializer; +import org.apache.uima.cas.impl.XmiCasSerializer; +import org.apache.uima.util.XmlCasSerializer; + +/** + * Enumeration for types of cTAKES (UIMA) CAS serializer supported by cTAKES. + * + * A CAS serializer writes a CAS in the given format. 
+ */ +public enum CTAKESSerializer { + XCAS(XCASSerializer.class.getName()), + XMI(XmiCasSerializer.class.getName()), + XML(XmlCasSerializer.class.getName()); + + private final String className; + + private CTAKESSerializer(String className) { + this.className = className; + } + + public String getClassName() { + return className; + } +} Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,265 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.ctakes; + +import java.io.IOException; +import java.io.OutputStream; +import java.net.URISyntaxException; + +import org.apache.ctakes.typesystem.type.refsem.UmlsConcept; +import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation; +import org.apache.uima.UIMAFramework; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.cas.impl.XCASSerializer; +import org.apache.uima.cas.impl.XmiCasSerializer; +import org.apache.uima.cas.impl.XmiSerializationSharedData; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.FSArray; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.resource.ResourceSpecifier; +import org.apache.uima.util.InvalidXMLException; +import org.apache.uima.util.XMLInputSource; +import org.apache.uima.util.XmlCasSerializer; +import org.xml.sax.SAXException; + +/** + * This class provides methods to extract biomedical information from plain text + * using {@see CTAKESContentHandler} that relies on Apache cTAKES. + * + * <p> + * Apache cTAKES is built on top of <a href="https://uima.apache.org/">Apache + * UIMA</a> framework and <a href="https://opennlp.apache.org/">OpenNLP</a> + * toolkit. + * </p> + */ +public class CTAKESUtils { + // UMLS username property + private final static String CTAKES_UMLS_USER = "ctakes.umlsuser"; + + // UMLS password property + private final static String CTAKES_UMLS_PASS = "ctakes.umlspw"; + + /** + * Returns a new UIMA Analysis Engine (AE). This method ensures that only + * one instance of an AE is created. + * + * <p> + * An Analysis Engine is a component responsible for analyzing unstructured + * information, discovering and representing semantic content. Unstructured + * information includes, but is not restricted to, text documents. 
+ * </p> + * + * @param aeDescriptor + * pathname for XML file including an AnalysisEngineDescription + * that contains all of the information needed to instantiate and + * use an AnalysisEngine. + * @param umlsUser + * UMLS username for NLM database + * @param umlsPass + * UMLS password for NLM database + * @return an Analysis Engine for analyzing unstructured information. + * @throws IOException + * if any I/O error occurs. + * @throws InvalidXMLException + * if the input XML is not valid or does not specify a valid + * ResourceSpecifier. + * @throws ResourceInitializationException + * if a failure occurred during production of the resource. + * @throws URISyntaxException + * if URL of the resource is not formatted strictly according to + * to RFC2396 and cannot be converted to a URI. + */ + public static AnalysisEngine getAnalysisEngine(String aeDescriptor, + String umlsUser, String umlsPass) throws IOException, + InvalidXMLException, ResourceInitializationException, + URISyntaxException { + // UMLS user ID and password. + String aeDescriptorPath = CTAKESUtils.class.getResource(aeDescriptor) + .toURI().getPath(); + + // get Resource Specifier from XML + XMLInputSource aeIputSource = new XMLInputSource(aeDescriptorPath); + ResourceSpecifier aeSpecifier = UIMAFramework.getXMLParser() + .parseResourceSpecifier(aeIputSource); + + // UMLS user ID and password + if ((umlsUser != null) && (!umlsUser.isEmpty()) && (umlsPass != null) + && (!umlsPass.isEmpty())) { + /* + * It is highly recommended that you change UMLS credentials in the + * XML configuration file instead of giving user and password using + * CTAKESConfig. + */ + System.setProperty(CTAKES_UMLS_USER, umlsUser); + System.setProperty(CTAKES_UMLS_PASS, umlsPass); + } + + // create AE + AnalysisEngine ae = UIMAFramework.produceAnalysisEngine(aeSpecifier); + + return ae; + } + + /** + * Returns a new JCas () appropriate for the given Analysis Engine. 
This + * method ensures that only one instance of a JCas is created. A Jcas is a + * Java Cover Classes based Object-oriented CAS (Common Analysis System) + * API. + * + * <p> + * Important: It is highly recommended that you reuse CAS objects rather + * than creating new CAS objects prior to each analysis. This is because CAS + * objects may be expensive to create and may consume a significant amount + * of memory. + * </p> + * + * @param ae + * AnalysisEngine used to create an appropriate JCas object. + * @return a JCas object appropriate for the given AnalysisEngine. + * @throws ResourceInitializationException + * if a CAS could not be created because this AnalysisEngine's + * CAS metadata (type system, type priorities, or FS indexes) + * are invalid. + */ + public static JCas getJCas(AnalysisEngine ae) + throws ResourceInitializationException { + JCas jcas = ae.newJCas(); + + return jcas; + } + + /** + * Serializes a CAS in the given format. + * + * @param jcas + * CAS (Common Analysis System) to be serialized. + * @param type + * type of cTAKES (UIMA) serializer used to write CAS. + * @param prettyPrint + * {@code true} to do pretty printing of output. + * @param stream + * {@see OutputStream} object used to print out information + * extracted by using cTAKES. + * @throws SAXException + * if there was a SAX exception. + * @throws IOException + * if any I/O error occurs. + */ + public static void serialize(JCas jcas, CTAKESSerializer type, boolean prettyPrint, + OutputStream stream) throws SAXException, IOException { + if (type == CTAKESSerializer.XCAS) { + XCASSerializer.serialize(jcas.getCas(), stream, prettyPrint); + } else if (type == CTAKESSerializer.XMI) { + XmiCasSerializer.serialize(jcas.getCas(), jcas.getTypeSystem(), + stream, prettyPrint, new XmiSerializationSharedData()); + } else { + XmlCasSerializer.serialize(jcas.getCas(), jcas.getTypeSystem(), + stream); + } + } + + /** + * Returns the annotation value based on the given annotation type. 
+ * + * @param annotation + * {@see IdentifiedAnnotation} object. + * @param property + * {@see CTAKESAnnotationProperty} enum used to identify the + * annotation type. + * @return the annotation value. + */ + public static String getAnnotationProperty(IdentifiedAnnotation annotation, + CTAKESAnnotationProperty property) { + String value = null; + if (property == CTAKESAnnotationProperty.BEGIN) { + value = Integer.toString(annotation.getBegin()); + } else if (property == CTAKESAnnotationProperty.END) { + value = Integer.toString(annotation.getEnd()); + } else if (property == CTAKESAnnotationProperty.CONDITIONAL) { + value = Boolean.toString(annotation.getConditional()); + } else if (property == CTAKESAnnotationProperty.CONFIDENCE) { + value = Float.toString(annotation.getConfidence()); + } else if (property == CTAKESAnnotationProperty.DISCOVERY_TECNIQUE) { + value = Integer.toString(annotation.getDiscoveryTechnique()); + } else if (property == CTAKESAnnotationProperty.GENERIC) { + value = Boolean.toString(annotation.getGeneric()); + } else if (property == CTAKESAnnotationProperty.HISTORY_OF) { + value = Integer.toString(annotation.getHistoryOf()); + } else if (property == CTAKESAnnotationProperty.ID) { + value = Integer.toString(annotation.getId()); + } else if (property == CTAKESAnnotationProperty.ONTOLOGY_CONCEPT_ARR) { + FSArray mentions = annotation.getOntologyConceptArr(); + StringBuilder sb = new StringBuilder(); + if (mentions != null) { + for (int i = 0; i < mentions.size(); i++) { + if (mentions.get(i) instanceof UmlsConcept) { + UmlsConcept concept = (UmlsConcept) mentions.get(i); + sb.append(concept.getCui()); + if (i < mentions.size() - 1) { + sb.append(","); + } + } + } + } + value = sb.toString(); + } else if (property == CTAKESAnnotationProperty.POLARITY) { + value = Integer.toString(annotation.getPolarity()); + } + return value; + } + + /** + * Resets cTAKES objects, if created. 
This method ensures that new cTAKES + * objects (a.k.a., Analysis Engine and JCas) will be created if getters of + * this class are called. + * + * @param ae UIMA Analysis Engine + * @param jcas JCas object + */ + public static void reset(AnalysisEngine ae, JCas jcas) { + // Analysis Engine + resetAE(ae); + + // JCas + resetCAS(jcas); + jcas = null; + } + + /** + * Resets the CAS (Common Analysis System), emptying it of all content. + * + * @param jcas JCas object + */ + public static void resetCAS(JCas jcas) { + if (jcas != null) { + jcas.reset(); + } + } + + /** + * Resets the AE (AnalysisEngine), releasing all resources held by the + * current AE. + * + * @param ae UIMA Analysis Engine + */ + public static void resetAE(AnalysisEngine ae) { + if (ae != null) { + ae.destroy(); + ae = null; + } + } +} Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/dif/DIFContentHandler.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/dif/DIFContentHandler.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/dif/DIFContentHandler.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/dif/DIFContentHandler.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,152 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.dif; + +import java.util.Stack; + +import org.apache.tika.metadata.Metadata; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; +import org.xml.sax.helpers.DefaultHandler; + +public class DIFContentHandler extends DefaultHandler { + + private static final char[] NEWLINE = new char[] { '\n' }; + private static final char[] TABSPACE = new char[] { '\t' }; + private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl(); + + private Stack<String> treeStack; + private Stack<String> dataStack; + private final ContentHandler delegate; + private boolean isLeaf; + private Metadata metadata; + + public DIFContentHandler(ContentHandler delegate, Metadata metadata) { + this.delegate = delegate; + this.isLeaf = false; + this.metadata = metadata; + this.treeStack = new Stack<String>(); + this.dataStack = new Stack<String>(); + } + + @Override + public void setDocumentLocator(org.xml.sax.Locator locator) { + delegate.setDocumentLocator(locator); + } + + @Override + public void characters(char[] ch, int start, int length) + throws SAXException { + String value = (new String(ch, start, length)).toString(); + this.dataStack.push(value); + + if (this.treeStack.peek().equals("Entry_Title")) { + this.delegate.characters(NEWLINE, 0, NEWLINE.length); + this.delegate.characters(TABSPACE, 0, TABSPACE.length); + this.delegate.startElement("", "h3", "h3", EMPTY_ATTRIBUTES); + String title = "Title: "; + title = title + value; + 
this.delegate.characters(title.toCharArray(), 0, title.length()); + this.delegate.endElement("", "h3", "h3"); + } + if (this.treeStack.peek().equals("Southernmost_Latitude") + || this.treeStack.peek().equals("Northernmost_Latitude") + || this.treeStack.peek().equals("Westernmost_Longitude") + || this.treeStack.peek().equals("Easternmost_Longitude")) { + this.delegate.characters(NEWLINE, 0, NEWLINE.length); + this.delegate.characters(TABSPACE, 0, TABSPACE.length); + this.delegate.characters(TABSPACE, 0, TABSPACE.length); + this.delegate.startElement("", "tr", "tr", EMPTY_ATTRIBUTES); + this.delegate.startElement("", "td", "td", EMPTY_ATTRIBUTES); + String key = this.treeStack.peek() + " : "; + this.delegate.characters(key.toCharArray(), 0, key.length()); + this.delegate.endElement("", "td", "td"); + this.delegate.startElement("", "td", "td", EMPTY_ATTRIBUTES); + this.delegate.characters(value.toCharArray(), 0, value.length()); + this.delegate.endElement("", "td", "td"); + this.delegate.endElement("", "tr", "tr"); + } + } + + @Override + public void ignorableWhitespace(char[] ch, int start, int length) + throws SAXException { + delegate.ignorableWhitespace(ch, start, length); + } + + @Override + public void startElement(String uri, String localName, String qName, + Attributes attributes) throws SAXException { + this.isLeaf = true; + if (localName.equals("Spatial_Coverage")) { + this.delegate.characters(NEWLINE, 0, NEWLINE.length); + this.delegate.characters(TABSPACE, 0, TABSPACE.length); + this.delegate.startElement("", "h3", "h3", EMPTY_ATTRIBUTES); + String value = "Geographic Data: "; + this.delegate.characters(value.toCharArray(), 0, value.length()); + this.delegate.endElement("", "h3", "h3"); + this.delegate.characters(NEWLINE, 0, NEWLINE.length); + this.delegate.characters(TABSPACE, 0, TABSPACE.length); + this.delegate.startElement("", "table", "table", EMPTY_ATTRIBUTES); + } + this.treeStack.push(localName); + } + + @Override + public void endElement(String 
uri, String localName, String qName) + throws SAXException { + if (localName.equals("Spatial_Coverage")) { + this.delegate.characters(NEWLINE, 0, NEWLINE.length); + this.delegate.characters(TABSPACE, 0, TABSPACE.length); + this.delegate.endElement("", "table", "table"); + } + if (this.isLeaf) { + Stack<String> tempStack = (Stack<String>) this.treeStack.clone(); + String key = ""; + while (!tempStack.isEmpty()) { + if (key.length() == 0) { + key = tempStack.pop(); + } else { + key = tempStack.pop() + "-" + key; + } + } + String value = this.dataStack.peek(); + this.metadata.add(key, value); + this.isLeaf = false; + } + this.treeStack.pop(); + this.dataStack.pop(); + } + + @Override + public void startDocument() throws SAXException { + delegate.startDocument(); + } + + @Override + public void endDocument() throws SAXException { + delegate.endDocument(); + } + + @Override + public String toString() { + return delegate.toString(); + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/dif/DIFParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/dif/DIFParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/dif/DIFParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/dif/DIFParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.dif; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.EmbeddedContentHandler; +import org.apache.tika.sax.OfflineContentHandler; +import org.apache.tika.sax.TaggedContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +public class DIFParser extends AbstractParser { + + /** + * + */ + private static final long serialVersionUID = 971505521275777826L; + private static final Set<MediaType> SUPPORTED_TYPES = Collections + .unmodifiableSet(new HashSet<MediaType>(Arrays.asList(MediaType.application("dif+xml")))); + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + // TODO Auto-generated method stub + return SUPPORTED_TYPES; + } + + @Override + public void parse(InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) throws IOException, + SAXException, TikaException { + // TODO Auto-generated 
method stub + final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, + metadata); + xhtml.startDocument(); + xhtml.startElement("p"); + TaggedContentHandler tagged = new TaggedContentHandler(handler); + try { + context.getSAXParser().parse( + new CloseShieldInputStream(stream), + new OfflineContentHandler(new EmbeddedContentHandler( + getContentHandler(tagged, metadata, context)))); + } catch (SAXException e) { + tagged.throwIfCauseOf(e); + throw new TikaException("XML parse error", e); + } finally { + xhtml.endElement("p"); + xhtml.endDocument(); + } + + } + + protected ContentHandler getContentHandler(ContentHandler handler, + Metadata metadata, ParseContext context) { + + return new DIFContentHandler(handler, metadata); + + } + +} \ No newline at end of file Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/envi/EnviHeaderParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/envi/EnviHeaderParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/envi/EnviHeaderParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/envi/EnviHeaderParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.apache.tika.parser.envi; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.Set; +import java.nio.charset.Charset; + +import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.tika.detect.AutoDetectReader; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.sax.XHTMLContentHandler; + +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +public class EnviHeaderParser extends AbstractParser { + + private static final long serialVersionUID = -1479368523072408091L; + + public static final String ENVI_MIME_TYPE = "application/envi.hdr"; + + private static final Set<MediaType> SUPPORTED_TYPES = Collections + .singleton(MediaType.application("envi.hdr")); + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + public void parse(InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) throws IOException, + SAXException, TikaException { + + // Only outputting the MIME type as metadata + metadata.set(Metadata.CONTENT_TYPE, ENVI_MIME_TYPE); + + // The following code was taken from the TXTParser + // Automatically detect the character encoding + + try (AutoDetectReader reader = new AutoDetectReader( + new CloseShieldInputStream(stream), metadata)) { + Charset charset = 
reader.getCharset(); + MediaType type = new MediaType(MediaType.TEXT_PLAIN, charset); + // deprecated, see TIKA-431 + metadata.set(Metadata.CONTENT_ENCODING, charset.name()); + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, + metadata); + + xhtml.startDocument(); + + // text contents of the xhtml + String line; + while ((line = reader.readLine()) != null) { + xhtml.startElement("p"); + xhtml.characters(line); + xhtml.endElement("p"); + } + + xhtml.endDocument(); + } + } +} Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,415 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.parser.gdal; + +//JDK imports +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Scanner; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.external.ExternalParser; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.apache.tika.parser.external.ExternalParser.INPUT_FILE_TOKEN; + +//Tika imports +//SAX imports + +/** + * Wraps execution of the <a href="http//gdal.org/">Geospatial Data Abstraction + * Library (GDAL)</a> <code>gdalinfo</code> tool used to extract geospatial + * information out of hundreds of geo file formats. + * <p/> + * The parser requires the installation of GDAL and for <code>gdalinfo</code> to + * be located on the path. + * <p/> + * Basic information (Size, Coordinate System, Bounding Box, Driver, and + * resource info) are extracted as metadata, and the remaining metadata patterns + * are extracted and added. + * <p/> + * The output of the command is available from the provided + * {@link ContentHandler} in the + * {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)} method. 
+ */ +public class GDALParser extends AbstractParser { + + private static final long serialVersionUID = -3869130527323941401L; + + private String command; + + public GDALParser() { + setCommand("gdalinfo ${INPUT}"); + } + + public void setCommand(String command) { + this.command = command; + } + + public String getCommand() { + return this.command; + } + + public String processCommand(InputStream stream) { + TikaInputStream tis = (TikaInputStream) stream; + String pCommand = this.command; + try { + if (this.command.contains(INPUT_FILE_TOKEN)) { + pCommand = this.command.replace(INPUT_FILE_TOKEN, tis.getFile() + .getPath()); + } + } catch (Exception e) { + e.printStackTrace(); + } + + return pCommand; + } + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + Set<MediaType> types = new HashSet<MediaType>(); + types.add(MediaType.application("x-netcdf")); + types.add(MediaType.application("vrt")); + types.add(MediaType.image("geotiff")); + types.add(MediaType.image("nitf")); + types.add(MediaType.application("x-rpf-toc")); + types.add(MediaType.application("x-ecrg-toc")); + types.add(MediaType.image("hfa")); + types.add(MediaType.image("sar-ceos")); + types.add(MediaType.image("ceos")); + types.add(MediaType.application("jaxa-pal-sar")); + types.add(MediaType.application("gff")); + types.add(MediaType.application("elas")); + types.add(MediaType.application("aig")); + types.add(MediaType.application("aaigrid")); + types.add(MediaType.application("grass-ascii-grid")); + types.add(MediaType.application("sdts-raster")); + types.add(MediaType.application("dted")); + types.add(MediaType.image("png")); + types.add(MediaType.image("jpeg")); + types.add(MediaType.image("raster")); + types.add(MediaType.application("jdem")); + types.add(MediaType.image("gif")); + types.add(MediaType.image("big-gif")); + types.add(MediaType.image("envisat")); + types.add(MediaType.image("fits")); + types.add(MediaType.application("fits")); + 
types.add(MediaType.image("bsb")); + types.add(MediaType.application("xpm")); + types.add(MediaType.image("bmp")); + types.add(MediaType.image("x-dimap")); + types.add(MediaType.image("x-airsar")); + types.add(MediaType.application("x-rs2")); + types.add(MediaType.application("x-pcidsk")); + types.add(MediaType.application("pcisdk")); + types.add(MediaType.image("x-pcraster")); + types.add(MediaType.image("ilwis")); + types.add(MediaType.image("sgi")); + types.add(MediaType.application("x-srtmhgt")); + types.add(MediaType.application("leveller")); + types.add(MediaType.application("terragen")); + types.add(MediaType.application("x-gmt")); + types.add(MediaType.application("x-isis3")); + types.add(MediaType.application("x-isis2")); + types.add(MediaType.application("x-pds")); + types.add(MediaType.application("x-til")); + types.add(MediaType.application("x-ers")); + types.add(MediaType.application("x-l1b")); + types.add(MediaType.image("fit")); + types.add(MediaType.application("x-grib")); + types.add(MediaType.image("jp2")); + types.add(MediaType.application("x-rmf")); + types.add(MediaType.application("x-wcs")); + types.add(MediaType.application("x-wms")); + types.add(MediaType.application("x-msgn")); + types.add(MediaType.application("x-wms")); + types.add(MediaType.application("x-wms")); + types.add(MediaType.application("x-rst")); + types.add(MediaType.application("x-ingr")); + types.add(MediaType.application("x-gsag")); + types.add(MediaType.application("x-gsbg")); + types.add(MediaType.application("x-gs7bg")); + types.add(MediaType.application("x-cosar")); + types.add(MediaType.application("x-tsx")); + types.add(MediaType.application("x-coasp")); + types.add(MediaType.application("x-r")); + types.add(MediaType.application("x-map")); + types.add(MediaType.application("x-pnm")); + types.add(MediaType.application("x-doq1")); + types.add(MediaType.application("x-doq2")); + types.add(MediaType.application("x-envi")); + 
types.add(MediaType.application("x-envi-hdr")); + types.add(MediaType.application("x-generic-bin")); + types.add(MediaType.application("x-p-aux")); + types.add(MediaType.image("x-mff")); + types.add(MediaType.image("x-mff2")); + types.add(MediaType.image("x-fujibas")); + types.add(MediaType.application("x-gsc")); + types.add(MediaType.application("x-fast")); + types.add(MediaType.application("x-bt")); + types.add(MediaType.application("x-lan")); + types.add(MediaType.application("x-cpg")); + types.add(MediaType.image("ida")); + types.add(MediaType.application("x-ndf")); + types.add(MediaType.image("eir")); + types.add(MediaType.application("x-dipex")); + types.add(MediaType.application("x-lcp")); + types.add(MediaType.application("x-gtx")); + types.add(MediaType.application("x-los-las")); + types.add(MediaType.application("x-ntv2")); + types.add(MediaType.application("x-ctable2")); + types.add(MediaType.application("x-ace2")); + types.add(MediaType.application("x-snodas")); + types.add(MediaType.application("x-kro")); + types.add(MediaType.image("arg")); + types.add(MediaType.application("x-rik")); + types.add(MediaType.application("x-usgs-dem")); + types.add(MediaType.application("x-gxf")); + types.add(MediaType.application("x-dods")); + types.add(MediaType.application("x-http")); + types.add(MediaType.application("x-bag")); + types.add(MediaType.application("x-hdf")); + types.add(MediaType.image("x-hdf5-image")); + types.add(MediaType.application("x-nwt-grd")); + types.add(MediaType.application("x-nwt-grc")); + types.add(MediaType.image("adrg")); + types.add(MediaType.image("x-srp")); + types.add(MediaType.application("x-blx")); + types.add(MediaType.application("x-rasterlite")); + types.add(MediaType.application("x-epsilon")); + types.add(MediaType.application("x-sdat")); + types.add(MediaType.application("x-kml")); + types.add(MediaType.application("x-xyz")); + types.add(MediaType.application("x-geo-pdf")); + types.add(MediaType.image("x-ozi")); + 
types.add(MediaType.application("x-ctg")); + types.add(MediaType.application("x-e00-grid")); + types.add(MediaType.application("x-zmap")); + types.add(MediaType.application("x-webp")); + types.add(MediaType.application("x-ngs-geoid")); + types.add(MediaType.application("x-mbtiles")); + types.add(MediaType.application("x-ppi")); + types.add(MediaType.application("x-cappi")); + return types; + } + + @Override + public void parse(InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) throws IOException, + SAXException, TikaException { + + if (!ExternalParser.check("gdalinfo")) { + return; + } + + // first set up and run GDAL + // process the command + TemporaryResources tmp = new TemporaryResources(); + TikaInputStream tis = TikaInputStream.get(stream, tmp); + + String runCommand = processCommand(tis); + String output = execCommand(new String[]{runCommand}); + + // now extract the actual metadata params + // from the GDAL output in the content stream + // to do this, we need to literally process the output + // from the invoked command b/c we can't read metadata and + // output text from the handler in ExternalParser + // at the same time, so for now, we can't use the + // ExternalParser to do this and I've had to bring some of + // that functionality directly into this class + // TODO: investigate a way to do both using ExternalParser + + extractMetFromOutput(output, metadata); + applyPatternsToOutput(output, metadata, getPatterns()); + + // make the content handler and provide output there + // now that we have metadata + processOutput(handler, metadata, output); + } + + private Map<Pattern, String> getPatterns() { + Map<Pattern, String> patterns = new HashMap<Pattern, String>(); + this.addPatternWithColon("Driver", patterns); + this.addPatternWithColon("Files", patterns); + this.addPatternWithIs("Size", patterns); + this.addPatternWithIs("Coordinate System", patterns); + this.addBoundingBoxPattern("Upper Left", patterns); + 
this.addBoundingBoxPattern("Lower Left", patterns); + this.addBoundingBoxPattern("Upper Right", patterns); + this.addBoundingBoxPattern("Lower Right", patterns); + return patterns; + } + + private void addPatternWithColon(String name, Map<Pattern, String> patterns) { + patterns.put( + Pattern.compile(name + "\\:\\s*([A-Za-z0-9/ _\\-\\.]+)\\s*"), + name); + } + + private void addPatternWithIs(String name, Map<Pattern, String> patterns) { + patterns.put(Pattern.compile(name + " is ([A-Za-z0-9\\.,\\s`']+)"), + name); + } + + private void addBoundingBoxPattern(String name, + Map<Pattern, String> patterns) { + patterns.put( + Pattern.compile(name + + "\\s*\\(\\s*([0-9]+\\.[0-9]+\\s*,\\s*[0-9]+\\.[0-9]+\\s*)\\)\\s*"), + name); + } + + private void extractMetFromOutput(String output, Metadata met) { + Scanner scanner = new Scanner(output); + String currentKey = null; + String[] headings = {"Subdatasets", "Corner Coordinates"}; + StringBuilder metVal = new StringBuilder(); + while (scanner.hasNextLine()) { + String line = scanner.nextLine(); + if (line.contains("=") || hasHeadings(line, headings)) { + if (currentKey != null) { + // time to flush this key and met val + met.add(currentKey, metVal.toString()); + } + metVal.setLength(0); + + String[] lineToks = line.split("="); + currentKey = lineToks[0].trim(); + if (lineToks.length == 2) { + metVal.append(lineToks[1]); + } else { + metVal.append(""); + } + } else { + metVal.append(line); + } + + } + } + + private boolean hasHeadings(String line, String[] headings) { + if (headings != null && headings.length > 0) { + for (String heading : headings) { + if (line.contains(heading)) { + return true; + } + } + return false; + } else return false; + } + + private void applyPatternsToOutput(String output, Metadata metadata, + Map<Pattern, String> metadataPatterns) { + Scanner scanner = new Scanner(output); + while (scanner.hasNextLine()) { + String line = scanner.nextLine(); + for (Pattern p : metadataPatterns.keySet()) { + Matcher 
m = p.matcher(line); + if (m.find()) { + if (metadataPatterns.get(p) != null + && !metadataPatterns.get(p).equals("")) { + metadata.add(metadataPatterns.get(p), m.group(1)); + } else { + metadata.add(m.group(1), m.group(2)); + } + } + } + } + + } + + private String execCommand(String[] cmd) throws IOException { + // Execute + Process process; + String output = null; + if (cmd.length == 1) { + process = Runtime.getRuntime().exec(cmd[0]); + } else { + process = Runtime.getRuntime().exec(cmd); + } + + try { + InputStream out = process.getInputStream(); + + try { + output = extractOutput(out); + } catch (Exception e) { + e.printStackTrace(); + output = ""; + } + + } finally { + try { + process.waitFor(); + } catch (InterruptedException ignore) { + } + } + return output; + + } + + private String extractOutput(InputStream stream) throws SAXException, + IOException { + StringBuilder sb = new StringBuilder(); + try (Reader reader = new InputStreamReader(stream, UTF_8)) { + char[] buffer = new char[1024]; + for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) { + sb.append(buffer, 0, n); + } + } + return sb.toString(); + } + + private void processOutput(ContentHandler handler, Metadata metadata, + String output) throws SAXException, IOException { + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + InputStream stream = new ByteArrayInputStream(output.getBytes(UTF_8)); + try (Reader reader = new InputStreamReader(stream, UTF_8)) { + xhtml.startDocument(); + xhtml.startElement("p"); + char[] buffer = new char[1024]; + for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) { + xhtml.characters(buffer, 0, n); + } + xhtml.endElement("p"); + + } finally { + xhtml.endDocument(); + } + + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.parser.geo.topic; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Set; +import java.util.logging.Logger; + +import org.apache.commons.exec.CommandLine; +import org.apache.commons.exec.DefaultExecutor; +import org.apache.commons.exec.ExecuteException; +import org.apache.commons.exec.ExecuteWatchdog; +import org.apache.commons.exec.PumpStreamHandler; +import org.apache.commons.exec.environment.EnvironmentUtils; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.external.ExternalParser; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.JSONValue; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +public class GeoParser extends AbstractParser { + private static final long serialVersionUID = -2241391757440215491L; + private static final Logger LOG = Logger.getLogger(GeoParser.class.getName()); + private static final MediaType MEDIA_TYPE = + MediaType.application("geotopic"); + private static final Set<MediaType> SUPPORTED_TYPES = + Collections.singleton(MEDIA_TYPE); + + private GeoParserConfig config = new GeoParserConfig(); + + private boolean initialized; + private URL modelUrl; + private NameEntityExtractor extractor; + private boolean available; + + @Override + public Set<MediaType> getSupportedTypes(ParseContext parseContext) { + return SUPPORTED_TYPES; + } + + /** + * Initializes this parser + * @param modelUrl the URL to NER model + */ + public void initialize(URL modelUrl) { + if (this.modelUrl != null && this.modelUrl.equals(modelUrl)) { + // Previously initialized for the same URL, no 
initialization needed + return; + } + + this.modelUrl = modelUrl; + + // Check if the NER model is available, and if the + // lucene-geo-gazetteer is available + this.available = modelUrl != null && ExternalParser.check( + new String[] { "lucene-geo-gazetteer", "--help" }, -1); + if (this.available) { + try { + this.extractor = new NameEntityExtractor(modelUrl); + } catch (Exception e) { + LOG.warning("Named Entity Extractor setup failed: " + e); + this.available = false; + } + } + initialized = true; + } + + @Override + public void parse(InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) throws IOException, + SAXException, TikaException { + + /*----------------configure this parser by ParseContext Object---------------------*/ + + this.config = context.get(GeoParserConfig.class, config); + initialize(this.config.getNerModelUrl()); + if (!isAvailable()) { + return; + } + + /*----------------get locationNameEntities and best nameEntity for the input stream---------------------*/ + extractor.getAllNameEntitiesfromInput(stream); + extractor.getBestNameEntity(); + ArrayList<String> locationNameEntities = extractor.locationNameEntities; + String bestner = extractor.bestNameEntity; + + /*------------------------resolve geonames for each ner, store results in a hashmap---------------------*/ + HashMap<String, ArrayList<String>> resolvedGeonames = searchGeoNames(locationNameEntities); + + /*----------------store locationNameEntities and their geonames in a geotag, each input has one geotag---------------------*/ + GeoTag geotag = new GeoTag(); + geotag.toGeoTag(resolvedGeonames, bestner); + + /* add resolved entities in metadata */ + + metadata.add("Geographic_NAME", geotag.Geographic_NAME); + metadata.add("Geographic_LONGITUDE", geotag.Geographic_LONGTITUDE); + metadata.add("Geographic_LATITUDE", geotag.Geographic_LATITUDE); + for (int i = 0; i < geotag.alternatives.size(); ++i) { + GeoTag alter = (GeoTag) geotag.alternatives.get(i); + 
metadata.add("Optional_NAME" + (i + 1), alter.Geographic_NAME); + metadata.add("Optional_LONGITUDE" + (i + 1), + alter.Geographic_LONGTITUDE); + metadata.add("Optional_LATITUDE" + (i + 1), + alter.Geographic_LATITUDE); + } + } + + public HashMap<String, ArrayList<String>> searchGeoNames( + ArrayList<String> locationNameEntities) throws ExecuteException, + IOException { + CommandLine cmdLine = new CommandLine("lucene-geo-gazetteer"); + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + cmdLine.addArgument("-s"); + for (String name : locationNameEntities) { + cmdLine.addArgument(name); + } + + LOG.fine("Executing: " + cmdLine); + DefaultExecutor exec = new DefaultExecutor(); + exec.setExitValue(0); + ExecuteWatchdog watchdog = new ExecuteWatchdog(60000); + exec.setWatchdog(watchdog); + PumpStreamHandler streamHandler = new PumpStreamHandler(outputStream); + exec.setStreamHandler(streamHandler); + int exitValue = exec.execute(cmdLine, EnvironmentUtils.getProcEnvironment()); + String outputJson = outputStream.toString("UTF-8"); + JSONArray json = (JSONArray) JSONValue.parse(outputJson); + + HashMap<String, ArrayList<String>> returnHash = new HashMap<String, ArrayList<String>>(); + for (int i = 0; i < json.size(); i++) { + JSONObject obj = (JSONObject) json.get(i); + for (Object key : obj.keySet()) { + String theKey = (String) key; + JSONArray vals = (JSONArray) obj.get(theKey); + ArrayList<String> stringVals = new ArrayList<String>( + vals.size()); + for (int j = 0; j < vals.size(); j++) { + String val = (String) vals.get(j); + stringVals.add(val); + } + + returnHash.put(theKey, stringVals); + } + } + + return returnHash; + } + + public boolean isAvailable() { + if (!initialized) { + initialize(config.getNerModelUrl()); + } + return this.available; + } +} Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.parser.geo.topic; + +import java.io.File; +import java.io.Serializable; +import java.net.MalformedURLException; +import java.net.URL; + +public class GeoParserConfig implements Serializable { + private static final long serialVersionUID = -3167692634278575818L; + private URL nerModelUrl = null; + + public GeoParserConfig() { + this.nerModelUrl = GeoParserConfig.class.getResource("en-ner-location.bin"); + } + + public void setNERModelPath(String path) { + if (path == null) + return; + File file = new File(path); + if (file.isDirectory() || !file.exists()) { + return; + } + try { + this.nerModelUrl = file.toURI().toURL(); + } catch (MalformedURLException e) { + throw new RuntimeException(e); + } + } + + public void setNerModelUrl(URL url) { + this.nerModelUrl = url; + } + public URL getNerModelUrl() { + return nerModelUrl; + } +} Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.parser.geo.topic;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

/**
 * A resolved location (name, longitude, latitude) together with the other
 * resolved locations that were found for the same input.
 */
public class GeoTag {
    String Geographic_NAME;
    String Geographic_LONGTITUDE;
    String Geographic_LATITUDE;
    ArrayList<GeoTag> alternatives = new ArrayList<GeoTag>();

    public void setMain(String name, String longitude, String latitude) {
        Geographic_NAME = name;
        Geographic_LONGTITUDE = longitude;
        Geographic_LATITUDE = latitude;
    }

    public void addAlternative(GeoTag geotag) {
        alternatives.add(geotag);
    }

    /**
     * Stores resolved geoName entities in this GeoTag. The entry whose key
     * equals <code>bestNER</code> becomes the main location; every other
     * entry is added as an alternative. Each value list is read as
     * name, longitude, latitude; entries with fewer than three values are
     * skipped.
     *
     * @param resolvedGeonames resolved entities; null is treated as empty
     * @param bestNER best name entity among all the extracted entities for
     *                the input stream; may be null
     */
    public void toGeoTag(HashMap<String, ArrayList<String>> resolvedGeonames,
            String bestNER) {
        if (resolvedGeonames == null) {
            return;
        }

        for (Map.Entry<String, ArrayList<String>> entry : resolvedGeonames.entrySet()) {
            ArrayList<String> cur = entry.getValue();
            if (cur == null || cur.size() < 3) {
                // Unresolved or incomplete entry: no coordinates to store.
                continue;
            }
            if (bestNER != null && bestNER.equals(entry.getKey())) {
                setMain(cur.get(0), cur.get(1), cur.get(2));
            } else {
                GeoTag alter = new GeoTag();
                alter.setMain(cur.get(0), cur.get(1), cur.get(2));
                this.addAlternative(alter);
            }
        }
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java URL:
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.parser.geo.topic; + + +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import opennlp.tools.namefind.NameFinderME; +import opennlp.tools.namefind.TokenNameFinderModel; +import opennlp.tools.util.Span; +import org.apache.commons.io.IOUtils; + +import static java.nio.charset.StandardCharsets.UTF_8; + +public class NameEntityExtractor { + ArrayList<String> locationNameEntities; + String bestNameEntity; + private HashMap<String, Integer> tf; + private final NameFinderME nameFinder; + + public NameEntityExtractor(URL modelUrl) throws IOException { + this.locationNameEntities = new ArrayList<String>(); + this.bestNameEntity = null; + TokenNameFinderModel model = new TokenNameFinderModel(modelUrl); + this.nameFinder = new NameFinderME(model); + this.tf = new HashMap<String, Integer>(); + } + + /* + * Use OpenNLP to extract location names that's appearing in the steam. + * OpenNLP's default Name Finder accuracy is not very good, please refer to + * its documentation. 
+ * + * @param stream stream that passed from this.parse() + */ + public void getAllNameEntitiesfromInput(InputStream stream) throws IOException { + String[] in = IOUtils.toString(stream, UTF_8).split(" "); + Span nameE[]; + + //name finder is not thread safe https://opennlp.apache.org/documentation/1.5.2-incubating/manual/opennlp.html#tools.namefind + synchronized (nameFinder) { + nameE = nameFinder.find(in); + //the same name finder is reused, so clear adaptive data + nameFinder.clearAdaptiveData(); + } + + String spanNames = Arrays.toString(Span.spansToStrings(nameE, in)); + spanNames = spanNames.substring(1, spanNames.length() - 1); + String[] tmp = spanNames.split(","); + + for (String name : tmp) { + name = name.trim(); + this.locationNameEntities.add(name); + } + + + } + + /* + * Get the best location entity extracted from the input stream. Simply + * return the most frequent entity, If there several highest frequent + * entity, pick one randomly. May not be the optimal solution, but works. 
+ * + * @param locationNameEntities OpenNLP name finder's results, stored in + * ArrayList + */ + public void getBestNameEntity() { + if (this.locationNameEntities.size() == 0) + return; + + for (int i = 0; i < this.locationNameEntities.size(); ++i) { + if (tf.containsKey(this.locationNameEntities.get(i))) + tf.put(this.locationNameEntities.get(i), + tf.get(this.locationNameEntities.get(i)) + 1); + else + tf.put(this.locationNameEntities.get(i), 1); + } + int max = 0; + List<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>( + tf.entrySet()); + Collections.shuffle(list); + Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() { + public int compare(Map.Entry<String, Integer> o1, + Map.Entry<String, Integer> o2) { + // Descending Order + return o2.getValue().compareTo(o1.getValue()); + } + }); + + this.locationNameEntities.clear();// update so that they are in + // descending order + for (Map.Entry<String, Integer> entry : list) { + this.locationNameEntities.add(entry.getKey()); + if (entry.getValue() > max) { + max = entry.getValue(); + this.bestNameEntity = entry.getKey(); + } + } + } +}
