Author: mattmann
Date: Sat Jun 6 23:29:52 2015
New Revision: 1683968
URL: http://svn.apache.org/r1683968
Log:
Fix for TIKA-1645 & TIKA-1642: Extraction of biomedical information using
CTAKESParser contributed by Selina Chu, Giuseppe Totaro and mattmann.
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-bundle/pom.xml
tika/trunk/tika-parsers/pom.xml
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
Modified: tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1683968&r1=1683967&r2=1683968&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sat Jun 6 23:29:52 2015
@@ -1,5 +1,9 @@
Release 1.9 - 6/6/2015
+ * The ability to use the cTAKES clinical text
+ knowledge extraction system for biomedical data is
+ now included as a Tika parser (TIKA-1645, TIKA-1642).
+
* Tika-server allows a user to specify the Tika config
from the command line (TIKA-1652, TIKA-1426).
Modified: tika/trunk/tika-bundle/pom.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-bundle/pom.xml?rev=1683968&r1=1683967&r2=1683968&view=diff
==============================================================================
--- tika/trunk/tika-bundle/pom.xml (original)
+++ tika/trunk/tika-bundle/pom.xml Sat Jun 6 23:29:52 2015
@@ -151,7 +151,9 @@
org.apache.tika.parser.*,
</Export-Package>
<Import-Package>
- !org.junit,
+ !org.junit,
+ !org.apache.ctakes.*,
+ !org.apache.uima.*,
*,
org.apache.tika.fork,
android.util;resolution:=optional,
Modified: tika/trunk/tika-parsers/pom.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1683968&r1=1683967&r2=1683968&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Sat Jun 6 23:29:52 2015
@@ -326,6 +326,13 @@
<artifactId>geoapi</artifactId>
<version>3.0.0</version>
</dependency>
+ <!-- Apache cTAKES -->
+ <dependency>
+ <groupId>org.apache.ctakes</groupId>
+ <artifactId>ctakes-core</artifactId>
+ <version>3.2.2</version>
+ <scope>provided</scope>
+ </dependency>
</dependencies>
<build>
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java?rev=1683968&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java
(added)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java
Sat Jun 6 23:29:52 2015
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+
+/**
+ * Enumerates the properties that an {@link IdentifiedAnnotation} object can
+ * provide, together with the name used for each property.
+ */
+public enum CTAKESAnnotationProperty {
+    BEGIN("start"),
+    END("end"),
+    CONDITIONAL("conditional"),
+    CONFIDENCE("confidence"),
+    // The constant name is misspelled ("TECNIQUE"); it is kept unchanged so
+    // that existing references to the constant keep working.
+    DISCOVERY_TECNIQUE("discoveryTechnique"),
+    GENERIC("generic"),
+    HISTORY_OF("historyOf"),
+    ID("id"),
+    ONTOLOGY_CONCEPT_ARR("ontologyConceptArr"),
+    POLARITY("polarity");
+
+    // Property name associated with the constant; assigned once.
+    private final String name;
+
+    CTAKESAnnotationProperty(String name) {
+        this.name = name;
+    }
+
+    /**
+     * Returns the property name associated with this constant.
+     * @return the property name, e.g. {@code "start"} for {@link #BEGIN}.
+     */
+    public String getName() {
+        return name;
+    }
+}
\ No newline at end of file
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java?rev=1683968&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java
(added)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java
Sat Jun 6 23:29:52 2015
@@ -0,0 +1,337 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.Serializable;
+import java.util.Properties;
+
+import org.apache.tika.io.NullOutputStream;
+
+/**
+ * Configuration for {@link CTAKESContentHandler}.
+ *
+ * <p>
+ * This class allows cTAKES to be enabled and its parameters to be set, either
+ * through the setters or by loading a properties stream. The properties read
+ * are {@code aeDescriptorPath}, {@code UMLSUser}, {@code UMLSPass},
+ * {@code text}, {@code metadata}, {@code annotationProps} and
+ * {@code separatorChar}; a property that is absent leaves the corresponding
+ * default untouched.
+ * </p>
+ */
+public class CTAKESConfig implements Serializable {
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -1599741171775528923L;
+
+    // Path to XML descriptor for AnalysisEngine
+    private String aeDescriptorPath = "/ctakes-core/desc/analysis_engine/SentencesAndTokensAggregate.xml";
+
+    // UMLS username
+    private String UMLSUser = "";
+
+    // UMLS password
+    private String UMLSPass = "";
+
+    // Enables formatted output
+    private boolean prettyPrint = true;
+
+    // Type of cTAKES (UIMA) serializer
+    private CTAKESSerializer serializerType = CTAKESSerializer.XMI;
+
+    // OutputStream object used for CAS serialization
+    private OutputStream stream = NullOutputStream.NULL_OUTPUT_STREAM;
+
+    // Enables CAS serialization
+    private boolean serialize = false;
+
+    // Enables text analysis using cTAKES
+    private boolean text = true;
+
+    // List of metadata to analyze using cTAKES; kept non-null so that
+    // callers can iterate over it without a null check
+    private String[] metadata = new String[0];
+
+    // List of annotation properties to add to metadata in addition to text
+    // covered by an annotation
+    private CTAKESAnnotationProperty[] annotationProps = null;
+
+    // Character used to separate the annotation properties into metadata
+    private char separatorChar = ':';
+
+    /**
+     * Default constructor. Loads the {@code CTAKESConfig.properties} resource
+     * if it can be found; otherwise the built-in defaults are used.
+     */
+    public CTAKESConfig() {
+        init(this.getClass().getResourceAsStream("CTAKESConfig.properties"));
+    }
+
+    /**
+     * Loads properties from InputStream and then tries to close InputStream.
+     * @param stream {@link InputStream} object used to read properties.
+     */
+    public CTAKESConfig(InputStream stream) {
+        init(stream);
+    }
+
+    /**
+     * Reads the configuration from the given stream and closes it. A
+     * {@code null} stream leaves every default in place.
+     */
+    private void init(InputStream stream) {
+        if (stream == null) {
+            return;
+        }
+        Properties props = new Properties();
+
+        try {
+            props.load(stream);
+        } catch (IOException e) {
+            // Best effort: whatever could not be read falls back to defaults.
+            // TODO warning
+        } finally {
+            try {
+                stream.close();
+            } catch (IOException ioe) {
+                // Nothing useful can be done if closing fails.
+                // TODO warning
+            }
+        }
+
+        setAeDescriptorPath(props.getProperty("aeDescriptorPath", getAeDescriptorPath()));
+        setUMLSUser(props.getProperty("UMLSUser", getUMLSUser()));
+        setUMLSPass(props.getProperty("UMLSPass", getUMLSPass()));
+        setText(Boolean.valueOf(props.getProperty("text", Boolean.toString(isText()))));
+
+        // The list-valued properties are only applied when present. Falling
+        // back to getAnnotationPropsAsString() would feed "coveredText" to
+        // CTAKESAnnotationProperty.valueOf(), which is not an enum constant.
+        String metadataList = props.getProperty("metadata");
+        if (metadataList != null) {
+            setMetadata(splitList(metadataList));
+        }
+        String annotationPropList = props.getProperty("annotationProps");
+        if (annotationPropList != null) {
+            setAnnotationProps(splitList(annotationPropList));
+        }
+
+        // An empty value would make charAt(0) fail, so it is ignored.
+        String separator = props.getProperty("separatorChar");
+        if (separator != null && !separator.isEmpty()) {
+            setSeparatorChar(separator.charAt(0));
+        }
+    }
+
+    /**
+     * Splits a comma-separated list, trimming each entry and dropping the
+     * empty ones.
+     */
+    private static String[] splitList(String list) {
+        String[] tokens = list.split(",");
+        String[] kept = new String[tokens.length];
+        int count = 0;
+        for (String token : tokens) {
+            String trimmed = token.trim();
+            if (!trimmed.isEmpty()) {
+                kept[count++] = trimmed;
+            }
+        }
+        String[] result = new String[count];
+        System.arraycopy(kept, 0, result, 0, count);
+        return result;
+    }
+
+    /**
+     * Returns the path to XML descriptor for AnalysisEngine.
+     * @return the path to XML descriptor for AnalysisEngine.
+     */
+    public String getAeDescriptorPath() {
+        return aeDescriptorPath;
+    }
+
+    /**
+     * Returns the UMLS username.
+     * @return the UMLS username.
+     */
+    public String getUMLSUser() {
+        return UMLSUser;
+    }
+
+    /**
+     * Returns the UMLS password.
+     * @return the UMLS password.
+     */
+    public String getUMLSPass() {
+        return UMLSPass;
+    }
+
+    /**
+     * Returns {@code true} if formatted output is enabled, {@code false} otherwise.
+     * @return {@code true} if formatted output is enabled, {@code false} otherwise.
+     */
+    public boolean isPrettyPrint() {
+        return prettyPrint;
+    }
+
+    /**
+     * Returns the type of cTAKES (UIMA) serializer used to write the CAS.
+     * @return the type of cTAKES serializer.
+     */
+    public CTAKESSerializer getSerializerType() {
+        return serializerType;
+    }
+
+    /**
+     * Returns the {@link OutputStream} object used to write the CAS.
+     * @return the {@link OutputStream} object used to write the CAS.
+     */
+    public OutputStream getOutputStream() {
+        return stream;
+    }
+
+    /**
+     * Returns {@code true} if CAS serialization is enabled, {@code false} otherwise.
+     * @return {@code true} if CAS serialization is enabled, {@code false} otherwise.
+     */
+    public boolean isSerialize() {
+        return serialize;
+    }
+
+    /**
+     * Returns {@code true} if content text analysis is enabled, {@code false} otherwise.
+     * @return {@code true} if content text analysis is enabled, {@code false} otherwise.
+     */
+    public boolean isText() {
+        return text;
+    }
+
+    /**
+     * Returns an array of metadata whose values will be analyzed using cTAKES.
+     * @return an array of metadata names; empty when none are configured.
+     */
+    public String[] getMetadata() {
+        return metadata;
+    }
+
+    /**
+     * Returns a string containing a comma-separated list of metadata whose
+     * values will be analyzed using cTAKES.
+     * @return the comma-separated list, or an empty string when no metadata
+     *         are configured.
+     */
+    public String getMetadataAsString() {
+        if (metadata == null) {
+            return "";
+        }
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < metadata.length; i++) {
+            sb.append(metadata[i]);
+            if (i < metadata.length - 1) {
+                sb.append(",");
+            }
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Returns an array of {@link CTAKESAnnotationProperty}'s that will be
+     * included into cTAKES metadata.
+     * @return the configured annotation properties, or {@code null} when none
+     *         have been set.
+     */
+    public CTAKESAnnotationProperty[] getAnnotationProps() {
+        return annotationProps;
+    }
+
+    /**
+     * Returns a string describing the layout of a cTAKES metadata value:
+     * {@code coveredText} followed by the name of each configured
+     * {@link CTAKESAnnotationProperty}, joined with the separator character.
+     * @return the layout string; just {@code "coveredText"} when no annotation
+     *         properties are configured.
+     */
+    public String getAnnotationPropsAsString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append("coveredText");
+        if (annotationProps != null) {
+            for (CTAKESAnnotationProperty property : annotationProps) {
+                sb.append(separatorChar);
+                sb.append(property.getName());
+            }
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Returns the separator character used for annotation properties.
+     * @return the separator character used for annotation properties.
+     */
+    public char getSeparatorChar() {
+        return separatorChar;
+    }
+
+    /**
+     * Sets the path to XML descriptor for AnalysisEngine.
+     * @param aeDescriptorPath the path to XML descriptor for AnalysisEngine.
+     */
+    public void setAeDescriptorPath(String aeDescriptorPath) {
+        this.aeDescriptorPath = aeDescriptorPath;
+    }
+
+    /**
+     * Sets the UMLS username.
+     * @param uMLSUser the UMLS username.
+     */
+    public void setUMLSUser(String uMLSUser) {
+        this.UMLSUser = uMLSUser;
+    }
+
+    /**
+     * Sets the UMLS password.
+     * @param uMLSPass the UMLS password.
+     */
+    public void setUMLSPass(String uMLSPass) {
+        this.UMLSPass = uMLSPass;
+    }
+
+    /**
+     * Enables the formatted output for serializer.
+     * @param prettyPrint {@code true} to enable formatted output, {@code false} otherwise.
+     */
+    public void setPrettyPrint(boolean prettyPrint) {
+        this.prettyPrint = prettyPrint;
+    }
+
+    /**
+     * Sets the type of cTAKES (UIMA) serializer used to write CAS.
+     * @param serializerType the type of cTAKES serializer.
+     */
+    public void setSerializerType(CTAKESSerializer serializerType) {
+        this.serializerType = serializerType;
+    }
+
+    /**
+     * Sets the {@link OutputStream} object used to write the CAS.
+     * @param stream the {@link OutputStream} object used to write the CAS.
+     */
+    public void setOutputStream(OutputStream stream) {
+        this.stream = stream;
+    }
+
+    /**
+     * Enables CAS serialization.
+     * @param serialize {@code true} to enable CAS serialization, {@code false} otherwise.
+     */
+    public void setSerialize(boolean serialize) {
+        this.serialize = serialize;
+    }
+
+    /**
+     * Enables content text analysis using cTAKES.
+     * @param text {@code true} to enable content text analysis, {@code false} otherwise.
+     */
+    public void setText(boolean text) {
+        this.text = text;
+    }
+
+    /**
+     * Sets the metadata whose values will be analyzed using cTAKES.
+     * @param metadata the metadata whose values will be analyzed using cTAKES;
+     *        {@code null} is treated as "no metadata".
+     */
+    public void setMetadata(String[] metadata) {
+        this.metadata = (metadata == null) ? new String[0] : metadata;
+    }
+
+    /**
+     * Sets the {@link CTAKESAnnotationProperty}'s that will be included into
+     * cTAKES metadata.
+     * @param annotationProps the {@link CTAKESAnnotationProperty}'s that will
+     *        be included into cTAKES metadata.
+     */
+    public void setAnnotationProps(CTAKESAnnotationProperty[] annotationProps) {
+        this.annotationProps = annotationProps;
+    }
+
+    /**
+     * Sets the {@link CTAKESAnnotationProperty}'s that will be included into
+     * cTAKES metadata, given the names of the enum constants (for example
+     * {@code "BEGIN"}). Surrounding whitespace in each name is ignored.
+     * @param annotationProps the names of the {@link CTAKESAnnotationProperty}
+     *        constants that will be included into cTAKES metadata.
+     * @throws IllegalArgumentException if a name does not match any
+     *         {@link CTAKESAnnotationProperty} constant.
+     */
+    public void setAnnotationProps(String[] annotationProps) {
+        CTAKESAnnotationProperty[] properties = new CTAKESAnnotationProperty[annotationProps.length];
+        for (int i = 0; i < annotationProps.length; i++) {
+            properties[i] = CTAKESAnnotationProperty.valueOf(annotationProps[i].trim());
+        }
+        setAnnotationProps(properties);
+    }
+
+    /**
+     * Sets the separator character used for annotation properties.
+     * @param separatorChar the separator character used for annotation properties.
+     */
+    public void setSeparatorChar(char separatorChar) {
+        this.separatorChar = separatorChar;
+    }
+}
\ No newline at end of file
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java?rev=1683968&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java
(added)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java
Sat Jun 6 23:29:52 2015
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import java.util.Collection;
+import java.util.Iterator;
+
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Class used to extract biomedical information while parsing.
+ *
+ * <p>
+ * This class relies on <a href="http://ctakes.apache.org/">Apache cTAKES</a>
+ * that is a natural language processing system for extraction of information
+ * from electronic medical record clinical free-text.
+ * </p>
+ *
+ * <p>
+ * The handler buffers the character content it receives and, at the end of
+ * the document, runs cTAKES over the configured metadata values followed by
+ * the buffered text. Each annotation found is added to the metadata under a
+ * key starting with {@link #CTAKES_META_PREFIX}.
+ * </p>
+ */
+public class CTAKESContentHandler extends ContentHandlerDecorator {
+    // Prefix used for metadata including cTAKES annotations
+    public static final String CTAKES_META_PREFIX = "ctakes:";
+
+    // Configuration object for CTAKESContentHandler
+    private CTAKESConfig config = null;
+
+    // StringBuilder object used to build the clinical free-text for cTAKES
+    private StringBuilder sb = null;
+
+    // Metadata object used for cTAKES annotations
+    private Metadata metadata = null;
+
+    /**
+     * Creates a new {@link CTAKESContentHandler} for the given
+     * {@link ContentHandler} and Metadata objects.
+     * @param handler the {@link ContentHandler} object to be decorated.
+     * @param metadata the {@link Metadata} object that will be populated using
+     *        biomedical information extracted by cTAKES.
+     * @param config the {@link CTAKESConfig} object used to configure the handler.
+     */
+    public CTAKESContentHandler(ContentHandler handler, Metadata metadata, CTAKESConfig config) {
+        super(handler);
+        this.metadata = metadata;
+        this.config = config;
+        this.sb = new StringBuilder();
+    }
+
+    /**
+     * Creates a new {@link CTAKESContentHandler} for the given
+     * {@link ContentHandler} and Metadata objects, using a default
+     * {@link CTAKESConfig}.
+     * @param handler the {@link ContentHandler} object to be decorated.
+     * @param metadata the {@link Metadata} object that will be populated using
+     *        biomedical information extracted by cTAKES.
+     */
+    public CTAKESContentHandler(ContentHandler handler, Metadata metadata) {
+        this(handler, metadata, new CTAKESConfig());
+    }
+
+    /**
+     * Default constructor. Decorates a {@link DefaultHandler} and populates a
+     * fresh {@link Metadata} object.
+     */
+    public CTAKESContentHandler() {
+        this(new DefaultHandler(), new Metadata());
+    }
+
+    /**
+     * Buffers the characters for cTAKES when text analysis is enabled and then
+     * forwards them to the decorated handler.
+     */
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        if (config.isText()) {
+            sb.append(ch, start, length);
+        }
+        super.characters(ch, start, length);
+    }
+
+    /**
+     * Runs cTAKES over the configured metadata values and the buffered text,
+     * adds the resulting annotations to the metadata, optionally serializes
+     * the CAS, and finally forwards the event to the decorated handler.
+     * @throws SAXException if the analysis fails; the original exception is
+     *         attached as the cause.
+     */
+    @Override
+    public void endDocument() throws SAXException {
+        JCas jcas = null;
+        try {
+            // create an Analysis Engine
+            AnalysisEngine ae = CTAKESUtils.getAnalysisEngine(config.getAeDescriptorPath(),
+                    config.getUMLSUser(), config.getUMLSPass());
+
+            // create a JCas, given an AE
+            jcas = CTAKESUtils.getJCas(ae);
+
+            // collect the values of the configured metadata; the list may be
+            // null when no metadata were configured
+            StringBuilder metaText = new StringBuilder();
+            String[] names = config.getMetadata();
+            if (names != null) {
+                for (String name : names) {
+                    for (String value : metadata.getValues(name)) {
+                        metaText.append(value);
+                        metaText.append(System.lineSeparator());
+                    }
+                }
+            }
+
+            // analyze text
+            jcas.setDocumentText(metaText.toString() + sb.toString());
+            ae.process(jcas);
+
+            // add annotations to metadata
+            metadata.add(CTAKES_META_PREFIX + "schema", config.getAnnotationPropsAsString());
+            CTAKESAnnotationProperty[] annotationPros = config.getAnnotationProps();
+            Collection<IdentifiedAnnotation> collection = JCasUtil.select(jcas, IdentifiedAnnotation.class);
+            for (IdentifiedAnnotation annotation : collection) {
+                StringBuilder annotationBuilder = new StringBuilder();
+                annotationBuilder.append(annotation.getCoveredText());
+                if (annotationPros != null) {
+                    for (CTAKESAnnotationProperty property : annotationPros) {
+                        annotationBuilder.append(config.getSeparatorChar());
+                        annotationBuilder.append(CTAKESUtils.getAnnotationProperty(annotation, property));
+                    }
+                }
+                metadata.add(CTAKES_META_PREFIX + annotation.getType().getShortName(),
+                        annotationBuilder.toString());
+            }
+
+            if (config.isSerialize()) {
+                // serialize data
+                CTAKESUtils.serialize(config.getSerializerType(), config.isPrettyPrint(),
+                        config.getOutputStream());
+            }
+        } catch (Exception e) {
+            // keep the original exception as the cause instead of printing it
+            throw new SAXException(e.getMessage(), e);
+        } finally {
+            // CTAKESUtils hands out a shared JCas; it has to be reset before
+            // document text can be set on it again for the next document
+            if (jcas != null) {
+                jcas.reset();
+            }
+        }
+
+        // let the decorated handler see the end of the document as well
+        super.endDocument();
+    }
+
+    /**
+     * Returns metadata that includes cTAKES annotations.
+     * @return {@link Metadata} object that includes cTAKES annotations.
+     */
+    public Metadata getMetadata() {
+        return metadata;
+    }
+}
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java?rev=1683968&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
(added)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
Sat Jun 6 23:29:52 2015
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.ParserDecorator;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * CTAKESParser decorates {@link AutoDetectParser} and leverages
+ * {@link CTAKESContentHandler} to extract biomedical information from
+ * clinical text using Apache cTAKES.
+ *
+ */
+public class CTAKESParser extends ParserDecorator {
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -2313482748027097961L;
+
+    /**
+     * Default constructor.
+     */
+    public CTAKESParser() {
+        super(new AutoDetectParser());
+    }
+
+    /**
+     * Parses the stream with the decorated parser, wrapping the given handler
+     * in a {@link CTAKESContentHandler}. The {@link CTAKESConfig} is taken
+     * from the parse context when one is present; otherwise a default
+     * configuration is created.
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+        // Build the default configuration only when the context has none, so
+        // that it is not constructed needlessly on every parse.
+        CTAKESConfig config = context.get(CTAKESConfig.class);
+        if (config == null) {
+            config = new CTAKESConfig();
+        }
+        CTAKESContentHandler ctakesHandler = new CTAKESContentHandler(handler,
+                metadata, config);
+        super.parse(stream, ctakesHandler, metadata, context);
+    }
+}
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java?rev=1683968&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java
(added)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java
Sat Jun 6 23:29:52 2015
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import org.apache.uima.cas.impl.XCASSerializer;
+import org.apache.uima.cas.impl.XmiCasSerializer;
+import org.apache.uima.util.XmlCasSerializer;
+
+/**
+ * Enumeration of the cTAKES (UIMA) CAS serializer types supported by cTAKES.
+ *
+ * A CAS serializer writes a CAS in the given format. Each constant records
+ * the fully qualified name of the UIMA serializer class it stands for.
+ *
+ */
+public enum CTAKESSerializer {
+    XCAS(XCASSerializer.class),
+    XMI(XmiCasSerializer.class),
+    XML(XmlCasSerializer.class);
+
+    // Fully qualified name of the serializer class
+    private final String className;
+
+    CTAKESSerializer(Class<?> serializerClass) {
+        this.className = serializerClass.getName();
+    }
+
+    /**
+     * Returns the fully qualified name of the serializer class.
+     * @return the serializer class name.
+     */
+    public String getClassName() {
+        return className;
+    }
+}
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java?rev=1683968&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java
(added)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java
Sat Jun 6 23:29:52 2015
@@ -0,0 +1,242 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.net.URISyntaxException;
+
+import org.apache.ctakes.typesystem.type.refsem.UmlsConcept;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.UIMAFramework;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.cas.impl.XCASSerializer;
+import org.apache.uima.cas.impl.XmiCasSerializer;
+import org.apache.uima.cas.impl.XmiSerializationSharedData;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.resource.ResourceSpecifier;
+import org.apache.uima.util.InvalidXMLException;
+import org.apache.uima.util.XMLInputSource;
+import org.apache.uima.util.XmlCasSerializer;
+import org.xml.sax.SAXException;
+
+/**
+ * This class provides methods to extract biomedical information from plain
+ * text using {@link CTAKESContentHandler} that relies on Apache cTAKES.
+ *
+ * <p>
+ * Apache cTAKES is built on top of <a href="https://uima.apache.org/">Apache
+ * UIMA</a> framework and <a href="https://opennlp.apache.org/">OpenNLP</a>
+ * toolkit.
+ * </p>
+ *
+ */
+public class CTAKESUtils {
+ // UIMA Analysis Engine
+ private static AnalysisEngine ae = null;
+
+ // JCas object for working with the CAS (Common Analysis System)
+ private static JCas jcas = null;
+
+ // UMLS username property
+ private final static String CTAKES_UMLS_USER = "ctakes.umlsuser";
+
+ // UMLS password property
+ private final static String CTAKES_UMLS_PASS = "ctakes.umlspw";
+
+    /**
+     * Returns the UIMA Analysis Engine (AE), creating it on the first call.
+     * Later calls return the instance created first, whatever arguments they
+     * are given.
+     *
+     * <p>
+     * An Analysis Engine is a component responsible for analyzing unstructured
+     * information, discovering and representing semantic content. Unstructured
+     * information includes, but is not restricted to, text documents.
+     * </p>
+     *
+     * @param aeDescriptor
+     *            classpath resource name of the XML file including an
+     *            AnalysisEngineDescription that contains all of the
+     *            information needed to instantiate and use an AnalysisEngine.
+     * @param umlsUser
+     *            UMLS username for NLM database
+     * @param umlsPass
+     *            UMLS password for NLM database
+     * @return an Analysis Engine for analyzing unstructured information.
+     * @throws IOException
+     *             if the descriptor cannot be found or any I/O error occurs.
+     * @throws InvalidXMLException
+     *             if the input XML is not valid or does not specify a valid
+     *             ResourceSpecifier.
+     * @throws ResourceInitializationException
+     *             if a failure occurred during production of the resource.
+     * @throws URISyntaxException
+     *             retained for backward compatibility; the descriptor is now
+     *             read through its URL, so this is no longer thrown.
+     */
+    public static AnalysisEngine getAnalysisEngine(String aeDescriptor,
+            String umlsUser, String umlsPass) throws IOException,
+            InvalidXMLException, ResourceInitializationException,
+            URISyntaxException {
+        if (ae == null) {
+            // Locate the descriptor on the classpath. Reading it through its
+            // URL also works when the descriptor is packaged inside a jar,
+            // where no plain file path exists.
+            java.net.URL aeDescriptorUrl = CTAKESUtils.class.getResource(aeDescriptor);
+            if (aeDescriptorUrl == null) {
+                throw new IOException("Analysis engine descriptor not found on classpath: "
+                        + aeDescriptor);
+            }
+
+            // get Resource Specifier from XML
+            XMLInputSource aeInputSource = new XMLInputSource(aeDescriptorUrl);
+            ResourceSpecifier aeSpecifier;
+            try {
+                aeSpecifier = UIMAFramework.getXMLParser()
+                        .parseResourceSpecifier(aeInputSource);
+            } finally {
+                aeInputSource.close();
+            }
+
+            // UMLS user ID and password
+            if ((umlsUser != null) && (!umlsUser.isEmpty())
+                    && (umlsPass != null) && (!umlsPass.isEmpty())) {
+                /*
+                 * It is highly recommended that you change UMLS credentials in
+                 * the XML configuration file instead of giving user and
+                 * password using CTAKESConfig.
+                 */
+                System.setProperty(CTAKES_UMLS_USER, umlsUser);
+                System.setProperty(CTAKES_UMLS_PASS, umlsPass);
+            }
+
+            // create AE
+            ae = UIMAFramework.produceAnalysisEngine(aeSpecifier);
+        }
+        return ae;
+    }
+
+    /**
+     * Returns the JCas used for analysis, creating it from the given Analysis
+     * Engine on the first call and handing back that same instance on every
+     * later call. A JCas is a Java Cover Classes based object-oriented CAS
+     * (Common Analysis System) API.
+     *
+     * <p>
+     * Important: It is highly recommended that you reuse CAS objects rather
+     * than creating new CAS objects prior to each analysis. This is because
+     * CAS objects may be expensive to create and may consume a significant
+     * amount of memory.
+     * </p>
+     *
+     * @param ae
+     *            AnalysisEngine used to create an appropriate JCas object.
+     * @return a JCas object appropriate for the given AnalysisEngine.
+     * @throws ResourceInitializationException
+     *             if a CAS could not be created because this AnalysisEngine's
+     *             CAS metadata (type system, type priorities, or FS indexes)
+     *             are invalid.
+     */
+    public static JCas getJCas(AnalysisEngine ae)
+            throws ResourceInitializationException {
+        // hand back the existing instance when there is one
+        if (jcas != null) {
+            return jcas;
+        }
+        jcas = ae.newJCas();
+        return jcas;
+    }
+
+ /**
+  * Writes the CAS held by this class to the given stream using the
+  * requested serializer.
+  *
+  * <p>
+  * {@code XCAS} and {@code XMI} are handled explicitly; every other value
+  * of {@code type} is written with the generic XML CAS serializer, which
+  * is not passed the {@code prettyPrint} flag.
+  * </p>
+  *
+  * @param type
+  *            type of cTAKES (UIMA) serializer used to write CAS.
+  * @param prettyPrint
+  *            {@code true} to do pretty printing of output.
+  * @param stream
+  *            {@link OutputStream} object used to print out information
+  *            extracted by using cTAKES.
+  * @throws SAXException
+  *             if there was a SAX exception.
+  * @throws IOException
+  *             if any I/O error occurs.
+  */
+ public static void serialize(CTAKESSerializer type, boolean prettyPrint,
+         OutputStream stream) throws SAXException, IOException {
+     if (type == CTAKESSerializer.XCAS) {
+         XCASSerializer.serialize(jcas.getCas(), stream, prettyPrint);
+         return;
+     }
+
+     if (type == CTAKESSerializer.XMI) {
+         XmiCasSerializer.serialize(jcas.getCas(), jcas.getTypeSystem(),
+                 stream, prettyPrint, new XmiSerializationSharedData());
+         return;
+     }
+
+     // Fallback for all remaining serializer types.
+     XmlCasSerializer.serialize(jcas.getCas(), jcas.getTypeSystem(), stream);
+ }
+
+ /**
+ * Returns the annotation value based on the given annotation type.
+ * @param annotation {@see IdentifiedAnnotation} object.
+ * @param property {@see CTAKESAnnotationProperty} enum used to
identify the annotation type.
+ * @return the annotation value.
+ */
+ public static String getAnnotationProperty(IdentifiedAnnotation
annotation,
+ CTAKESAnnotationProperty property) {
+ String value = null;
+ if (property == CTAKESAnnotationProperty.BEGIN) {
+ value = Integer.toString(annotation.getBegin());
+ } else if (property == CTAKESAnnotationProperty.END) {
+ value = Integer.toString(annotation.getEnd());
+ } else if (property == CTAKESAnnotationProperty.CONDITIONAL) {
+ value = Boolean.toString(annotation.getConditional());
+ } else if (property == CTAKESAnnotationProperty.CONFIDENCE) {
+ value = Float.toString(annotation.getConfidence());
+ } else if (property ==
CTAKESAnnotationProperty.DISCOVERY_TECNIQUE) {
+ value =
Integer.toString(annotation.getDiscoveryTechnique());
+ } else if (property == CTAKESAnnotationProperty.GENERIC) {
+ value = Boolean.toString(annotation.getGeneric());
+ } else if (property == CTAKESAnnotationProperty.HISTORY_OF) {
+ value = Integer.toString(annotation.getHistoryOf());
+ } else if (property == CTAKESAnnotationProperty.ID) {
+ value = Integer.toString(annotation.getId());
+ } else if (property ==
CTAKESAnnotationProperty.ONTOLOGY_CONCEPT_ARR) {
+ FSArray mentions = annotation.getOntologyConceptArr();
+ StringBuilder sb = new StringBuilder();
+ if (mentions != null) {
+ for (int i = 0; i < mentions.size(); i++) {
+ if (mentions.get(i) instanceof
UmlsConcept) {
+ UmlsConcept concept =
(UmlsConcept) mentions.get(i);
+ sb.append(concept.getCui());
+ if (i < mentions.size()-1) {
+ sb.append(",");
+ }
+ }
+ }
+ }
+ value = sb.toString();
+ } else if (property == CTAKESAnnotationProperty.POLARITY) {
+ value = Integer.toString(annotation.getPolarity());
+ }
+ return value;
+ }
+
+ /**
+  * Resets cTAKES objects, if created. This method ensures that new cTAKES
+  * objects (a.k.a., Analysis Engine and JCas) will be created if getters of
+  * this class are called.
+  *
+  * <p>
+  * Objects that have not been created yet are skipped, so calling this
+  * method before any getter, or several times in a row, does nothing.
+  * </p>
+  */
+ public static void reset() {
+     // Analysis Engine
+     if (ae != null) {
+         ae.destroy();
+         ae = null;
+     }
+
+     // JCas
+     if (jcas != null) {
+         jcas.reset();
+         jcas = null;
+     }
+ }
+}
Modified:
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1683968&r1=1683967&r2=1683968&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
(original)
+++
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
Sat Jun 6 23:29:52 2015
@@ -65,3 +65,4 @@ org.apache.tika.parser.isatab.ISArchiveP
org.apache.tika.parser.geoinfo.GeographicInformationParser
org.apache.tika.parser.geo.topic.GeoParser
org.apache.tika.parser.external.CompositeExternalParser
+org.apache.tika.parser.ctakes.CTAKESParser
\ No newline at end of file