Author: mattmann
Date: Sat Jun  6 23:29:52 2015
New Revision: 1683968

URL: http://svn.apache.org/r1683968
Log:
Fix for TIKA-1645 & TIKA-1642: Extraction of biomedical information using 
CTAKESParser contributed by Selina Chu, Giuseppe Totaro and mattmann.

Added:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-bundle/pom.xml
    tika/trunk/tika-parsers/pom.xml
    
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser

Modified: tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1683968&r1=1683967&r2=1683968&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sat Jun  6 23:29:52 2015
@@ -1,5 +1,9 @@
 Release 1.9 - 6/6/2015
 
+  * The ability to use the cTAKES clinical text
+    knowledge extraction system for biomedical data is 
+    now included as a Tika parser (TIKA-1645, TIKA-1642).
+
   * Tika-server allows a user to specify the Tika config
     from the command line (TIKA-1652, TIKA-1426).
 

Modified: tika/trunk/tika-bundle/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-bundle/pom.xml?rev=1683968&r1=1683967&r2=1683968&view=diff
==============================================================================
--- tika/trunk/tika-bundle/pom.xml (original)
+++ tika/trunk/tika-bundle/pom.xml Sat Jun  6 23:29:52 2015
@@ -151,7 +151,9 @@
               org.apache.tika.parser.*,
             </Export-Package>
             <Import-Package>
-              !org.junit,
+              !org.junit, 
+              !org.apache.ctakes.*,
+              !org.apache.uima.*,
               *,
               org.apache.tika.fork,
               android.util;resolution:=optional,

Modified: tika/trunk/tika-parsers/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1683968&r1=1683967&r2=1683968&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Sat Jun  6 23:29:52 2015
@@ -326,6 +326,13 @@
                <artifactId>geoapi</artifactId>
                <version>3.0.0</version>
        </dependency>
+    <!-- Apache cTAKES -->
+    <dependency>
+      <groupId>org.apache.ctakes</groupId>
+      <artifactId>ctakes-core</artifactId>
+      <version>3.2.2</version>
+      <scope>provided</scope>
+    </dependency>
   </dependencies>
 
   <build>

Added: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java?rev=1683968&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java
 (added)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java
 Sat Jun  6 23:29:52 2015
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+
+/**
+ * This enumeration includes the properties that an {@link IdentifiedAnnotation}
+ * object can provide.
+ *
+ */
+public enum CTAKESAnnotationProperty {
+       BEGIN("start"),
+       END("end"),
+       CONDITIONAL("conditional"),
+       CONFIDENCE("confidence"),
+       // NOTE: the constant name is misspelled ("TECNIQUE"). It is kept unchanged
+       // because renaming it would break callers that refer to it by name.
+       DISCOVERY_TECNIQUE("discoveryTechnique"),
+       GENERIC("generic"),
+       HISTORY_OF("historyOf"),
+       ID("id"),
+       ONTOLOGY_CONCEPT_ARR("ontologyConceptArr"),
+       POLARITY("polarity");
+
+       // Name associated with this property; never changes after construction.
+       private final String name;
+
+       /**
+        * Creates a property with the given name.
+        * @param name the name returned by {@link #getName()}.
+        */
+       CTAKESAnnotationProperty(String name) {
+               this.name = name;
+       }
+
+       /**
+        * Returns the name associated with this property.
+        * @return the name given at construction, e.g. {@code "start"} for {@link #BEGIN}.
+        */
+       public String getName() {
+               return name;
+       }
+}
\ No newline at end of file

Added: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java?rev=1683968&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java
 (added)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java
 Sat Jun  6 23:29:52 2015
@@ -0,0 +1,337 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.Serializable;
+import java.util.Properties;
+
+import org.apache.tika.io.NullOutputStream;
+
+/**
+ * Configuration for {@link CTAKESContentHandler}.
+ *
+ * This class allows enabling cTAKES and setting its parameters.
+ * 
+ */
+public class CTAKESConfig implements Serializable {
+       /**
+        * Serial version UID
+        */
+       private static final long serialVersionUID = -1599741171775528923L;
+       
+       // Path to XML descriptor for AnalysisEngine
+       private String aeDescriptorPath = 
"/ctakes-core/desc/analysis_engine/SentencesAndTokensAggregate.xml";
+       
+       // UMLS username
+       private String UMLSUser = "";
+       
+       // UMLS password
+       private String UMLSPass = "";
+       
+       // Enables formatted output
+       private boolean prettyPrint = true; 
+       
+       // Type of cTAKES (UIMA) serializer
+       private CTAKESSerializer serializerType = CTAKESSerializer.XMI;
+       
+       // OutputStream object used for CAS serialization
+       private OutputStream stream = NullOutputStream.NULL_OUTPUT_STREAM;
+       
+       // Enables CAS serialization
+       private boolean serialize = false;
+       
+       // Enables text analysis using cTAKES
+       private boolean text = true;
+       
+       // List of metadata to analyze using cTAKES
+       private String[] metadata = null;
+       
+       // List of annotation properties to add to metadata in addition to text 
covered by an annotation
+       private CTAKESAnnotationProperty[] annotationProps = null;
+       
+       // Character used to separate the annotation properties into metadata
+       private char separatorChar = ':';
+
+       /**
+        * Default constructor.
+        */
+       public CTAKESConfig() {
+               
init(this.getClass().getResourceAsStream("CTAKESConfig.properties"));
+       }
+       
+       /**
+        * Loads properties from InputStream and then tries to close 
InputStream.
+        * @param stream {@see InputStream} object used to read properties.
+        */
+       public CTAKESConfig(InputStream stream) {
+               init(stream);
+       }
+       
+       private void init(InputStream stream) {
+               if (stream == null) {
+                       return;
+               }
+               Properties props = new Properties();
+               
+               try {
+                       props.load(stream);
+               } catch (IOException e) {
+                       // TODO warning
+               } finally {
+                       if (stream != null) {
+                               try {
+                                       stream.close();
+                               } catch (IOException ioe) {
+                                       // TODO warning
+                               }
+                       }
+               }
+               
+               setAeDescriptorPath(props.getProperty("aeDescriptorPath", 
getAeDescriptorPath()));
+               setUMLSUser(props.getProperty("UMLSUser", getUMLSUser()));
+               setUMLSPass(props.getProperty("UMLSPass", getUMLSPass()));
+               setText(Boolean.valueOf(props.getProperty("text", 
Boolean.toString(isText()))));
+               setMetadata(props.getProperty("metadata", 
getMetadataAsString()).split(","));
+               setAnnotationProps(props.getProperty("annotationProps", 
getAnnotationPropsAsString()).split(","));
+               setSeparatorChar(props.getProperty("separatorChar", 
Character.toString(getSeparatorChar())).charAt(0));
+       }
+       
+       /**
+        * Returns the path to XML descriptor for AnalysisEngine.
+        * @return the path to XML descriptor for AnalysisEngine.
+        */
+       public String getAeDescriptorPath() {
+               return aeDescriptorPath;
+       }
+       
+       /**
+        * Returns the UMLS username.
+        * @return the UMLS username.
+        */
+       public String getUMLSUser() {
+               return UMLSUser;
+       }
+       
+       /**
+        * Returns the UMLS password.
+        * @return the UMLS password.
+        */
+       public String getUMLSPass() {
+               return UMLSPass;
+       }
+       
+       /**
+        * Returns {@code true} if formatted output is enabled, {@code false} 
otherwise.
+        * @return {@code true} if formatted output is enabled, {@code false} 
otherwise.
+        */
+       public boolean isPrettyPrint() {
+               return prettyPrint;
+       }
+       
+       /**
+        * Returns the type of cTAKES (UIMA) serializer used to write the CAS.
+        * @return the type of cTAKES serializer.
+        */
+       public CTAKESSerializer getSerializerType() {
+               return serializerType;
+       }
+       
+       /**
+        * Returns an {@see OutputStream} object used write the CAS.
+        * @return {@see OutputStream} object used write the CAS.
+        */
+       public OutputStream getOutputStream() {
+               return stream;
+       }
+       
+       /**
+        * Returns {@code true} if CAS serialization is enabled, {@code false} 
otherwise.
+        * @return {@code true} if CAS serialization output is enabled, {@code 
false} otherwise.
+        */
+       public boolean isSerialize() {
+               return serialize;
+       }
+       
+       /**
+        * Returns {@code true} if content text analysis is enabled {@code 
false} otherwise.
+        * @return {@code true} if content text analysis is enabled {@code 
false} otherwise.
+        */
+       public boolean isText() {
+               return text;
+       }
+       
+       /**
+        * Returns an array of metadata whose values will be analyzed using 
cTAKES.
+        * @return an array of metadata whose values will be analyzed using 
cTAKES.
+        */
+       public String[] getMetadata() {
+               return metadata;
+       }
+       
+       /**
+        * Returns a string containing a comma-separated list of metadata whose 
values will be analyzed using cTAKES.
+        * @return a string containing a comma-separated list of metadata whose 
values will be analyzed using cTAKES.
+        */
+       public String getMetadataAsString() {
+               if (metadata == null) {
+                       return "";
+               }
+               StringBuilder sb = new StringBuilder();
+               for (int i = 0; i < metadata.length; i++) {
+                       sb.append(metadata[i]);
+                       if (i < metadata.length-1) {
+                               sb.append(",");
+                       }
+               }
+               return sb.toString();
+       }
+       
+       /**
+        * Returns an array of {@see CTAKESAnnotationProperty}'s that will be 
included into cTAKES metadata.
+        * @return an array of {@see CTAKESAnnotationProperty}'s that will be 
included into cTAKES metadata.
+        */
+       public CTAKESAnnotationProperty[] getAnnotationProps() {
+               return annotationProps;
+       }
+       
+       /**
+        * Returns a string containing a comma-separated list of {@see 
CTAKESAnnotationProperty} names that will be included into cTAKES metadata.
+        * @return
+        */
+       public String getAnnotationPropsAsString() {
+               StringBuilder sb = new StringBuilder();
+               sb.append("coveredText");
+               if (annotationProps != null) {
+                       for (CTAKESAnnotationProperty property : 
annotationProps) {
+                               sb.append(separatorChar);
+                               sb.append(property.getName());
+                       }
+               }
+               return sb.toString();
+       }
+       
+       /**
+        * Returns the separator character used for annotation properties.
+        * @return the separator character used for annotation properties.
+        */
+       public char getSeparatorChar() {
+               return separatorChar;
+       }
+
+       /**
+        * Sets the path to XML descriptor for AnalysisEngine.
+        * @param aeDescriptorPath the path to XML descriptor for 
AnalysisEngine.
+        */
+       public void setAeDescriptorPath(String aeDescriptorPath) {
+               this.aeDescriptorPath = aeDescriptorPath;
+       }
+
+       /**
+        * Sets the UMLS username.
+        * @param uMLSUser the UMLS username.
+        */
+       public void setUMLSUser(String uMLSUser) {
+               this.UMLSUser = uMLSUser;
+       }
+
+       /**
+        * Sets the UMLS password.
+        * @param uMLSPass the UMLS password.
+        */
+       public void setUMLSPass(String uMLSPass) {
+               this.UMLSPass = uMLSPass;
+       }
+
+       /**
+        * Enables the formatted output for serializer.
+        * @param prettyPrint {@true} to enable formatted output, {@code false} 
otherwise.
+        */
+       public void setPrettyPrint(boolean prettyPrint) {
+               this.prettyPrint = prettyPrint;
+       }
+
+       /**
+        * Sets the type of cTAKES (UIMA) serializer used to write CAS. 
+        * @param serializerType the type of cTAKES serializer.
+        */
+       public void setSerializerType(CTAKESSerializer serializerType) {
+               this.serializerType = serializerType;
+       }
+       
+       /**
+        * Sets the {@see OutputStream} object used to write the CAS.
+        * @param stream the {@see OutputStream} object used to write the CAS.
+        */
+       public void setOutputStream(OutputStream stream) {
+               this.stream = stream;
+       }
+       
+       /**
+        * Enables CAS serialization.
+        * @param serialize {@true} to enable CAS serialization, {@code false} 
otherwise.
+        */
+       public void setSerialize(boolean serialize) {
+               this.serialize = serialize;
+       }
+       
+       /**
+        * Enables content text analysis using cTAKES.
+        * @param text {@true} to enable content text analysis, {@code false} 
otherwise.
+        */
+       public void setText(boolean text) {
+               this.text = text;
+       }
+       
+       /**
+        * Sets the metadata whose values will be analyzed using cTAKES.
+        * @param metadata the metadata whose values will be analyzed using 
cTAKES.
+        */
+       public void setMetadata(String[] metadata) {
+               this.metadata = metadata;
+       }
+       
+       /**
+        * Sets the {@see CTAKESAnnotationProperty}'s that will be included 
into cTAKES metadata.
+        * @param annotationProps the {@see CTAKESAnnotationProperty}'s that 
will be included into cTAKES metadata.
+        */
+       public void setAnnotationProps(CTAKESAnnotationProperty[] 
annotationProps) {
+               this.annotationProps = annotationProps;
+       }
+       
+       /**
+        * ets the {@see CTAKESAnnotationProperty}'s that will be included into 
cTAKES metadata.
+        * @param annotationProps the {@see CTAKESAnnotationProperty}'s that 
will be included into cTAKES metadata.
+        */
+       public void setAnnotationProps(String[] annotationProps) {
+               CTAKESAnnotationProperty[] properties = new 
CTAKESAnnotationProperty[annotationProps.length];
+               for (int i = 0; i < annotationProps.length; i++) {
+                       properties[i] = 
CTAKESAnnotationProperty.valueOf(annotationProps[i]);
+               }
+               setAnnotationProps(properties);
+       }
+       
+       /**
+        * Sets the separator character used for annotation properties.
+        * @param separatorChar the separator character used for annotation 
properties.
+        */
+       public void setSeparatorChar(char separatorChar) {
+               this.separatorChar = separatorChar;
+       }
+}
\ No newline at end of file

Added: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java?rev=1683968&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java
 (added)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java
 Sat Jun  6 23:29:52 2015
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import java.util.Collection;
+import java.util.Iterator;
+
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Class used to extract biomedical information while parsing. 
+ *
+ * <p>
+ * This class relies on <a href="http://ctakes.apache.org/">Apache cTAKES</a>,
+ * a natural language processing system for extraction of information
+ * from electronic medical record clinical free-text.
+ * </p>
+ *
+ */
+public class CTAKESContentHandler extends ContentHandlerDecorator {
+       // Prefix used for metadata including cTAKES annotations
+       public static String CTAKES_META_PREFIX = "ctakes:";
+
+       // Configuration object for CTAKESContentHandler
+       private final CTAKESConfig config;
+
+       // StringBuilder object used to build the clinical free-text for cTAKES
+       private final StringBuilder sb;
+
+       // Metadata object used for cTAKES annotations
+       private final Metadata metadata;
+
+       /**
+        * Creates a new {@link CTAKESContentHandler} for the given {@link ContentHandler} and Metadata objects.
+        * @param handler the {@link ContentHandler} object to be decorated.
+        * @param metadata the {@link Metadata} object that will be populated using biomedical information extracted by cTAKES.
+        * @param config the {@link CTAKESConfig} object used to configure the handler.
+        */
+       public CTAKESContentHandler(ContentHandler handler, Metadata metadata, CTAKESConfig config) {
+               super(handler);
+               this.metadata = metadata;
+               this.config = config;
+               this.sb = new StringBuilder();
+       }
+
+       /**
+        * Creates a new {@link CTAKESContentHandler} for the given {@link ContentHandler} and Metadata objects,
+        * using a default {@link CTAKESConfig}.
+        * @param handler the {@link ContentHandler} object to be decorated.
+        * @param metadata the {@link Metadata} object that will be populated using biomedical information extracted by cTAKES.
+        */
+       public CTAKESContentHandler(ContentHandler handler, Metadata metadata) {
+               this(handler, metadata, new CTAKESConfig());
+       }
+
+       /**
+        * Default constructor. Decorates a {@link DefaultHandler} and populates
+        * a new, empty {@link Metadata} object.
+        */
+       public CTAKESContentHandler() {
+               this(new DefaultHandler(), new Metadata());
+       }
+
+       /**
+        * Buffers the characters for the cTAKES analysis (only when text
+        * analysis is enabled in the configuration) and forwards the event to
+        * the decorated handler.
+        */
+       @Override
+       public void characters(char[] ch, int start, int length) throws SAXException {
+               if (config.isText()) {
+                       sb.append(ch, start, length);
+               }
+               super.characters(ch, start, length);
+       }
+
+       /**
+        * Runs the cTAKES analysis over the configured metadata values followed
+        * by the buffered text, adds the resulting annotations to the metadata,
+        * optionally serializes the result, and finally forwards the event to
+        * the decorated handler.
+        * @throws SAXException if the analysis or the serialization fails; the
+        *         original exception is attached as the cause.
+        */
+       @Override
+       public void endDocument() throws SAXException {
+               try {
+                       // create an Analysis Engine
+                       AnalysisEngine ae = CTAKESUtils.getAnalysisEngine(config.getAeDescriptorPath(), config.getUMLSUser(), config.getUMLSPass());
+
+                       // create a JCas, given an AE
+                       JCas jcas = CTAKESUtils.getJCas(ae);
+
+                       // Text to analyze: the values of the selected metadata (one per
+                       // line) followed by the buffered content. The metadata list is
+                       // null when it was never configured, so guard against it.
+                       StringBuilder documentText = new StringBuilder();
+                       String[] metadataNames = config.getMetadata();
+                       if (metadataNames != null) {
+                               for (String name : metadataNames) {
+                                       for (String value : metadata.getValues(name)) {
+                                               documentText.append(value);
+                                               documentText.append(System.lineSeparator());
+                                       }
+                               }
+                       }
+                       documentText.append(sb);
+
+                       // analyze text
+                       jcas.setDocumentText(documentText.toString());
+                       ae.process(jcas);
+
+                       // add annotations to metadata
+                       metadata.add(CTAKES_META_PREFIX + "schema", config.getAnnotationPropsAsString());
+                       CTAKESAnnotationProperty[] annotationProps = config.getAnnotationProps();
+                       Collection<IdentifiedAnnotation> annotations = JCasUtil.select(jcas, IdentifiedAnnotation.class);
+                       for (IdentifiedAnnotation annotation : annotations) {
+                               StringBuilder annotationBuilder = new StringBuilder();
+                               annotationBuilder.append(annotation.getCoveredText());
+                               if (annotationProps != null) {
+                                       for (CTAKESAnnotationProperty property : annotationProps) {
+                                               annotationBuilder.append(config.getSeparatorChar());
+                                               annotationBuilder.append(CTAKESUtils.getAnnotationProperty(annotation, property));
+                                       }
+                               }
+                               metadata.add(CTAKES_META_PREFIX + annotation.getType().getShortName(), annotationBuilder.toString());
+                       }
+
+                       if (config.isSerialize()) {
+                               // serialize data
+                               CTAKESUtils.serialize(config.getSerializerType(), config.isPrettyPrint(), config.getOutputStream());
+                       }
+               } catch (Exception e) {
+                       // Keep the original exception as the cause instead of printing
+                       // the stack trace and discarding it.
+                       throw new SAXException("Unable to extract cTAKES annotations: " + e.getMessage(), e);
+               }
+
+               // A decorator must pass the event on, otherwise the decorated
+               // handler never learns that the document has ended.
+               super.endDocument();
+       }
+
+       /**
+        * Returns metadata that includes cTAKES annotations.
+        * @return {@link Metadata} object that includes cTAKES annotations.
+        */
+       public Metadata getMetadata() {
+               return metadata;
+       }
+}

Added: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java?rev=1683968&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
 (added)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
 Sat Jun  6 23:29:52 2015
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.ParserDecorator;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * CTAKESParser decorates {@link AutoDetectParser} and leverages
+ * {@link CTAKESContentHandler} to extract biomedical information from
+ * clinical text using Apache cTAKES.
+ * 
+ */
+public class CTAKESParser extends ParserDecorator {
+       /**
+        * Serial version UID
+        */
+       private static final long serialVersionUID = -2313482748027097961L;
+
+       /**
+        * Default constructor. Decorates a new {@link AutoDetectParser}.
+        */
+       public CTAKESParser() {
+               super(new AutoDetectParser());
+       }
+
+       /**
+        * Parses the stream with the decorated parser, wrapping the given
+        * handler in a {@link CTAKESContentHandler} so that cTAKES annotations
+        * are added to the metadata.
+        * @param stream the document stream to parse.
+        * @param handler the handler that receives the parse events.
+        * @param metadata the metadata passed to the decorated parser and
+        *        populated with the cTAKES annotations.
+        * @param context the parse context; a {@link CTAKESConfig} stored in it
+        *        is used if present, otherwise a default one is created.
+        */
+       @Override
+       public void parse(InputStream stream, ContentHandler handler,
+                       Metadata metadata, ParseContext context) throws IOException,
+                       SAXException, TikaException {
+               // Create the default configuration only when the context has none:
+               // building one reads the properties resource, which is wasted
+               // work when the caller already supplied a configuration.
+               CTAKESConfig config = context.get(CTAKESConfig.class);
+               if (config == null) {
+                       config = new CTAKESConfig();
+               }
+               CTAKESContentHandler ctakesHandler = new CTAKESContentHandler(handler,
+                               metadata, config);
+               super.parse(stream, ctakesHandler, metadata, context);
+       }
+}

Added: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java?rev=1683968&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java
 (added)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java
 Sat Jun  6 23:29:52 2015
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import org.apache.uima.cas.impl.XCASSerializer;
+import org.apache.uima.cas.impl.XmiCasSerializer;
+import org.apache.uima.util.XmlCasSerializer;
+
+/**
+ * Enumeration for types of cTAKES (UIMA) CAS serializer supported by cTAKES.
+ * 
+ * A CAS serializer writes a CAS in the given format.
+ *
+ */
+public enum CTAKESSerializer {
+       XCAS(XCASSerializer.class.getName()),
+       XMI(XmiCasSerializer.class.getName()),
+       XML(XmlCasSerializer.class.getName());
+
+       // Class name of the serializer that backs this constant
+       private final String className;
+
+       /**
+        * Binds the constant to the name of its serializer class.
+        * @param serializerClassName the name of the serializer class.
+        */
+       private CTAKESSerializer(String serializerClassName) {
+               className = serializerClassName;
+       }
+
+       /**
+        * Returns the name of the serializer class bound to this constant.
+        * @return the name of the serializer class.
+        */
+       public String getClassName() {
+               return this.className;
+       }
+}

Added: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java?rev=1683968&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java
 (added)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java
 Sat Jun  6 23:29:52 2015
@@ -0,0 +1,242 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.net.URISyntaxException;
+
+import org.apache.ctakes.typesystem.type.refsem.UmlsConcept;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.UIMAFramework;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.cas.impl.XCASSerializer;
+import org.apache.uima.cas.impl.XmiCasSerializer;
+import org.apache.uima.cas.impl.XmiSerializationSharedData;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.resource.ResourceSpecifier;
+import org.apache.uima.util.InvalidXMLException;
+import org.apache.uima.util.XMLInputSource;
+import org.apache.uima.util.XmlCasSerializer;
+import org.xml.sax.SAXException;
+
+/**
+ * This class provides methods to extract biomedical information from plain
+ * text using {@link CTAKESContentHandler}, which relies on Apache cTAKES.
+ * 
+ * <p>
+ * Apache cTAKES is built on top of the <a href="https://uima.apache.org/">Apache
+ * UIMA</a> framework and the <a href="https://opennlp.apache.org/">OpenNLP</a>
+ * toolkit.
+ * </p>
+ *
+ */
+public class CTAKESUtils {
+       // Shared UIMA Analysis Engine; null until one has been created
+       private static AnalysisEngine ae;
+
+       // Shared JCas for working with the CAS (Common Analysis System);
+       // null until one has been created
+       private static JCas jcas;
+
+       // Name of the system property that carries the UMLS username
+       private static final String CTAKES_UMLS_USER = "ctakes.umlsuser";
+
+       // Name of the system property that carries the UMLS password
+       private static final String CTAKES_UMLS_PASS = "ctakes.umlspw";
+
+       /**
+        * Returns the shared UIMA Analysis Engine (AE), creating it on the first
+        * call. Once an AE exists, later calls return that same instance and
+        * their arguments are not consulted again.
+        * 
+        * <p>
+        * An Analysis Engine is a component responsible for analyzing
+        * unstructured information, discovering and representing semantic
+        * content. Unstructured information includes, but is not restricted to,
+        * text documents.
+        * </p>
+        * 
+        * <p>
+        * When both {@code umlsUser} and {@code umlsPass} are non-empty they are
+        * published as the {@code ctakes.umlsuser} and {@code ctakes.umlspw}
+        * system properties before the AE is produced.
+        * </p>
+        * 
+        * @param aeDescriptor
+        *            name of the classpath resource (resolved relative to this
+        *            class) holding the XML AnalysisEngineDescription that
+        *            contains all of the information needed to instantiate and
+        *            use an AnalysisEngine.
+        * @param umlsUser
+        *            UMLS username for NLM database
+        * @param umlsPass
+        *            UMLS password for NLM database
+        * @return an Analysis Engine for analyzing unstructured information.
+        * @throws IOException
+        *             if the descriptor cannot be found on the classpath or any
+        *             other I/O error occurs.
+        * @throws InvalidXMLException
+        *             if the input XML is not valid or does not specify a valid
+        *             ResourceSpecifier.
+        * @throws ResourceInitializationException
+        *             if a failure occurred during production of the resource.
+        * @throws URISyntaxException
+        *             kept in the signature for backward compatibility; the
+        *             descriptor is now read through its URL, so this
+        *             implementation does not raise it.
+        */
+       public static AnalysisEngine getAnalysisEngine(String aeDescriptor,
+                       String umlsUser, String umlsPass) throws IOException,
+                       InvalidXMLException, ResourceInitializationException,
+                       URISyntaxException {
+               if (ae == null) {
+                       // Locate the descriptor as a URL instead of a file path: the path
+                       // of a resource that lives inside a JAR is null, which made the
+                       // descriptor unreadable from a packaged artifact.
+                       java.net.URL aeDescriptorUrl = CTAKESUtils.class
+                                       .getResource(aeDescriptor);
+                       if (aeDescriptorUrl == null) {
+                               throw new IOException(
+                                               "Unable to find Analysis Engine descriptor on the classpath: "
+                                                               + aeDescriptor);
+                       }
+
+                       // get Resource Specifier from XML, releasing the underlying
+                       // stream as soon as the descriptor has been parsed
+                       XMLInputSource aeInputSource = new XMLInputSource(aeDescriptorUrl);
+                       ResourceSpecifier aeSpecifier;
+                       try {
+                               aeSpecifier = UIMAFramework.getXMLParser()
+                                               .parseResourceSpecifier(aeInputSource);
+                       } finally {
+                               aeInputSource.close();
+                       }
+
+                       // UMLS user ID and password
+                       if ((umlsUser != null) && (!umlsUser.isEmpty())
+                                       && (umlsPass != null) && (!umlsPass.isEmpty())) {
+                               /*
+                                * It is highly recommended that you change UMLS credentials in
+                                * the XML configuration file instead of giving user and
+                                * password using CTAKESConfig.
+                                */
+                               System.setProperty(CTAKES_UMLS_USER, umlsUser);
+                               System.setProperty(CTAKES_UMLS_PASS, umlsPass);
+                       }
+
+                       // create AE
+                       ae = UIMAFramework.produceAnalysisEngine(aeSpecifier);
+               }
+               return ae;
+       }
+
+       /**
+        * Returns the shared JCas, creating it from the given Analysis Engine on
+        * the first call. Once a JCas exists, later calls return that same
+        * instance. A JCas is a Java Cover Classes based Object-oriented CAS
+        * (Common Analysis System) API.
+        * 
+        * <p>
+        * Important: It is highly recommended that you reuse CAS objects rather
+        * than creating new CAS objects prior to each analysis. This is because
+        * CAS objects may be expensive to create and may consume a significant
+        * amount of memory.
+        * </p>
+        * 
+        * @param ae
+        *            AnalysisEngine used to create an appropriate JCas object.
+        * @return a JCas object appropriate for the given AnalysisEngine.
+        * @throws ResourceInitializationException
+        *             if a CAS could not be created because this
+        *             AnalysisEngine's CAS metadata (type system, type
+        *             priorities, or FS indexes) are invalid.
+        */
+       public static JCas getJCas(AnalysisEngine ae)
+                       throws ResourceInitializationException {
+               // Reuse the JCas that was created by an earlier call, if any.
+               if (jcas != null) {
+                       return jcas;
+               }
+               jcas = ae.newJCas();
+               return jcas;
+       }
+
+       /**
+        * Writes the CAS of the shared JCas to the given stream in the requested
+        * format.
+        * 
+        * @param type
+        *            type of cTAKES (UIMA) serializer used to write CAS. Any
+        *            value other than XCAS or XMI selects the XML serializer.
+        * @param prettyPrint
+        *            {@code true} to do pretty printing of output. It is passed
+        *            to the XCAS and XMI serializers only.
+        * @param stream
+        *            {@link OutputStream} object used to print out information
+        *            extracted by using cTAKES.
+        * @throws SAXException
+        *             if there was a SAX exception.
+        * @throws IOException
+        *             if any I/O error occurs.
+        */
+       public static void serialize(CTAKESSerializer type, boolean prettyPrint,
+                       OutputStream stream) throws SAXException, IOException {
+               if (type == CTAKESSerializer.XCAS) {
+                       XCASSerializer.serialize(jcas.getCas(), stream, prettyPrint);
+                       return;
+               }
+
+               if (type == CTAKESSerializer.XMI) {
+                       XmiCasSerializer.serialize(jcas.getCas(), jcas.getTypeSystem(),
+                                       stream, prettyPrint, new XmiSerializationSharedData());
+                       return;
+               }
+
+               // Every remaining value is written with the XML serializer.
+               XmlCasSerializer.serialize(jcas.getCas(), jcas.getTypeSystem(), stream);
+       }
+
+       /**
+        * Returns the annotation value based on the given annotation type. 
+        * @param annotation {@see IdentifiedAnnotation} object. 
+        * @param property {@see CTAKESAnnotationProperty} enum used to 
identify the annotation type.
+        * @return the annotation value.
+        */
+       public static String getAnnotationProperty(IdentifiedAnnotation 
annotation,
+                       CTAKESAnnotationProperty property) {
+               String value = null;
+               if (property == CTAKESAnnotationProperty.BEGIN) {
+                       value = Integer.toString(annotation.getBegin());
+               } else if (property == CTAKESAnnotationProperty.END) {
+                       value = Integer.toString(annotation.getEnd());
+               } else if (property == CTAKESAnnotationProperty.CONDITIONAL) {
+                       value = Boolean.toString(annotation.getConditional());
+               } else if (property == CTAKESAnnotationProperty.CONFIDENCE) {
+                       value = Float.toString(annotation.getConfidence());
+               } else if (property == 
CTAKESAnnotationProperty.DISCOVERY_TECNIQUE) {
+                       value = 
Integer.toString(annotation.getDiscoveryTechnique());
+               } else if (property == CTAKESAnnotationProperty.GENERIC) {
+                       value = Boolean.toString(annotation.getGeneric());
+               } else if (property == CTAKESAnnotationProperty.HISTORY_OF) {
+                       value = Integer.toString(annotation.getHistoryOf());
+               } else if (property == CTAKESAnnotationProperty.ID) {
+                       value = Integer.toString(annotation.getId());
+               } else if (property == 
CTAKESAnnotationProperty.ONTOLOGY_CONCEPT_ARR) {
+                       FSArray mentions = annotation.getOntologyConceptArr();
+                       StringBuilder sb = new StringBuilder();
+                       if (mentions != null) {
+                               for (int i = 0; i < mentions.size(); i++) {
+                                       if (mentions.get(i) instanceof 
UmlsConcept) {
+                                               UmlsConcept concept = 
(UmlsConcept) mentions.get(i);
+                                               sb.append(concept.getCui());
+                                               if (i < mentions.size()-1) {
+                                                       sb.append(",");
+                                               }
+                                       }
+                               }
+                       }
+                       value = sb.toString();
+               } else if (property == CTAKESAnnotationProperty.POLARITY) {
+                       value = Integer.toString(annotation.getPolarity());
+               }
+               return value;
+       }
+
+       /**
+        * Resets cTAKES objects, if created. This method ensures that new cTAKES
+        * objects (a.k.a., Analysis Engine and JCas) will be created if getters
+        * of this class are called.
+        * 
+        * <p>
+        * Objects that were never created are skipped, so calling this method
+        * before any getter, or calling it twice in a row, is a no-op.
+        * </p>
+        */
+       public static void reset() {
+               // Analysis Engine
+               if (ae != null) {
+                       ae.destroy();
+                       ae = null;
+               }
+
+               // JCas
+               if (jcas != null) {
+                       jcas.reset();
+                       jcas = null;
+               }
+       }
+}

Modified: 
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1683968&r1=1683967&r2=1683968&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
 (original)
+++ 
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
 Sat Jun  6 23:29:52 2015
@@ -65,3 +65,4 @@ org.apache.tika.parser.isatab.ISArchiveP
 org.apache.tika.parser.geoinfo.GeographicInformationParser
 org.apache.tika.parser.geo.topic.GeoParser
 org.apache.tika.parser.external.CompositeExternalParser
+org.apache.tika.parser.ctakes.CTAKESParser
\ No newline at end of file


Reply via email to