Author: totaro
Date: Fri Jun 19 23:50:14 2015
New Revision: 1686518
URL: http://svn.apache.org/r1686518
Log:
TIKA-1654: Reset cTAKES CAS into CTAKESParser
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java?rev=1686518&r1=1686517&r2=1686518&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java Fri Jun 19 23:50:14 2015
@@ -30,72 +30,95 @@ import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
- * Class used to extract biomedical information while parsing.
+ * Class used to extract biomedical information while parsing.
*
* <p>
- * This class relies on <a href="http://ctakes.apache.org/">Apache cTAKES</a>
- * that is a natural language processing system for extraction of information
+ * This class relies on <a href="http://ctakes.apache.org/">Apache cTAKES</a>
+ * that is a natural language processing system for extraction of information
* from electronic medical record clinical free-text.
* </p>
*/
public class CTAKESContentHandler extends ContentHandlerDecorator {
- // Prefix used for metadata including cTAKES annotations
- public static String CTAKES_META_PREFIX = "ctakes:";
+ // Prefix used for metadata including cTAKES annotations
+ public static String CTAKES_META_PREFIX = "ctakes:";
- // Configuration object for CTAKESContentHandler
- private CTAKESConfig config = null;
+ // Configuration object for CTAKESContentHandler
+ private CTAKESConfig config = null;
- // StringBuilder object used to build the clinical free-text for cTAKES
- private StringBuilder sb = null;
+ // StringBuilder object used to build the clinical free-text for cTAKES
+ private StringBuilder sb = null;
- // Metadata object used for cTAKES annotations
- private Metadata metadata = null;
-
- /**
- * Creates a new {@see CTAKESContentHandler} for the given {@see ContentHandler} and Metadata objects.
- * @param handler the {@see ContentHandler} object to be decorated.
- * @param metadata the {@see Metadata} object that will be populated using biomedical information extracted by cTAKES.
- * @param config the {@see CTAKESConfig} object used to configure the handler.
- */
- public CTAKESContentHandler(ContentHandler handler, Metadata metadata, CTAKESConfig config) {
- super(handler);
- this.metadata = metadata;
- this.config = config;
- this.sb = new StringBuilder();
- }
-
- /**
- * Creates a new {@see CTAKESContentHandler} for the given {@see ContentHandler} and Metadata objects.
- * @param handler the {@see ContentHandler} object to be decorated.
- * @param metadata the {@see Metadata} object that will be populated using biomedical information extracted by cTAKES.
- */
- public CTAKESContentHandler(ContentHandler handler, Metadata metadata) {
- this(handler, metadata, new CTAKESConfig());
- }
-
- /**
- * Default constructor.
- */
- public CTAKESContentHandler() {
- this(new DefaultHandler(), new Metadata());
- }
+ // Metadata object used for cTAKES annotations
+ private Metadata metadata = null;
+
+ // UIMA Analysis Engine
+ private AnalysisEngine ae = null;
+
+ // JCas object for working with the CAS (Common Analysis System)
+ private JCas jcas = null;
+
+ /**
+ * Creates a new {@see CTAKESContentHandler} for the given {@see
+ * ContentHandler} and Metadata objects.
+ *
+ * @param handler
+ * the {@see ContentHandler} object to be decorated.
+ * @param metadata
+ * the {@see Metadata} object that will be populated using
+ * biomedical information extracted by cTAKES.
+ * @param config
+ * the {@see CTAKESConfig} object used to configure the
handler.
+ */
+ public CTAKESContentHandler(ContentHandler handler, Metadata metadata,
+ CTAKESConfig config) {
+ super(handler);
+ this.metadata = metadata;
+ this.config = config;
+ this.sb = new StringBuilder();
+ }
+
+ /**
+ * Creates a new {@see CTAKESContentHandler} for the given {@see
+ * ContentHandler} and Metadata objects.
+ *
+ * @param handler
+ * the {@see ContentHandler} object to be decorated.
+ * @param metadata
+ * the {@see Metadata} object that will be populated using
+ * biomedical information extracted by cTAKES.
+ */
+ public CTAKESContentHandler(ContentHandler handler, Metadata metadata) {
+ this(handler, metadata, new CTAKESConfig());
+ }
+
+ /**
+ * Default constructor.
+ */
+ public CTAKESContentHandler() {
+ this(new DefaultHandler(), new Metadata());
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length)
+ throws SAXException {
+ if (config.isText()) {
+ sb.append(ch, start, length);
+ }
+ super.characters(ch, start, length);
+ }
- @Override
- public void characters(char[] ch, int start, int length) throws SAXException {
- if (config.isText()) {
- sb.append(ch, start, length);
- }
- super.characters(ch, start, length);
- }
-
- @Override
+ @Override
public void endDocument() throws SAXException {
try {
// create an Analysis Engine
- AnalysisEngine ae = CTAKESUtils.getAnalysisEngine(config.getAeDescriptorPath(), config.getUMLSUser(), config.getUMLSPass());
+ if (ae == null) {
+ ae = CTAKESUtils.getAnalysisEngine(config.getAeDescriptorPath(), config.getUMLSUser(), config.getUMLSPass());
+ }
// create a JCas, given an AE
- JCas jcas = CTAKESUtils.getJCas(ae);
+ if (jcas == null) {
+ jcas = CTAKESUtils.getJCas(ae);
+ }
// get metadata to process
StringBuilder metaText = new StringBuilder();
@@ -133,20 +156,21 @@ public class CTAKESContentHandler extend
if (config.isSerialize()) {
// serialize data
- CTAKESUtils.serialize(config.getSerializerType(), config.isPrettyPrint(), config.getOutputStream());
+ CTAKESUtils.serialize(jcas, config.getSerializerType(), config.isPrettyPrint(), config.getOutputStream());
}
} catch (Exception e) {
throw new SAXException(e.getMessage());
} finally {
- CTAKESUtils.resetCAS();
+ CTAKESUtils.resetCAS(jcas);
}
}
- /**
- * Returns metadata that includes cTAKES annotations.
- * @return {@Metadata} object that includes cTAKES annotations.
- */
- public Metadata getMetadata() {
- return metadata;
- }
+ /**
+ * Returns metadata that includes cTAKES annotations.
+ *
+ * @return {@Metadata} object that includes cTAKES annotations.
+ */
+ public Metadata getMetadata() {
+ return metadata;
+ }
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java?rev=1686518&r1=1686517&r2=1686518&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java Fri Jun 19 23:50:14 2015
@@ -85,7 +85,7 @@ public class CTAKESParser extends Parser
super.parse(stream, ctakesHandler, metadata, context);
}
- @Override
+ //@Override
public String getDecorationName() {
return "CTakes";
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java?rev=1686518&r1=1686517&r2=1686518&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java Fri Jun 19 23:50:14 2015
@@ -47,220 +47,219 @@ import org.xml.sax.SAXException;
* </p>
*/
public class CTAKESUtils {
- // UIMA Analysis Engine
- private static AnalysisEngine ae = null;
+ // UMLS username property
+ private final static String CTAKES_UMLS_USER = "ctakes.umlsuser";
- // JCas object for working with the CAS (Common Analysis System)
- private static JCas jcas = null;
+ // UMLS password property
+ private final static String CTAKES_UMLS_PASS = "ctakes.umlspw";
- // UMLS username property
- private final static String CTAKES_UMLS_USER = "ctakes.umlsuser";
-
- // UMLS password property
- private final static String CTAKES_UMLS_PASS = "ctakes.umlspw";
-
- /**
- * Returns a new UIMA Analysis Engine (AE). This method ensures that only
- * one instance of an AE is created.
- *
- * <p>
- * An Analysis Engine is a component responsible for analyzing unstructured
- * information, discovering and representing semantic content. Unstructured
- * information includes, but is not restricted to, text documents.
- * </p>
- *
- * @param aeDescriptor
- * pathname for XML file including an AnalysisEngineDescription
- * that contains all of the information needed to instantiate and
- * use an AnalysisEngine.
- * @param umlsUser
- * UMLS username for NLM database
- * @param umlsPass
- * UMLS password for NLM database
- * @return an Analysis Engine for analyzing unstructured information.
- * @throws IOException
- * if any I/O error occurs.
- * @throws InvalidXMLException
- * if the input XML is not valid or does not specify a valid
- * ResourceSpecifier.
- * @throws ResourceInitializationException
- * if a failure occurred during production of the resource.
- * @throws URISyntaxException
- * if URL of the resource is not formatted strictly according to
- * to RFC2396 and cannot be converted to a URI.
- */
- public static AnalysisEngine getAnalysisEngine(String aeDescriptor,
- String umlsUser, String umlsPass) throws IOException,
- InvalidXMLException, ResourceInitializationException,
- URISyntaxException {
- if (ae == null) {
- // UMLS user ID and password.
- String aeDescriptorPath = CTAKESUtils.class
- .getResource(aeDescriptor).toURI().getPath();
-
- // get Resource Specifier from XML
- XMLInputSource aeIputSource = new XMLInputSource(aeDescriptorPath);
- ResourceSpecifier aeSpecifier = UIMAFramework.getXMLParser()
- .parseResourceSpecifier(aeIputSource);
-
- // UMLS user ID and password
- if ((umlsUser != null) && (!umlsUser.isEmpty())
- && (umlsPass != null) && (!umlsPass.isEmpty())) {
- /*
- * It is highly recommended that you change UMLS credentials in
- * the XML configuration file instead of giving user and
- * password using CTAKESConfig.
- */
- System.setProperty(CTAKES_UMLS_USER, umlsUser);
- System.setProperty(CTAKES_UMLS_PASS, umlsPass);
- }
-
- // create AE
- ae = UIMAFramework.produceAnalysisEngine(aeSpecifier);
- }
- return ae;
- }
-
- /**
- * Returns a new JCas () appropriate for the given Analysis Engine. This
- * method ensures that only one instance of a JCas is created. A Jcas is a
- * Java Cover Classes based Object-oriented CAS (Common Analysis System)
- * API.
- *
- * <p>
- * Important: It is highly recommended that you reuse CAS objects rather
- * than creating new CAS objects prior to each analysis. This is because CAS
- * objects may be expensive to create and may consume a significant amount
- * of memory.
- * </p>
- *
- * @param ae
- * AnalysisEngine used to create an appropriate JCas object.
- * @return a JCas object appropriate for the given AnalysisEngine.
- * @throws ResourceInitializationException
- * if a CAS could not be created because this AnalysisEngine's
- * CAS metadata (type system, type priorities, or FS indexes)
- * are invalid.
- */
- public static JCas getJCas(AnalysisEngine ae)
- throws ResourceInitializationException {
- if (jcas == null) {
- jcas = ae.newJCas();
- }
- return jcas;
- }
-
- /**
- * Serializes a CAS in the given format.
- *
- * @param type
- * type of cTAKES (UIMA) serializer used to write CAS.
- * @param prettyPrint
- * {@code true} to do pretty printing of output.
- * @param stream
- * {@see OutputStream} object used to print out information
- * extracted by using cTAKES.
- * @throws SAXException
- * if there was a SAX exception.
- * @throws IOException
- * if any I/O error occurs.
- */
- public static void serialize(CTAKESSerializer type, boolean prettyPrint,
- OutputStream stream) throws SAXException, IOException {
- if (type == CTAKESSerializer.XCAS) {
- XCASSerializer.serialize(jcas.getCas(), stream, prettyPrint);
- } else if (type == CTAKESSerializer.XMI) {
- XmiCasSerializer.serialize(jcas.getCas(), jcas.getTypeSystem(),
- stream, prettyPrint, new XmiSerializationSharedData());
- } else {
- XmlCasSerializer.serialize(jcas.getCas(), jcas.getTypeSystem(),
- stream);
- }
- }
-
- /**
- * Returns the annotation value based on the given annotation type.
- * @param annotation {@see IdentifiedAnnotation} object.
- * @param property {@see CTAKESAnnotationProperty} enum used to identify the annotation type.
- * @return the annotation value.
- */
- public static String getAnnotationProperty(IdentifiedAnnotation annotation,
- CTAKESAnnotationProperty property) {
- String value = null;
- if (property == CTAKESAnnotationProperty.BEGIN) {
- value = Integer.toString(annotation.getBegin());
- } else if (property == CTAKESAnnotationProperty.END) {
- value = Integer.toString(annotation.getEnd());
- } else if (property == CTAKESAnnotationProperty.CONDITIONAL) {
- value = Boolean.toString(annotation.getConditional());
- } else if (property == CTAKESAnnotationProperty.CONFIDENCE) {
- value = Float.toString(annotation.getConfidence());
- } else if (property == CTAKESAnnotationProperty.DISCOVERY_TECNIQUE) {
- value = Integer.toString(annotation.getDiscoveryTechnique());
- } else if (property == CTAKESAnnotationProperty.GENERIC) {
- value = Boolean.toString(annotation.getGeneric());
- } else if (property == CTAKESAnnotationProperty.HISTORY_OF) {
- value = Integer.toString(annotation.getHistoryOf());
- } else if (property == CTAKESAnnotationProperty.ID) {
- value = Integer.toString(annotation.getId());
- } else if (property == CTAKESAnnotationProperty.ONTOLOGY_CONCEPT_ARR) {
- FSArray mentions = annotation.getOntologyConceptArr();
- StringBuilder sb = new StringBuilder();
- if (mentions != null) {
- for (int i = 0; i < mentions.size(); i++) {
- if (mentions.get(i) instanceof UmlsConcept) {
- UmlsConcept concept = (UmlsConcept) mentions.get(i);
- sb.append(concept.getCui());
- if (i < mentions.size()-1) {
- sb.append(",");
- }
- }
- }
- }
- value = sb.toString();
- } else if (property == CTAKESAnnotationProperty.POLARITY) {
- value = Integer.toString(annotation.getPolarity());
- }
- return value;
- }
-
- /**
- * Resets cTAKES objects, if created. This method ensures that new cTAKES
- * objects (a.k.a., Analysis Engine and JCas) will be created if getters of
- * this class are called.
- */
- public static void reset() {
- // Analysis Engine
- ae.destroy();
- ae = null;
-
- // JCas
- jcas.reset();
- jcas = null;
- }
-
- /**
- * Resets the CAS (Common Analysis System), emptying it of all content.
- */
- public static void resetCAS() {
- if (jcas != null) {
- jcas.reset();
- }
- }
-
- /**
- * Resets the AE (AnalysisEngine), releasing all resources held by the
- * current AE.
- */
- public static void resetAE() {
- if (ae != null) {
- ae.destroy();
- ae = null;
- }
-
- if (jcas != null) {
- jcas.reset();
- jcas = null;
- }
- }
+ /**
+ * Returns a new UIMA Analysis Engine (AE). This method ensures that only
+ * one instance of an AE is created.
+ *
+ * <p>
+ * An Analysis Engine is a component responsible for analyzing unstructured
+ * information, discovering and representing semantic content. Unstructured
+ * information includes, but is not restricted to, text documents.
+ * </p>
+ *
+ * @param aeDescriptor
+ * pathname for XML file including an AnalysisEngineDescription
+ * that contains all of the information needed to instantiate and
+ * use an AnalysisEngine.
+ * @param umlsUser
+ * UMLS username for NLM database
+ * @param umlsPass
+ * UMLS password for NLM database
+ * @return an Analysis Engine for analyzing unstructured information.
+ * @throws IOException
+ * if any I/O error occurs.
+ * @throws InvalidXMLException
+ * if the input XML is not valid or does not specify a valid
+ * ResourceSpecifier.
+ * @throws ResourceInitializationException
+ * if a failure occurred during production of the resource.
+ * @throws URISyntaxException
+ * if URL of the resource is not formatted strictly according to
+ * to RFC2396 and cannot be converted to a URI.
+ */
+ public static AnalysisEngine getAnalysisEngine(String aeDescriptor,
+ String umlsUser, String umlsPass) throws IOException,
+ InvalidXMLException, ResourceInitializationException,
+ URISyntaxException {
+ // UMLS user ID and password.
+ String aeDescriptorPath = CTAKESUtils.class.getResource(aeDescriptor)
+ .toURI().getPath();
+
+ // get Resource Specifier from XML
+ XMLInputSource aeIputSource = new XMLInputSource(aeDescriptorPath);
+ ResourceSpecifier aeSpecifier = UIMAFramework.getXMLParser()
+ .parseResourceSpecifier(aeIputSource);
+
+ // UMLS user ID and password
+ if ((umlsUser != null) && (!umlsUser.isEmpty()) && (umlsPass != null)
+ && (!umlsPass.isEmpty())) {
+ /*
+ * It is highly recommended that you change UMLS credentials in the
+ * XML configuration file instead of giving user and password using
+ * CTAKESConfig.
+ */
+ System.setProperty(CTAKES_UMLS_USER, umlsUser);
+ System.setProperty(CTAKES_UMLS_PASS, umlsPass);
+ }
+
+ // create AE
+ AnalysisEngine ae = UIMAFramework.produceAnalysisEngine(aeSpecifier);
+
+ return ae;
+ }
+
+ /**
+ * Returns a new JCas () appropriate for the given Analysis Engine. This
+ * method ensures that only one instance of a JCas is created. A Jcas is a
+ * Java Cover Classes based Object-oriented CAS (Common Analysis System)
+ * API.
+ *
+ * <p>
+ * Important: It is highly recommended that you reuse CAS objects rather
+ * than creating new CAS objects prior to each analysis. This is because CAS
+ * objects may be expensive to create and may consume a significant amount
+ * of memory.
+ * </p>
+ *
+ * @param ae
+ * AnalysisEngine used to create an appropriate JCas object.
+ * @return a JCas object appropriate for the given AnalysisEngine.
+ * @throws ResourceInitializationException
+ * if a CAS could not be created because this AnalysisEngine's
+ * CAS metadata (type system, type priorities, or FS indexes)
+ * are invalid.
+ */
+ public static JCas getJCas(AnalysisEngine ae)
+ throws ResourceInitializationException {
+ JCas jcas = ae.newJCas();
+
+ return jcas;
+ }
+
+ /**
+ * Serializes a CAS in the given format.
+ *
+ * @param jcas
+ * CAS (Common Analysis System) to be serialized.
+ * @param type
+ * type of cTAKES (UIMA) serializer used to write CAS.
+ * @param prettyPrint
+ * {@code true} to do pretty printing of output.
+ * @param stream
+ * {@see OutputStream} object used to print out information
+ * extracted by using cTAKES.
+ * @throws SAXException
+ * if there was a SAX exception.
+ * @throws IOException
+ * if any I/O error occurs.
+ */
+ public static void serialize(JCas jcas, CTAKESSerializer type, boolean prettyPrint,
+ OutputStream stream) throws SAXException, IOException {
+ if (type == CTAKESSerializer.XCAS) {
+ XCASSerializer.serialize(jcas.getCas(), stream, prettyPrint);
+ } else if (type == CTAKESSerializer.XMI) {
+ XmiCasSerializer.serialize(jcas.getCas(), jcas.getTypeSystem(),
+ stream, prettyPrint, new XmiSerializationSharedData());
+ } else {
+ XmlCasSerializer.serialize(jcas.getCas(), jcas.getTypeSystem(),
+ stream);
+ }
+ }
+
+ /**
+ * Returns the annotation value based on the given annotation type.
+ *
+ * @param annotation
+ * {@see IdentifiedAnnotation} object.
+ * @param property
+ * {@see CTAKESAnnotationProperty} enum used to identify the
+ * annotation type.
+ * @return the annotation value.
+ */
+ public static String getAnnotationProperty(IdentifiedAnnotation annotation,
+ CTAKESAnnotationProperty property) {
+ String value = null;
+ if (property == CTAKESAnnotationProperty.BEGIN) {
+ value = Integer.toString(annotation.getBegin());
+ } else if (property == CTAKESAnnotationProperty.END) {
+ value = Integer.toString(annotation.getEnd());
+ } else if (property == CTAKESAnnotationProperty.CONDITIONAL) {
+ value = Boolean.toString(annotation.getConditional());
+ } else if (property == CTAKESAnnotationProperty.CONFIDENCE) {
+ value = Float.toString(annotation.getConfidence());
+ } else if (property == CTAKESAnnotationProperty.DISCOVERY_TECNIQUE) {
+ value = Integer.toString(annotation.getDiscoveryTechnique());
+ } else if (property == CTAKESAnnotationProperty.GENERIC) {
+ value = Boolean.toString(annotation.getGeneric());
+ } else if (property == CTAKESAnnotationProperty.HISTORY_OF) {
+ value = Integer.toString(annotation.getHistoryOf());
+ } else if (property == CTAKESAnnotationProperty.ID) {
+ value = Integer.toString(annotation.getId());
+ } else if (property == CTAKESAnnotationProperty.ONTOLOGY_CONCEPT_ARR) {
+ FSArray mentions = annotation.getOntologyConceptArr();
+ StringBuilder sb = new StringBuilder();
+ if (mentions != null) {
+ for (int i = 0; i < mentions.size(); i++) {
+ if (mentions.get(i) instanceof UmlsConcept) {
+ UmlsConcept concept = (UmlsConcept) mentions.get(i);
+ sb.append(concept.getCui());
+ if (i < mentions.size() - 1) {
+ sb.append(",");
+ }
+ }
+ }
+ }
+ value = sb.toString();
+ } else if (property == CTAKESAnnotationProperty.POLARITY) {
+ value = Integer.toString(annotation.getPolarity());
+ }
+ return value;
+ }
+
+ /**
+ * Resets cTAKES objects, if created. This method ensures that new cTAKES
+ * objects (a.k.a., Analysis Engine and JCas) will be created if getters of
+ * this class are called.
+ *
+ * @param ae UIMA Analysis Engine
+ * @param jcas JCas object
+ */
+ public static void reset(AnalysisEngine ae, JCas jcas) {
+ // Analysis Engine
+ resetAE(ae);
+
+ // JCas
+ resetCAS(jcas);
+ jcas = null;
+ }
+
+ /**
+ * Resets the CAS (Common Analysis System), emptying it of all content.
+ *
+ * @param jcas JCas object
+ */
+ public static void resetCAS(JCas jcas) {
+ if (jcas != null) {
+ jcas.reset();
+ }
+ }
+
+ /**
+ * Resets the AE (AnalysisEngine), releasing all resources held by the
+ * current AE.
+ *
+ * @param ae UIMA Analysis Engine
+ */
+ public static void resetAE(AnalysisEngine ae) {
+ if (ae != null) {
+ ae.destroy();
+ ae = null;
+ }
+ }
}