Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java?rev=1765496&r1=1765495&r2=1765496&view=diff ============================================================================== --- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java (original) +++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java Tue Oct 18 18:57:29 2016 @@ -109,297 +109,297 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; public abstract class Evaluation_ImplBase<STATISTICS_TYPE> extends - org.cleartk.eval.Evaluation_ImplBase<Integer, STATISTICS_TYPE> { +org.cleartk.eval.Evaluation_ImplBase<Integer, STATISTICS_TYPE> { - private static Logger LOGGER = Logger.getLogger( Evaluation_ImplBase.class ); + static Logger LOGGER = Logger.getLogger( Evaluation_ImplBase.class ); - private static final String LOOKUP_PATH = "/org/apache/ctakes/temporal/badEEContainNotes.txt"; + private static final String LOOKUP_PATH = "/org/apache/ctakes/temporal/badEEContainNotes.txt"; - private static boolean isTraining; + private static boolean isTraining; - public static HashSet<String> badNotes; - - public static final String GOLD_VIEW_NAME = "GoldView"; - - public static final String PROB_VIEW_NAME = "ProbView"; - - public enum XMLFormat {Knowtator, Anafora, I2B2} - - public enum Subcorpus {Colon, Brain, DeepPhe} - - public static interface Options { - - @Option( longName = "text", defaultToNull = true ) - public File getRawTextDirectory(); - - @Option( longName = "xml" ) - public File getXMLDirectory(); - - @Option( longName = "format", defaultValue = "Anafora" ) - public XMLFormat getXMLFormat(); - - @Option( longName = "subcorpus", defaultValue = "Colon" ) - public Subcorpus getSubcorpus(); - - @Option( longName = "xmi" ) - public File getXMIDirectory(); - - @Option( longName = "patients" ) - public CommandLine.IntegerRanges getPatients(); - -// @Option( longName = "train-remainders", defaultValue = "0-2" ) -// public CommandLine.IntegerRanges getTrainRemainders(); -// -// @Option( longName = "dev-remainders", defaultValue = "3" ) -// public CommandLine.IntegerRanges getDevRemainders(); -// -// @Option( longName = "test-remainders", defaultValue = "4-5" ) -// public CommandLine.IntegerRanges getTestRemainders(); - - @Option( longName = "train-remainders", defaultValue = "0-3" ) - public CommandLine.IntegerRanges getTrainRemainders(); - - @Option( longName = "dev-remainders", defaultValue = "4-5" ) - public CommandLine.IntegerRanges getDevRemainders(); - - @Option( longName = "test-remainders", defaultValue = "6-7" ) - public CommandLine.IntegerRanges getTestRemainders(); - - @Option( longName = "treebank", defaultToNull = true ) - public File getTreebankDirectory(); - - @Option - public boolean getUseGoldTrees(); - - @Option - public boolean getGrid(); - - @Option - public boolean getPrintErrors(); - - @Option - public boolean getPrintOverlappingSpans(); - - @Option - public boolean getTest(); - - @Option( longName = "kernelParams", defaultToNull = true ) - public String getKernelParams(); - - @Option( defaultToNull = true ) - public String getI2B2Output(); - - @Option( defaultToNull = true ) - public String getAnaforaOutput(); - - @Option - public boolean getSkipTrain(); - - @Option(longName = "skipWrite") - public boolean getSkipDataWriting(); - } - - public static List<Integer> getTrainItems( Options options ) { - List<Integer> patientSets = options.getPatients().getList(); - List<Integer> trainItems = THYMEData.getPatientSets( patientSets, options.getTrainRemainders().getList() ); - if ( options.getTest() ) { - trainItems.addAll( THYMEData.getPatientSets( patientSets, options.getDevRemainders().getList() ) ); - } - return trainItems; - } - - public static List<Integer> getTestItems( Options options ) { - List<Integer> patientSets = options.getPatients().getList(); - List<Integer> testItems; - if ( options.getTest() ) { - testItems = THYMEData.getPatientSets( patientSets, options.getTestRemainders().getList() ); - } else { - testItems = THYMEData.getPatientSets( patientSets, options.getDevRemainders().getList() ); - } - return testItems; - } - - protected File rawTextDirectory; - - protected File xmlDirectory; - - protected XMLFormat xmlFormat; - - protected Subcorpus subcorpus; - - protected File xmiDirectory; - - private boolean xmiExists; - - protected File treebankDirectory; - - protected boolean printErrors = false; - - protected boolean printOverlapping = false; - - protected String i2b2Output = null; - - protected String anaforaOutput = null; - - protected String[] kernelParams; - - public Evaluation_ImplBase( - File baseDirectory, - File rawTextDirectory, - File xmlDirectory, - XMLFormat xmlFormat, - Subcorpus subcorpus, - File xmiDirectory, - File treebankDirectory ) { - super( baseDirectory ); - this.rawTextDirectory = rawTextDirectory; - this.xmlDirectory = xmlDirectory; - this.xmlFormat = xmlFormat; - this.subcorpus = subcorpus; - this.xmiDirectory = xmiDirectory; - this.xmiExists = this.xmiDirectory.exists() && this.xmiDirectory.listFiles().length > 0; - this.treebankDirectory = treebankDirectory; - - this.isTraining = true; - this.badNotes = new HashSet<>(); - URL url = TimeWordsExtractor.class.getResource( LOOKUP_PATH ); - try ( BufferedReader br = new BufferedReader( new FileReader( url.getFile() ) ) ) { - String line; - while ( (line = br.readLine()) != null ) { - badNotes.add( line.trim() ); - } - } catch ( FileNotFoundException e ) { - // TODO Auto-generated catch block - e.printStackTrace(); - } catch ( IOException e ) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - } - - public void setI2B2Output( String outDir ) { - i2b2Output = outDir; - } - - public void prepareXMIsFor( List<Integer> patientSets ) throws Exception { - boolean needsXMIs = false; - for ( File textFile : this.getFilesFor( patientSets ) ) { - if ( !getXMIFile( this.xmiDirectory, textFile ).exists() ) { - needsXMIs = true; - break; - } - } - if ( needsXMIs ) { - CollectionReader reader = this.getCollectionReader( patientSets ); - AnalysisEngine engine = this.getXMIWritingPreprocessorAggregateBuilder().createAggregate(); - SimplePipeline.runPipeline( reader, engine ); - } - this.xmiExists = true; - } - - private List<File> getFilesFor( List<Integer> patientSets ) throws FileNotFoundException { - List<File> files = new ArrayList<>(); - if ( this.xmlFormat == XMLFormat.Anafora ) { - Set<String> ids = new HashSet<>(); - for ( Integer set : patientSets ) { - if ( this.subcorpus == Subcorpus.Colon ) { - ids.add( String.format( "ID%03d", set ) ); - } else if ( this.subcorpus == Subcorpus.DeepPhe ) { - ids.add( String.format( "patient%02d", set ) ); - } else { - ids.add( String.format( "doc%04d", set ) ); - } - } - int filePrefixLen = 5; // Colon: "ID\d{3}" - if ( this.subcorpus == Subcorpus.Brain ) { - filePrefixLen = 7; // Brain: "doc\d{4}" - } else if ( this.subcorpus == Subcorpus.DeepPhe ) { - filePrefixLen = 9; // deepPhe: "patient\d{2}" - } - if ( this.subcorpus == Subcorpus.DeepPhe ) { - for ( File dir : this.xmlDirectory.listFiles() ) { - if ( dir.isDirectory() ) { - if ( ids.contains( dir.getName().substring( 0, filePrefixLen ) ) ) { - File file = new File( dir, dir.getName() ); - if ( file.exists() ) { - files.add( file ); - } else { - LOGGER.warn( "Missing note: " + file ); - } - } - } - } - } else { - for ( String section : THYMEData.SECTIONS ) { - File xmlSubdir = new File( this.xmlDirectory, section ); - for ( File dir : xmlSubdir.listFiles() ) { - if ( dir.isDirectory() ) { - if ( ids.contains( dir.getName().substring( 0, filePrefixLen ) ) ) { - File file = new File( dir, dir.getName() ); - if ( file.exists() ) { - files.add( file ); - } else { - LOGGER.warn( "Missing note: " + file ); - } - } - } - } - } - } - } else if ( this.xmlFormat == XMLFormat.I2B2 ) { - File trainDir = new File( this.xmlDirectory, "training" ); - File testDir = new File( this.xmlDirectory, "test" ); - for ( Integer pt : patientSets ) { - File xmlTrain = new File( trainDir, pt + ".xml" ); - File train = new File( trainDir, pt + ".xml.txt" ); - if ( train.exists() ) { - if ( xmlTrain.exists() ) { - files.add( train ); - } else { - System.err.println( "Text file in training has no corresponding xml -- skipping: " + train ); - } - } - File xmlTest = new File( testDir, pt + ".xml" ); - File test = new File( testDir, pt + ".xml.txt" ); - if ( xmlTest.exists() ) { - if ( test.exists() ) { - files.add( test ); - } else { - throw new FileNotFoundException( "Could not find the test text file -- for cTAKES usage you must copy the text files into the xml directory for the test set." ); - } - } - assert !(train.exists() && test.exists()); - } - } else if ( xmlFormat == XMLFormat.Knowtator ) { - LOGGER.warn( "This is an old annotation format -- please upgrade to using anafora files." ); - for ( Integer set : patientSets ) { - final int setNum = set; - for ( File file : rawTextDirectory.listFiles( new FilenameFilter() { - @Override - public boolean accept( File dir, String name ) { - return name.contains( String.format( "ID%03d", setNum ) ); - } - } ) ) { - // skip hidden files like .svn - if ( !file.isHidden() ) { - files.add( file ); - } - } - } - } else { - LOGGER.error( "Unknown data format -- please specify Anafora, i2b2, or Knowtator format." ); - } - return files; - } - - @Override - protected CollectionReader getCollectionReader( List<Integer> patientSets ) throws Exception { - List<File> collectedFiles = this.getFilesFor( patientSets ); - Collections.sort(collectedFiles); -// for(File file : collectedFiles){ -// System.err.println(file.getName()); -// } - /** + public static HashSet<String> badNotes; + + public static final String GOLD_VIEW_NAME = "GoldView"; + + public static final String PROB_VIEW_NAME = "ProbView"; + + public enum XMLFormat {Knowtator, Anafora, I2B2} + + public enum Subcorpus {Colon, Brain, DeepPhe} + + public static interface Options { + + @Option( longName = "text", defaultToNull = true ) + public File getRawTextDirectory(); + + @Option( longName = "xml" ) + public File getXMLDirectory(); + + @Option( longName = "format", defaultValue = "Anafora" ) + public XMLFormat getXMLFormat(); + + @Option( longName = "subcorpus", defaultValue = "Colon" ) + public Subcorpus getSubcorpus(); + + @Option( longName = "xmi" ) + public File getXMIDirectory(); + + @Option( longName = "patients" ) + public CommandLine.IntegerRanges getPatients(); + + // @Option( longName = "train-remainders", defaultValue = "0-2" ) + // public CommandLine.IntegerRanges getTrainRemainders(); + // + // @Option( longName = "dev-remainders", defaultValue = "3" ) + // public CommandLine.IntegerRanges getDevRemainders(); + // + // @Option( longName = "test-remainders", defaultValue = "4-5" ) + // public CommandLine.IntegerRanges getTestRemainders(); + + @Option( longName = "train-remainders", defaultValue = "0-3" ) + public CommandLine.IntegerRanges getTrainRemainders(); + + @Option( longName = "dev-remainders", defaultValue = "4-5" ) + public CommandLine.IntegerRanges getDevRemainders(); + + @Option( longName = "test-remainders", defaultValue = "6-7" ) + public CommandLine.IntegerRanges getTestRemainders(); + + @Option( longName = "treebank", defaultToNull = true ) + public File getTreebankDirectory(); + + @Option + public boolean getUseGoldTrees(); + + @Option + public boolean getGrid(); + + @Option + public boolean getPrintErrors(); + + @Option + public boolean getPrintOverlappingSpans(); + + @Option + public boolean getTest(); + + @Option( longName = "kernelParams", defaultToNull = true ) + public String getKernelParams(); + + @Option( defaultToNull = true ) + public String getI2B2Output(); + + @Option( defaultToNull = true ) + public String getAnaforaOutput(); + + @Option + public boolean getSkipTrain(); + + @Option(longName = "skipWrite") + public boolean getSkipDataWriting(); + } + + public static List<Integer> getTrainItems( Options options ) { + List<Integer> patientSets = options.getPatients().getList(); + List<Integer> trainItems = THYMEData.getPatientSets( patientSets, options.getTrainRemainders().getList() ); + if ( options.getTest() ) { + trainItems.addAll( THYMEData.getPatientSets( patientSets, options.getDevRemainders().getList() ) ); + } + return trainItems; + } + + public static List<Integer> getTestItems( Options options ) { + List<Integer> patientSets = options.getPatients().getList(); + List<Integer> testItems; + if ( options.getTest() ) { + testItems = THYMEData.getPatientSets( patientSets, options.getTestRemainders().getList() ); + } else { + testItems = THYMEData.getPatientSets( patientSets, options.getDevRemainders().getList() ); + } + return testItems; + } + + protected File rawTextDirectory; + + protected File xmlDirectory; + + protected XMLFormat xmlFormat; + + protected Subcorpus subcorpus; + + protected File xmiDirectory; + + private boolean xmiExists; + + protected File treebankDirectory; + + protected boolean printErrors = false; + + protected boolean printOverlapping = false; + + protected String i2b2Output = null; + + protected String anaforaOutput = null; + + protected String[] kernelParams; + + public Evaluation_ImplBase( + File baseDirectory, + File rawTextDirectory, + File xmlDirectory, + XMLFormat xmlFormat, + Subcorpus subcorpus, + File xmiDirectory, + File treebankDirectory ) { + super( baseDirectory ); + this.rawTextDirectory = rawTextDirectory; + this.xmlDirectory = xmlDirectory; + this.xmlFormat = xmlFormat; + this.subcorpus = subcorpus; + this.xmiDirectory = xmiDirectory; + this.xmiExists = this.xmiDirectory.exists() && this.xmiDirectory.listFiles().length > 0; + this.treebankDirectory = treebankDirectory; + + this.isTraining = true; + this.badNotes = new HashSet<>(); + URL url = TimeWordsExtractor.class.getResource( LOOKUP_PATH ); + try ( BufferedReader br = new BufferedReader( new FileReader( url.getFile() ) ) ) { + String line; + while ( (line = br.readLine()) != null ) { + badNotes.add( line.trim() ); + } + } catch ( FileNotFoundException e ) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch ( IOException e ) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + public void setI2B2Output( String outDir ) { + i2b2Output = outDir; + } + + public void prepareXMIsFor( List<Integer> patientSets ) throws Exception { + boolean needsXMIs = false; + for ( File textFile : this.getFilesFor( patientSets ) ) { + if ( !getXMIFile( this.xmiDirectory, textFile ).exists() ) { + needsXMIs = true; + break; + } + } + if ( needsXMIs ) { + CollectionReader reader = this.getCollectionReader( patientSets ); + AnalysisEngine engine = this.getXMIWritingPreprocessorAggregateBuilder().createAggregate(); + SimplePipeline.runPipeline( reader, engine ); + } + this.xmiExists = true; + } + + private List<File> getFilesFor( List<Integer> patientSets ) throws FileNotFoundException { + List<File> files = new ArrayList<>(); + if ( this.xmlFormat == XMLFormat.Anafora ) { + Set<String> ids = new HashSet<>(); + for ( Integer set : patientSets ) { + if ( this.subcorpus == Subcorpus.Colon ) { + ids.add( String.format( "ID%03d", set ) ); + } else if ( this.subcorpus == Subcorpus.DeepPhe ) { + ids.add( String.format( "patient%02d", set ) ); + } else { + ids.add( String.format( "doc%04d", set ) ); + } + } + int filePrefixLen = 5; // Colon: "ID\d{3}" + if ( this.subcorpus == Subcorpus.Brain ) { + filePrefixLen = 7; // Brain: "doc\d{4}" + } else if ( this.subcorpus == Subcorpus.DeepPhe ) { + filePrefixLen = 9; // deepPhe: "patient\d{2}" + } + if ( this.subcorpus == Subcorpus.DeepPhe ) { + for ( File dir : this.xmlDirectory.listFiles() ) { + if ( dir.isDirectory() ) { + if ( ids.contains( dir.getName().substring( 0, filePrefixLen ) ) ) { + File file = new File( dir, dir.getName() ); + if ( file.exists() ) { + files.add( file ); + } else { + LOGGER.warn( "Missing note: " + file ); + } + } + } + } + } else { + for ( String section : THYMEData.SECTIONS ) { + File xmlSubdir = new File( this.xmlDirectory, section ); + for ( File dir : xmlSubdir.listFiles() ) { + if ( dir.isDirectory() ) { + if ( ids.contains( dir.getName().substring( 0, filePrefixLen ) ) ) { + File file = new File( dir, dir.getName() ); + if ( file.exists() ) { + files.add( file ); + } else { + LOGGER.warn( "Missing note: " + file ); + } + } + } + } + } + } + } else if ( this.xmlFormat == XMLFormat.I2B2 ) { + File trainDir = new File( this.xmlDirectory, "training" ); + File testDir = new File( this.xmlDirectory, "test" ); + for ( Integer pt : patientSets ) { + File xmlTrain = new File( trainDir, pt + ".xml" ); + File train = new File( trainDir, pt + ".xml.txt" ); + if ( train.exists() ) { + if ( xmlTrain.exists() ) { + files.add( train ); + } else { + System.err.println( "Text file in training has no corresponding xml -- skipping: " + train ); + } + } + File xmlTest = new File( testDir, pt + ".xml" ); + File test = new File( testDir, pt + ".xml.txt" ); + if ( xmlTest.exists() ) { + if ( test.exists() ) { + files.add( test ); + } else { + throw new FileNotFoundException( "Could not find the test text file -- for cTAKES usage you must copy the text files into the xml directory for the test set." ); + } + } + assert !(train.exists() && test.exists()); + } + } else if ( xmlFormat == XMLFormat.Knowtator ) { + LOGGER.warn( "This is an old annotation format -- please upgrade to using anafora files." ); + for ( Integer set : patientSets ) { + final int setNum = set; + for ( File file : rawTextDirectory.listFiles( new FilenameFilter() { + @Override + public boolean accept( File dir, String name ) { + return name.contains( String.format( "ID%03d", setNum ) ); + } + } ) ) { + // skip hidden files like .svn + if ( !file.isHidden() ) { + files.add( file ); + } + } + } + } else { + LOGGER.error( "Unknown data format -- please specify Anafora, i2b2, or Knowtator format." ); + } + return files; + } + + @Override + protected CollectionReader getCollectionReader( List<Integer> patientSets ) throws Exception { + List<File> collectedFiles = this.getFilesFor( patientSets ); + Collections.sort(collectedFiles); + // for(File file : collectedFiles){ + // System.err.println(file.getName()); + // } + /** if(isTraining){ final Collection<File> filesToRemove = new HashSet<>(); for ( File xmiFile : collectedFiles ) { @@ -412,938 +412,940 @@ public abstract class Evaluation_ImplBas collectedFiles.removeAll( filesToRemove ); } isTraining = false; - */ - return UriCollectionReader.getCollectionReaderFromFiles( collectedFiles ); - } - - protected AggregateBuilder getPreprocessorAggregateBuilder() throws Exception { - return this.xmiExists - ? this.getXMIReadingPreprocessorAggregateBuilder() - : this.getXMIWritingPreprocessorAggregateBuilder(); - } - - protected AggregateBuilder getXMIReadingPreprocessorAggregateBuilder() throws UIMAException { - AggregateBuilder aggregateBuilder = new AggregateBuilder(); - aggregateBuilder.add( UriToDocumentTextAnnotator.getDescription() ); - aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( - XMIReader.class, - XMIReader.PARAM_XMI_DIRECTORY, - this.xmiDirectory ) ); - return aggregateBuilder; - } - - protected AggregateBuilder getXMIWritingPreprocessorAggregateBuilder() - throws Exception { - AggregateBuilder aggregateBuilder = new AggregateBuilder(); - aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( UriToDocumentTextAnnotatorCtakes.class ) ); - - // read manual annotations into gold view - aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( - ViewCreatorAnnotator.class, - ViewCreatorAnnotator.PARAM_VIEW_NAME, - GOLD_VIEW_NAME ) ); - aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( - ViewTextCopierAnnotator.class, - ViewTextCopierAnnotator.PARAM_SOURCE_VIEW_NAME, - CAS.NAME_DEFAULT_SOFA, - ViewTextCopierAnnotator.PARAM_DESTINATION_VIEW_NAME, - GOLD_VIEW_NAME ) ); - switch ( this.xmlFormat ) { - case Anafora: - if(this.subcorpus == Subcorpus.DeepPhe){ - aggregateBuilder.add( - AnalysisEngineFactory.createEngineDescription(THYMEAnaforaXMLReader.class, - THYMEAnaforaXMLReader.PARAM_ANAFORA_DIRECTORY, - this.xmlDirectory, - THYMEAnaforaXMLReader.PARAM_ANAFORA_XML_SUFFIXES, - new String[]{} ), - CAS.NAME_DEFAULT_SOFA, - GOLD_VIEW_NAME ); - }else{ - aggregateBuilder.add( - THYMEAnaforaXMLReader.getDescription( this.xmlDirectory ), - CAS.NAME_DEFAULT_SOFA, - GOLD_VIEW_NAME ); - } - break; - case Knowtator: - aggregateBuilder.add( - THYMEKnowtatorXMLReader.getDescription( this.xmlDirectory ), - CAS.NAME_DEFAULT_SOFA, - GOLD_VIEW_NAME ); - break; - case I2B2: - aggregateBuilder.add( - I2B2TemporalXMLReader.getDescription( this.xmlDirectory ), - CAS.NAME_DEFAULT_SOFA, - GOLD_VIEW_NAME ); - break; - } - - // identify segments - if(this.subcorpus == Subcorpus.DeepPhe){ - aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(PittHeaderAnnotator.class)); - }else{ - aggregateBuilder - .add( AnalysisEngineFactory.createEngineDescription( SegmentsFromBracketedSectionTagsAnnotator.class ) ); - } - // identify sentences - aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( - SentenceDetector.class, - SentenceDetector.SD_MODEL_FILE_PARAM, - "org/apache/ctakes/core/sentdetect/sd-med-model.zip" ) ); -// aggregateBuilder.add(SentenceDetectorAnnotator.getDescription(FileLocator.locateFile("org/apache/ctakes/core/sentdetect/model.jar").getPath())); - - // identify tokens - aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( TokenizerAnnotatorPTB.class ) ); - // merge some tokens - aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( ContextDependentTokenizerAnnotator.class ) ); - - // identify part-of-speech tags - aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( - POSTagger.class, - TypeSystemDescriptionFactory.createTypeSystemDescription(), - TypePrioritiesFactory.createTypePriorities( Segment.class, Sentence.class, BaseToken.class ), - POSTagger.POS_MODEL_FILE_PARAM, - "org/apache/ctakes/postagger/models/mayo-pos.zip" ) ); - - // identify chunks - aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( - Chunker.class, - Chunker.CHUNKER_MODEL_FILE_PARAM, - FileLocator.locateFile( "org/apache/ctakes/chunker/models/chunker-model.zip" ), - Chunker.CHUNKER_CREATOR_CLASS_PARAM, - DefaultChunkCreator.class ) ); - - // identify UMLS named entities - - // adjust NP in NP NP to span both - aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( - ChunkAdjuster.class, - ChunkAdjuster.PARAM_CHUNK_PATTERN, - new String[] { "NP", "NP" }, - ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN, - 1 ) ); - // adjust NP in NP PP NP to span all three - aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( - ChunkAdjuster.class, - ChunkAdjuster.PARAM_CHUNK_PATTERN, - new String[] { "NP", "PP", "NP" }, - ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN, - 2 ) ); - // add lookup windows for each NP - aggregateBuilder - .add( AnalysisEngineFactory.createEngineDescription( CopyNPChunksToLookupWindowAnnotations.class ) ); - // maximize lookup windows - aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( - OverlapAnnotator.class, - "A_ObjectClass", - LookupWindowAnnotation.class, - "B_ObjectClass", - LookupWindowAnnotation.class, - "OverlapType", - "A_ENV_B", - "ActionType", - "DELETE", - "DeleteAction", - new String[] { "selector=B" } ) ); - // add UMLS on top of lookup windows - aggregateBuilder.add( DefaultJCasTermAnnotator.createAnnotatorDescription() ); - - aggregateBuilder.add( LvgAnnotator.createAnnotatorDescription() ); - - // add dependency parser - aggregateBuilder.add( ClearNLPDependencyParserAE.createAnnotatorDescription() ); - - // add semantic role labeler - aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( ClearNLPSemanticRoleLabelerAE.class ) ); - - // add gold standard parses to gold view, and adjust gold view to correct a few annotation mis-steps - if ( this.treebankDirectory != null ) { - aggregateBuilder.add( THYMETreebankReader.getDescription( this.treebankDirectory ) ); - aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( TimexAnnotationCorrector.class ) ); - } else { - // add ctakes constituency parses to system view - aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( ConstituencyParser.class, - ConstituencyParser.PARAM_MODEL_FILENAME, - "org/apache/ctakes/constituency/parser/models/thyme.bin" ) ); - } - // write out the CAS after all the above annotations - aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( - XMIWriter.class, - XMIWriter.PARAM_XMI_DIRECTORY, - this.xmiDirectory ) ); - - return aggregateBuilder; - } - - public static <T extends Annotation> List<T> selectExact( JCas jCas, Class<T> annotationClass, Segment segment ) { - List<T> annotations = Lists.newArrayList(); - for ( T annotation : JCasUtil.selectCovered( jCas, annotationClass, segment ) ) { - if ( annotation.getClass().equals( annotationClass ) ) { - annotations.add( annotation ); - } - } - return annotations; - } - - public static class CopyNPChunksToLookupWindowAnnotations extends JCasAnnotator_ImplBase { - - @Override - public void process( JCas jCas ) throws AnalysisEngineProcessException { - for ( Chunk chunk : JCasUtil.select( jCas, Chunk.class ) ) { - if ( chunk.getChunkType().equals( "NP" ) ) { - new LookupWindowAnnotation( jCas, chunk.getBegin(), chunk.getEnd() ).addToIndexes(); - } - } - } - } - - public static class RemoveEnclosedLookupWindows extends JCasAnnotator_ImplBase { - - @Override - public void process( JCas jCas ) throws AnalysisEngineProcessException { - List<LookupWindowAnnotation> lws = new ArrayList<>( JCasUtil.select( jCas, LookupWindowAnnotation.class ) ); - // we'll navigate backwards so that as we delete things we shorten the list from the back - for ( int i = lws.size() - 2; i >= 0; i-- ) { - LookupWindowAnnotation lw1 = lws.get( i ); - LookupWindowAnnotation lw2 = lws.get( i + 1 ); - if ( lw1.getBegin() <= lw2.getBegin() && lw1.getEnd() >= lw2.getEnd() ) { - /// lw1 envelops or encloses lw2 - lws.remove( i + 1 ); - lw2.removeFromIndexes(); - } - } - - } - - } - - public static class EntityMentionRemover extends JCasAnnotator_ImplBase { - - @Override - public void process( JCas jCas ) throws AnalysisEngineProcessException { - for ( EntityMention mention : Lists.newArrayList( JCasUtil.select( jCas, EntityMention.class ) ) ) { - mention.removeFromIndexes(); - } - } - } - - public static class EventMentionRemover extends JCasAnnotator_ImplBase { - - @Override - public void process( JCas jCas ) throws AnalysisEngineProcessException { - for ( EventMention mention : Lists.newArrayList( JCasUtil.select( jCas, EventMention.class ) ) ) { - mention.removeFromIndexes(); - } - } - } - - // replace this with SimpleSegmentWithTagsAnnotator if that code ever gets fixed - public static class SegmentsFromBracketedSectionTagsAnnotator extends JCasAnnotator_ImplBase { - private static Pattern SECTION_PATTERN = Pattern.compile( - "(\\[start section id=\"?(.*?)\"?\\]).*?(\\[end section id=\"?(.*?)\"?\\])", - Pattern.DOTALL ); - - @Override - public void process( JCas jCas ) throws AnalysisEngineProcessException { - boolean foundSections = false; - Matcher matcher = SECTION_PATTERN.matcher( jCas.getDocumentText() ); - while ( matcher.find() ) { - Segment segment = new Segment( jCas ); - segment.setBegin( matcher.start() + matcher.group( 1 ).length() ); - segment.setEnd( matcher.end() - matcher.group( 3 ).length() ); - segment.setId( matcher.group( 2 ) ); - segment.addToIndexes(); - foundSections = true; - } - if ( !foundSections ) { - Segment segment = new Segment( jCas ); - segment.setBegin( 0 ); - segment.setEnd( jCas.getDocumentText().length() ); - segment.setId( "SIMPLE_SEGMENT" ); - segment.addToIndexes(); - } - } - } - - /** - * Grabs the document time from the header - */ - public static class PittHeaderAnnotator extends JCasAnnotator_ImplBase { - - /** - * Grabs the document time from the header - * {@inheritDoc} - */ - @Override - public void process( final JCas jcas ) throws AnalysisEngineProcessException { - String docText = jcas.getDocumentText(); - int headerEnd = docText.indexOf("\n", docText.indexOf("[Report de-identified")); - Segment mainSegment = new Segment(jcas, headerEnd+1, docText.length()-1); - mainSegment.setId("SIMPLE_SEGMENT"); - mainSegment.addToIndexes(); - } - } - - static File getXMIFile( File xmiDirectory, File textFile ) { - String fileName = textFile.getName(); - if(!fileName.contains(".xmi")){ - fileName += ".xmi"; - } - return new File( xmiDirectory, fileName);// + ".xmi" - } - - static File getXMIFile( File xmiDirectory, JCas jCas ) throws AnalysisEngineProcessException { - return getXMIFile( xmiDirectory, new File( ViewUriUtil.getURI( jCas ).getPath() ) ); - } - - public static class XMIWriter extends JCasAnnotator_ImplBase { - - public static final String PARAM_XMI_DIRECTORY = "XMIDirectory"; - - @ConfigurationParameter( name = PARAM_XMI_DIRECTORY, mandatory = true ) - private File xmiDirectory; - - @Override - public void initialize( UimaContext context ) throws ResourceInitializationException { - super.initialize( context ); - if ( !this.xmiDirectory.exists() ) { - this.xmiDirectory.mkdirs(); - } - } - - @Override - public void process( JCas jCas ) throws AnalysisEngineProcessException { - File xmiFile = getXMIFile( this.xmiDirectory, jCas ); - try { - FileOutputStream outputStream = new FileOutputStream( xmiFile ); - try { - XmiCasSerializer serializer = new XmiCasSerializer( jCas.getTypeSystem() ); - ContentHandler handler = new XMLSerializer( outputStream, false ).getContentHandler(); - serializer.serialize( jCas.getCas(), handler ); - } finally { - outputStream.close(); - } - } catch ( SAXException e ) { - throw new AnalysisEngineProcessException( e ); - } catch ( IOException e ) { - throw new AnalysisEngineProcessException( e ); - } - } - } - - public static class XMIReader extends JCasAnnotator_ImplBase { - - public static final String PARAM_XMI_DIRECTORY = "XMIDirectory"; - - @ConfigurationParameter( name = PARAM_XMI_DIRECTORY, mandatory = true ) - private File xmiDirectory; - - @Override - public void process( JCas jCas ) throws AnalysisEngineProcessException { - File xmiFile = getXMIFile( this.xmiDirectory, jCas ); - try { - FileInputStream inputStream = new FileInputStream( xmiFile ); - try { - XmiCasDeserializer.deserialize( inputStream, jCas.getCas() ); - } finally { - inputStream.close(); - } - } catch ( SAXException e ) { - throw new AnalysisEngineProcessException( e ); - } catch ( IOException e ) { - throw new AnalysisEngineProcessException( e ); - } - } - } - - public static class TimexAnnotationCorrector extends JCasAnnotator_ImplBase { - @Override - public void process( JCas jCas ) throws AnalysisEngineProcessException { - JCas goldView, systemView; - try { - goldView = jCas.getView( GOLD_VIEW_NAME ); - systemView = jCas.getView( CAS.NAME_DEFAULT_SOFA ); - } catch ( CASException e ) { - e.printStackTrace(); - throw new AnalysisEngineProcessException(); - } - for ( TimeMention mention : JCasUtil.select( goldView, TimeMention.class ) ) { - // for each time expression, get the treebank node with the same span. - List<TreebankNode> nodes = JCasUtil.selectCovered( systemView, TreebankNode.class, mention ); - TreebankNode sameSpanNode = null; - for ( TreebankNode node : nodes ) { - if ( node.getBegin() == mention.getBegin() && node.getEnd() == mention.getEnd() ) { - sameSpanNode = node; - break; - } - } - if ( sameSpanNode != null ) { - // look at node at the position of the timex3. - if ( sameSpanNode.getNodeType().equals( "PP" ) ) { - // if it is a PP it should be moved down to the NP - int numChildren = sameSpanNode.getChildren().size(); - if ( numChildren == 2 && sameSpanNode.getChildren( 0 ).getNodeType().equals( "IN" ) && - sameSpanNode.getChildren( 1 ).getNodeType().equals( "NP" ) ) { - // move the time span to this node: - TreebankNode mentionNode = sameSpanNode.getChildren( numChildren - 1 ); - mention.setBegin( mentionNode.getBegin() ); - mention.setEnd( mentionNode.getEnd() ); - } - } - } else { - // if there is no matching tree span, see if the DT to the left would help. - // now adjust for missing DT to the left - List<TerminalTreebankNode> precedingPreterms = JCasUtil - .selectPreceding( systemView, TerminalTreebankNode.class, mention, 1 ); - if ( precedingPreterms != null && precedingPreterms.size() == 1 ) { - TerminalTreebankNode leftTerm = precedingPreterms.get( 0 ); - if ( leftTerm.getNodeType().equals( "DT" ) ) { - // now see if adding this would make it match a tree - List<TreebankNode> matchingNodes = JCasUtil - .selectCovered( systemView, TreebankNode.class, leftTerm.getBegin(), mention.getEnd() ); - for ( TreebankNode node : matchingNodes ) { - if ( node.getBegin() == leftTerm.getBegin() && node.getEnd() == mention.getEnd() ) { - sameSpanNode = node; - break; - } - } - if ( sameSpanNode != null ) { - // adding the DT to the left of th emention made it match a tree: - System.err.println( - "Adding DT: " + leftTerm.getCoveredText() + " to TIMEX: " + mention.getCoveredText() ); - mention.setBegin( leftTerm.getBegin() ); - } - } - } - } - } - } - } - - - public static class CopyFromGold extends JCasAnnotator_ImplBase { - - public static AnalysisEngineDescription getDescription( Class<?>... classes ) - throws ResourceInitializationException { - return AnalysisEngineFactory.createEngineDescription( - CopyFromGold.class, - CopyFromGold.PARAM_ANNOTATION_CLASSES, - classes ); - } - - public static final String PARAM_ANNOTATION_CLASSES = "AnnotationClasses"; - - @ConfigurationParameter( name = PARAM_ANNOTATION_CLASSES, mandatory = true ) - private Class<? extends TOP>[] annotationClasses; - - @Override - public void process( JCas jCas ) throws AnalysisEngineProcessException { - JCas goldView, systemView; - try { - goldView = jCas.getView( GOLD_VIEW_NAME ); - systemView = jCas.getView( CAS.NAME_DEFAULT_SOFA ); - } catch ( CASException e ) { - throw new AnalysisEngineProcessException( e ); - } - for ( Class<? extends TOP> annotationClass : this.annotationClasses ) { - for ( TOP annotation : Lists.newArrayList( JCasUtil.select( systemView, annotationClass ) ) ) { - if ( annotation.getClass().equals( annotationClass ) ) { - annotation.removeFromIndexes(); - } - } - } - CasCopier copier = new CasCopier( goldView.getCas(), systemView.getCas() ); - Feature sofaFeature = jCas.getTypeSystem().getFeatureByFullName( CAS.FEATURE_FULL_NAME_SOFA ); - for ( Class<? extends TOP> annotationClass : this.annotationClasses ) { - for ( TOP annotation : JCasUtil.select( goldView, annotationClass ) ) { - TOP copy = (TOP)copier.copyFs( annotation ); - if ( copy instanceof Annotation ) { - copy.setFeatureValue( sofaFeature, systemView.getSofa() ); - } - copy.addToIndexes( systemView ); - } - } - } - } - - public static class CopyFromSystem extends JCasAnnotator_ImplBase { - - public static AnalysisEngineDescription getDescription( Class<?>... classes ) - throws ResourceInitializationException { - return AnalysisEngineFactory.createEngineDescription( - CopyFromSystem.class, - CopyFromSystem.PARAM_ANNOTATION_CLASSES, - classes ); - } - - public static final String PARAM_ANNOTATION_CLASSES = "AnnotationClasses"; - - @ConfigurationParameter( name = PARAM_ANNOTATION_CLASSES, mandatory = true ) - private Class<? extends TOP>[] annotationClasses; - - @Override - public void process( JCas jCas ) throws AnalysisEngineProcessException { - JCas goldView, systemView; - try { - goldView = jCas.getView( GOLD_VIEW_NAME ); - systemView = jCas.getView( CAS.NAME_DEFAULT_SOFA ); - } catch ( CASException e ) { - throw new AnalysisEngineProcessException( e ); - } - for ( Class<? extends TOP> annotationClass : this.annotationClasses ) { - for ( TOP annotation : Lists.newArrayList( JCasUtil.select( goldView, annotationClass ) ) ) { - if ( annotation.getClass().equals( annotationClass ) ) { - annotation.removeFromIndexes(); - } - } - } - CasCopier copier = new CasCopier( systemView.getCas(), goldView.getCas() ); - Feature sofaFeature = jCas.getTypeSystem().getFeatureByFullName( CAS.FEATURE_FULL_NAME_SOFA ); - for ( Class<? extends TOP> annotationClass : this.annotationClasses ) { - for ( TOP annotation : JCasUtil.select( systemView, annotationClass ) ) { - TOP copy = (TOP)copier.copyFs( annotation ); - if ( copy instanceof Annotation ) { - copy.setFeatureValue( sofaFeature, goldView.getSofa() ); - } - copy.addToIndexes( goldView ); - } - } - } - } - - /* - * The following class overrides a ClearTK utility annotator class for reading - * a text file into a JCas. The code is copy/pasted so that one tiny modification - * can be made for this corpus -- replace a single odd character (0xc) with a - * space since it trips up xml output. - */ - public static class UriToDocumentTextAnnotatorCtakes extends UriToDocumentTextAnnotator { - - @Override - public void process( JCas jCas ) throws AnalysisEngineProcessException { - URI uri = ViewUriUtil.getURI( jCas ); - String content; - - try { - content = CharStreams.toString( new InputStreamReader( uri.toURL().openStream() ) ); - content = content.replace( (char)0xc, ' ' ); - jCas.setSofaDataString( content, "text/plain" ); - } catch ( MalformedURLException e ) { - throw new AnalysisEngineProcessException( e ); - } catch ( IOException e ) { - throw new AnalysisEngineProcessException( e ); - } - } - } - - public static class WriteI2B2XML extends JCasAnnotator_ImplBase { - public static final String PARAM_OUTPUT_DIR = "PARAM_OUTPUT_DIR"; - @ConfigurationParameter( mandatory = true, description = "Output directory to write xml files to.", name = PARAM_OUTPUT_DIR ) - protected String outputDir; - - @Override - public void process( JCas jcas ) throws AnalysisEngineProcessException { - try { - // get the output file name from the input file name and output directory. - File outDir = new File( outputDir ); - if ( !outDir.exists() ) { - outDir.mkdirs(); - } - File inFile = new File( ViewUriUtil.getURI( jcas ) ); - String outFile = inFile.getName().replace( ".txt", "" ); - - // build the xml - DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); - DocumentBuilder docBuilder = docFactory.newDocumentBuilder(); - Document doc = docBuilder.newDocument(); - Element rootElement = doc.createElement( "ClinicalNarrativeTemporalAnnotation" ); - Element textElement = doc.createElement( "TEXT" ); - Element tagsElement = doc.createElement( "TAGS" ); - textElement.setTextContent( jcas.getDocumentText() ); - rootElement.appendChild( textElement ); - rootElement.appendChild( tagsElement ); - doc.appendChild( rootElement ); - - Map<IdentifiedAnnotation, String> argToId = new HashMap<>(); - int id = 0; - for ( TimeMention timex : JCasUtil.select( jcas, TimeMention.class ) ) { - Element timexElement = doc.createElement( "TIMEX3" ); - String timexID = "T" + id; - id++; - argToId.put( timex, timexID ); - timexElement.setAttribute( "id", timexID ); - timexElement.setAttribute( "start", String.valueOf( timex.getBegin() + 1 ) ); - timexElement.setAttribute( "end", String.valueOf( timex.getEnd() + 1 ) ); - timexElement.setAttribute( "text", timex.getCoveredText() ); - timexElement.setAttribute( "type", "NA" ); - timexElement.setAttribute( "val", "NA" ); - timexElement.setAttribute( "mod", "NA" ); - tagsElement.appendChild( timexElement ); - } - - id = 0; - for ( EventMention event : JCasUtil.select( jcas, EventMention.class ) ) { - if ( event.getClass().equals( EventMention.class ) ) { - // this ensures we are only looking at THYME events and not ctakes-dictionary-lookup events - Element eventEl = doc.createElement( "EVENT" ); - String eventID = "E" + id; - id++; - argToId.put( event, eventID ); - eventEl.setAttribute( "id", eventID ); - eventEl.setAttribute( "start", String.valueOf( event.getBegin() + 1 ) ); - eventEl.setAttribute( "end", String.valueOf( event.getEnd() + 1 ) ); - eventEl.setAttribute( "text", event.getCoveredText() ); - eventEl.setAttribute( "modality", "NA" ); - eventEl.setAttribute( "polarity", "NA" ); - eventEl.setAttribute( "type", "NA" ); - tagsElement.appendChild( eventEl ); - } - } - - id = 0; - for ( TemporalTextRelation rel : JCasUtil.select( jcas, TemporalTextRelation.class ) ) { - Element linkEl = doc.createElement( "TLINK" ); - String linkID = "TL" + id; - id++; - linkEl.setAttribute( "id", linkID ); - Annotation arg1 = rel.getArg1().getArgument(); - linkEl.setAttribute( "fromID", argToId.get( arg1 ) ); - linkEl.setAttribute( "fromText", arg1.getCoveredText() ); - Annotation arg2 = rel.getArg2().getArgument(); - if ( arg2 != null ) { - linkEl.setAttribute( "toID", argToId.get( arg2 ) ); - linkEl.setAttribute( "toText", arg2.getCoveredText() ); - } else { - linkEl.setAttribute( "toID", "Discharge" ); - linkEl.setAttribute( "toText", "Discharge" ); - } - linkEl.setAttribute( "type", rel.getCategory() ); - tagsElement.appendChild( linkEl ); - } - - // boilerplate xml-writing code: - TransformerFactory transformerFactory = TransformerFactory.newInstance(); - Transformer transformer = transformerFactory.newTransformer(); - transformer.setOutputProperty( OutputKeys.INDENT, "yes" ); - transformer.setOutputProperty( OutputKeys.METHOD, "xml" ); - DOMSource source = new DOMSource( doc ); - StreamResult result = new StreamResult( new File( outputDir, outFile ) ); - transformer.transform( source, result ); - } catch ( ParserConfigurationException e ) { - e.printStackTrace(); - throw new AnalysisEngineProcessException( e ); - } catch ( TransformerConfigurationException e ) { - e.printStackTrace(); - throw new AnalysisEngineProcessException( e ); - } catch ( TransformerException e ) { - e.printStackTrace(); - throw new AnalysisEngineProcessException( e ); - } - - } - - } - - public static class WriteAnaforaXML extends JCasAnnotator_ImplBase { - public static final String PARAM_OUTPUT_DIR = "PARAM_OUTPUT_DIR"; - @ConfigurationParameter( mandatory = true, description = "Output directory to write xml files to.", name = PARAM_OUTPUT_DIR ) - protected String outputDir; - - public static final String PARAM_PROB_VIEW = "ProbView"; - @ConfigurationParameter(name=PARAM_PROB_VIEW, mandatory=false) - public String probViewname = null; - - @Override - public void process( JCas jcas ) throws AnalysisEngineProcessException { - try { - // get the output file name from the input file name and output directory. - - File inFile = new File( ViewUriUtil.getURI( jcas ) ); - String outFile = inFile.getName().replace( ".txt", "" ); - File outDir = new File( outputDir, outFile ); - if ( !outDir.exists() ) { - outDir.mkdirs(); - } - - - // get maps from ids to entities and relations: - JCas probView = (probViewname == null ? null : jcas.getView(probViewname)); - Map<Integer, List<EventMention>> mentions = probViewname == null? null : getMentionIdMap(jcas, probView); - Map<String, List<TemporalTextRelation>> rels = probViewname == null ? null : getRelationIdMap(probView); - - // build the xml - DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); - DocumentBuilder docBuilder = docFactory.newDocumentBuilder(); - Document doc = docBuilder.newDocument(); - - Element rootElement = doc.createElement( "data" ); - - //info element - Element infoElement = doc.createElement( "info" ); - Element saveTime = doc.createElement( "savetime" ); - saveTime.setTextContent( "2015-0123-10:21" ); - Element progress = doc.createElement( "progress" ); - progress.setTextContent( "completed" ); - infoElement.appendChild( saveTime ); - infoElement.appendChild( progress ); - - //schema element - Element schema = doc.createElement( "schema" ); - schema.setAttribute( "path", "./" ); - schema.setAttribute( "protocol", "file" ); - schema.setTextContent( "temporal-schema.xml" ); - - Element annoElement = doc.createElement( "annotations" ); - Map<IdentifiedAnnotation, String> argToId = new HashMap<>(); - int id = 1; - for ( EventMention event : JCasUtil.select( jcas, EventMention.class ) ) { - if ( event.getClass().equals( EventMention.class ) ) { - // this ensures we are only looking at THYME events and not ctakes-dictionary-lookup events - Element eventEl = doc.createElement( "entity" ); - String eventID = id + "@e@" + outFile + "@system"; - id++; - argToId.put( event, eventID ); - Element idE = doc.createElement( "id" ); - idE.setTextContent( eventID ); - Element spanE = doc.createElement( "span" ); - spanE.setTextContent( String.valueOf( event.getBegin() ) + "," + String.valueOf( event.getEnd() ) ); - Element typeE = doc.createElement( "type" ); - typeE.setTextContent( "EVENT" ); - Element parentTE = doc.createElement( "parentsType" ); - parentTE.setTextContent( "TemporalEntities" ); - //add properties - Element property = doc.createElement( "properties" ); - Element docTimeRE = doc.createElement( "DocTimeRel" ); - String dtrContent = null; - if(probViewname == null){ - dtrContent = event.getEvent().getProperties().getDocTimeRel(); - }else{ - StringBuffer buff = new StringBuffer(); - for(EventMention probMention : mentions.get(event.getId())){ - buff.append(probMention.getEvent().getProperties().getDocTimeRel()); - buff.append(':'); - buff.append(probMention.getConfidence()); - buff.append("::"); - } - dtrContent = buff.substring(0, buff.length()-2); - } - docTimeRE.setTextContent( dtrContent ); - Element eventTypeE = doc.createElement( "Type" ); - eventTypeE.setTextContent( "N/A" ); - Element degreeE = doc.createElement( "Degree" ); - degreeE.setTextContent( "N/A" ); - Element polarityE = doc.createElement( "Polarity" ); - String polarity = "UNKNOWN"; - int polarityInt = event.getPolarity(); - if ( polarityInt == CONST.NE_POLARITY_NEGATION_ABSENT ) { - polarity = "POS"; - } else if ( polarityInt == CONST.NE_POLARITY_NEGATION_PRESENT ) { - polarity = "NEG"; - } - polarityE.setTextContent( polarity ); - Element ctexModE = doc.createElement( "ContextualModality" ); - ctexModE.setTextContent( event.getEvent().getProperties().getContextualModality() ); - Element ctexAspE = doc.createElement( "ContextualAspect" ); - ctexAspE.setTextContent( event.getEvent().getProperties().getContextualAspect() ); - Element permE = doc.createElement( "Permanence" ); - permE.setTextContent( "UNDETERMINED" ); - property.appendChild( docTimeRE ); - property.appendChild( polarityE ); - property.appendChild( degreeE ); - property.appendChild( eventTypeE ); - property.appendChild( ctexModE ); - property.appendChild( ctexAspE ); - property.appendChild( permE ); - eventEl.appendChild( idE ); - eventEl.appendChild( spanE ); - eventEl.appendChild( typeE ); - eventEl.appendChild( parentTE ); - eventEl.appendChild( property ); - annoElement.appendChild( eventEl ); - } - } - for ( TimeMention timex : JCasUtil.select( jcas, TimeMention.class ) ) { - Element timexElement = doc.createElement( "entity" ); - String timexID = id + "@e@" + outFile + "@system"; - id++;//18@e@ID006_clinic_016@gold - argToId.put( timex, timexID ); - Element idE = doc.createElement( "id" ); - idE.setTextContent( timexID ); - Element spanE = doc.createElement( "span" ); - spanE.setTextContent( String.valueOf( timex.getBegin() ) + "," + String.valueOf( timex.getEnd() ) ); - Element typeE = doc.createElement( "type" ); - Element parentTE = doc.createElement( "parentsType" ); - parentTE.setTextContent( "TemporalEntities" ); - //add properties - Element property = doc.createElement( "properties" ); - String timeClass = timex.getTimeClass(); - - //add normalized timex - String value = Utils.getTimexMLValue(timex.getCoveredText()); - if(value != null){ - property.setTextContent( value ); - } - - if ( timeClass!=null && (timeClass.equals( "DOCTIME" ) || timeClass.equals( "SECTIONTIME" ) ) ) { - typeE.setTextContent( timeClass ); - property.setTextContent( "" ); - } else { - typeE.setTextContent( "TIMEX3" ); - Element classE = doc.createElement( "Class" ); - classE.setTextContent( timeClass ); - property.appendChild( classE ); - } - - timexElement.appendChild( idE ); - timexElement.appendChild( spanE ); - timexElement.appendChild( typeE ); - timexElement.appendChild( property ); - annoElement.appendChild( timexElement ); - } - - - id = 1; - if(probViewname == null){ - for ( TemporalTextRelation rel : JCasUtil.select( jcas, TemporalTextRelation.class ) ) { - Annotation arg1 = rel.getArg1().getArgument(); - Annotation arg2 = rel.getArg2().getArgument(); - String arg1Content = argToId.get( arg1 ); - String arg2Content = argToId.get( arg2 ); - String relContent = rel.getCategory(); - annoElement.appendChild(addRelationElement(doc, id, relContent, arg1Content, arg2Content, outFile)); - id++; - } - }else{ - // need to keep track of which relations we've printed since they don't get grouped in the CAS - for(String key : rels.keySet()){ - String arg1Content = null; - String arg2Content = null; - StringBuffer buff = new StringBuffer(); - for(TemporalTextRelation probRel : rels.get(key)){ - buff.append(probRel.getCategory()); - buff.append(':'); - buff.append(probRel.getConfidence()); - buff.append("::"); - if(arg1Content == null){ - arg1Content = argToId.get(probRel.getArg1().getArgument()); - arg2Content = argToId.get(probRel.getArg2().getArgument()); - } - } - String relContent = buff.substring(0, buff.length()-2); - annoElement.appendChild(addRelationElement(doc, id, relContent, arg1Content, arg2Content, outFile)); - id++; - } - } - - rootElement.appendChild( infoElement ); - rootElement.appendChild( schema ); - rootElement.appendChild( annoElement ); - doc.appendChild( rootElement ); - - // boilerplate xml-writing code: - TransformerFactory transformerFactory = TransformerFactory.newInstance(); - Transformer transformer = transformerFactory.newTransformer(); - transformer.setOutputProperty( OutputKeys.INDENT, "yes" ); - transformer.setOutputProperty( OutputKeys.METHOD, "xml" ); - DOMSource source = new DOMSource( doc ); - StreamResult result = new StreamResult( new File( outDir, outFile + ".xml" ) ); - transformer.transform( source, result ); - } catch ( ParserConfigurationException e ) { - e.printStackTrace(); - throw new AnalysisEngineProcessException( e ); - } catch ( TransformerConfigurationException e ) { - e.printStackTrace(); - throw new AnalysisEngineProcessException( e ); - } catch ( TransformerException e ) { - e.printStackTrace(); - throw new AnalysisEngineProcessException( e ); - } catch (CASException e) { - e.printStackTrace(); - throw new AnalysisEngineProcessException( e ); - } - - } - - private static Element addRelationElement(Document doc, int id, String relContent, String arg1Content, String arg2Content, String outFile){ - Element linkEl = doc.createElement( "relation" ); - String linkID = id + "@r@" + outFile + "@system"; - - Element idE = doc.createElement( "id" ); - idE.setTextContent( linkID ); - Element typeE = doc.createElement( "type" ); - typeE.setTextContent( "TLINK" ); - Element parentTE = doc.createElement( "parentsType" ); - parentTE.setTextContent( "TemporalRelations" ); - //add properties - Element property = doc.createElement( "properties" ); - - Element sourceE = doc.createElement( "Source" ); - sourceE.setTextContent( arg1Content ); - Element relTypeE = doc.createElement( "Type" ); - - relTypeE.setTextContent( relContent ); - Element targetE = doc.createElement( "Target" ); - targetE.setTextContent( arg2Content ); - - property.appendChild( sourceE ); - property.appendChild( relTypeE ); - property.appendChild( targetE ); - - linkEl.appendChild( idE ); - linkEl.appendChild( typeE ); - linkEl.appendChild( parentTE ); - linkEl.appendChild( property ); - return linkEl; - } - - private static Map<Integer, List<EventMention>> getMentionIdMap(JCas jcas, JCas probView){ - HashMap<Integer, List<EventMention>> map = new HashMap<>(); - - for(EventMention mention : JCasUtil.select(jcas, EventMention.class)){ - List<EventMention> variations = new ArrayList<>(); - for(EventMention probMention : JCasUtil.select(probView, EventMention.class)){ - if(mention.getId() == probMention.getId()){ - variations.add(probMention); - } - } - map.put(mention.getId(), variations); - } - return map; - } - - private static Map<String, List<TemporalTextRelation>> getRelationIdMap(JCas probView){ - HashMap<String, List<TemporalTextRelation>> map = new HashMap<>(); - - for(TemporalTextRelation probRel : JCasUtil.select(probView, TemporalTextRelation.class)){ - String idStr = getRelationId(probRel); - if(!map.containsKey(idStr)){ - map.put(idStr, new ArrayList<TemporalTextRelation>()); - } - List<TemporalTextRelation> variations = map.get(idStr); - variations.add(probRel); - } - - return map; - } - } - public static String getRelationId(TemporalTextRelation rel){ - StringBuffer buffer = new StringBuffer(); - if(rel.getArg1().getArgument().getClass().getSimpleName().equals("EventMention")){ - buffer.append('e'); - }else{ - buffer.append('t'); - } - buffer.append(((IdentifiedAnnotation)rel.getArg1().getArgument()).getId()); - buffer.append(':'); - if(rel.getArg2().getArgument().getClass().getSimpleName().equals("EventMention")){ - buffer.append('e'); - }else{ - buffer.append('t'); - } - buffer.append(((IdentifiedAnnotation)rel.getArg2().getArgument()).getId()); - return buffer.toString(); - } + */ + return UriCollectionReader.getCollectionReaderFromFiles( collectedFiles ); + } + + protected AggregateBuilder getPreprocessorAggregateBuilder() throws Exception { + return this.xmiExists + ? this.getXMIReadingPreprocessorAggregateBuilder() + : this.getXMIWritingPreprocessorAggregateBuilder(); + } + + protected AggregateBuilder getXMIReadingPreprocessorAggregateBuilder() throws UIMAException { + AggregateBuilder aggregateBuilder = new AggregateBuilder(); + aggregateBuilder.add( UriToDocumentTextAnnotator.getDescription() ); + aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( + XMIReader.class, + XMIReader.PARAM_XMI_DIRECTORY, + this.xmiDirectory ) ); + return aggregateBuilder; + } + + protected AggregateBuilder getXMIWritingPreprocessorAggregateBuilder() + throws Exception { + AggregateBuilder aggregateBuilder = new AggregateBuilder(); + aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( UriToDocumentTextAnnotatorCtakes.class ) ); + + // read manual annotations into gold view + aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( + ViewCreatorAnnotator.class, + ViewCreatorAnnotator.PARAM_VIEW_NAME, + GOLD_VIEW_NAME ) ); + aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( + ViewTextCopierAnnotator.class, + ViewTextCopierAnnotator.PARAM_SOURCE_VIEW_NAME, + CAS.NAME_DEFAULT_SOFA, + ViewTextCopierAnnotator.PARAM_DESTINATION_VIEW_NAME, + GOLD_VIEW_NAME ) ); + switch ( this.xmlFormat ) { + case Anafora: + if(this.subcorpus == Subcorpus.DeepPhe){ + aggregateBuilder.add( + AnalysisEngineFactory.createEngineDescription(THYMEAnaforaXMLReader.class, + THYMEAnaforaXMLReader.PARAM_ANAFORA_DIRECTORY, + this.xmlDirectory, + THYMEAnaforaXMLReader.PARAM_ANAFORA_XML_SUFFIXES, + new String[]{} ), + CAS.NAME_DEFAULT_SOFA, + GOLD_VIEW_NAME ); + }else{ + aggregateBuilder.add( + THYMEAnaforaXMLReader.getDescription( this.xmlDirectory ), + CAS.NAME_DEFAULT_SOFA, + GOLD_VIEW_NAME ); + } + break; + case Knowtator: + aggregateBuilder.add( + THYMEKnowtatorXMLReader.getDescription( this.xmlDirectory ), + CAS.NAME_DEFAULT_SOFA, + GOLD_VIEW_NAME ); + break; + case I2B2: + aggregateBuilder.add( + I2B2TemporalXMLReader.getDescription( this.xmlDirectory ), + CAS.NAME_DEFAULT_SOFA, + GOLD_VIEW_NAME ); + break; + } + + // identify segments + if(this.subcorpus == Subcorpus.DeepPhe){ + aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(PittHeaderAnnotator.class)); + }else{ + aggregateBuilder + .add( AnalysisEngineFactory.createEngineDescription( SegmentsFromBracketedSectionTagsAnnotator.class ) ); + } + // identify sentences + aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( + SentenceDetector.class, + SentenceDetector.SD_MODEL_FILE_PARAM, + "org/apache/ctakes/core/sentdetect/sd-med-model.zip" ) ); + // aggregateBuilder.add(SentenceDetectorAnnotator.getDescription(FileLocator.locateFile("org/apache/ctakes/core/sentdetect/model.jar").getPath())); + + // identify tokens + aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( TokenizerAnnotatorPTB.class ) ); + // merge some tokens + aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( ContextDependentTokenizerAnnotator.class ) ); + + // identify part-of-speech tags + aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( + POSTagger.class, + TypeSystemDescriptionFactory.createTypeSystemDescription(), + TypePrioritiesFactory.createTypePriorities( Segment.class, Sentence.class, BaseToken.class ), + POSTagger.POS_MODEL_FILE_PARAM, + "org/apache/ctakes/postagger/models/mayo-pos.zip" ) ); + + // identify chunks + aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( + Chunker.class, + Chunker.CHUNKER_MODEL_FILE_PARAM, + FileLocator.locateFile( "org/apache/ctakes/chunker/models/chunker-model.zip" ), + Chunker.CHUNKER_CREATOR_CLASS_PARAM, + DefaultChunkCreator.class ) ); + + // identify UMLS named entities + + // adjust NP in NP NP to span both + aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( + ChunkAdjuster.class, + ChunkAdjuster.PARAM_CHUNK_PATTERN, + new String[] { "NP", "NP" }, + ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN, + 1 ) ); + // adjust NP in NP PP NP to span all three + aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( + ChunkAdjuster.class, + ChunkAdjuster.PARAM_CHUNK_PATTERN, + new String[] { "NP", "PP", "NP" }, + ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN, + 2 ) ); + // add lookup windows for each NP + aggregateBuilder + .add( AnalysisEngineFactory.createEngineDescription( CopyNPChunksToLookupWindowAnnotations.class ) ); + // maximize lookup windows + aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( + OverlapAnnotator.class, + "A_ObjectClass", + LookupWindowAnnotation.class, + "B_ObjectClass", + LookupWindowAnnotation.class, + "OverlapType", + "A_ENV_B", + "ActionType", + "DELETE", + "DeleteAction", + new String[] { "selector=B" } ) ); + // add UMLS on top of lookup windows + aggregateBuilder.add( DefaultJCasTermAnnotator.createAnnotatorDescription() ); + + aggregateBuilder.add( LvgAnnotator.createAnnotatorDescription() ); + + // add dependency parser + aggregateBuilder.add( ClearNLPDependencyParserAE.createAnnotatorDescription() ); + + // add semantic role labeler + aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( ClearNLPSemanticRoleLabelerAE.class ) ); + + // add gold standard parses to gold view, and adjust gold view to correct a few annotation mis-steps + if ( this.treebankDirectory != null ) { + aggregateBuilder.add( THYMETreebankReader.getDescription( this.treebankDirectory ) ); + aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( TimexAnnotationCorrector.class ) ); + } else { + // add ctakes constituency parses to system view + aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( ConstituencyParser.class, + ConstituencyParser.PARAM_MODEL_FILENAME, + "org/apache/ctakes/constituency/parser/models/thyme.bin" ) ); + } + // write out the CAS after all the above annotations + aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( + XMIWriter.class, + XMIWriter.PARAM_XMI_DIRECTORY, + this.xmiDirectory ) ); + + return aggregateBuilder; + } + + public static <T extends Annotation> List<T> selectExact( JCas jCas, Class<T> annotationClass, Segment segment ) { + List<T> annotations = Lists.newArrayList(); + for ( T annotation : JCasUtil.selectCovered( jCas, annotationClass, segment ) ) { + if ( annotation.getClass().equals( annotationClass ) ) { + annotations.add( annotation ); + } + } + return annotations; + } + + public static class CopyNPChunksToLookupWindowAnnotations extends JCasAnnotator_ImplBase { + + @Override + public void process( JCas jCas ) throws AnalysisEngineProcessException { + for ( Chunk chunk : JCasUtil.select( jCas, Chunk.class ) ) { + if ( chunk.getChunkType().equals( "NP" ) ) { + new LookupWindowAnnotation( jCas, chunk.getBegin(), chunk.getEnd() ).addToIndexes(); + } + } + } + } + + public static class RemoveEnclosedLookupWindows extends JCasAnnotator_ImplBase { + + @Override + public void process( JCas jCas ) throws AnalysisEngineProcessException { + List<LookupWindowAnnotation> lws = new ArrayList<>( JCasUtil.select( jCas, LookupWindowAnnotation.class ) ); + // we'll navigate backwards so that as we delete things we shorten the list from the back + for ( int i = lws.size() - 2; i >= 0; i-- ) { + LookupWindowAnnotation lw1 = lws.get( i ); + LookupWindowAnnotation lw2 = lws.get( i + 1 ); + if ( lw1.getBegin() <= lw2.getBegin() && lw1.getEnd() >= lw2.getEnd() ) { + /// lw1 envelops or encloses lw2 + lws.remove( i + 1 ); + lw2.removeFromIndexes(); + } + } + + } + + } + + public static class EntityMentionRemover extends JCasAnnotator_ImplBase { + + @Override + public void process( JCas jCas ) throws AnalysisEngineProcessException { + for ( EntityMention mention : Lists.newArrayList( JCasUtil.select( jCas, EntityMention.class ) ) ) { + mention.removeFromIndexes(); + } + } + } + + public static class EventMentionRemover extends JCasAnnotator_ImplBase { + + @Override + public void process( JCas jCas ) throws AnalysisEngineProcessException { + for ( EventMention mention : Lists.newArrayList( JCasUtil.select( jCas, EventMention.class ) ) ) { + mention.removeFromIndexes(); + } + } + } + + // replace this with SimpleSegmentWithTagsAnnotator if that code ever gets fixed + public static class SegmentsFromBracketedSectionTagsAnnotator extends JCasAnnotator_ImplBase { + private static Pattern SECTION_PATTERN = Pattern.compile( + "(\\[start section id=\"?(.*?)\"?\\]).*?(\\[end section id=\"?(.*?)\"?\\])", + Pattern.DOTALL ); + + @Override + public void process( JCas jCas ) throws AnalysisEngineProcessException { + boolean foundSections = false; + Matcher matcher = SECTION_PATTERN.matcher( jCas.getDocumentText() ); + while ( matcher.find() ) { + Segment segment = new Segment( jCas ); + segment.setBegin( matcher.start() + matcher.group( 1 ).length() ); + segment.setEnd( matcher.end() - matcher.group( 3 ).length() ); + segment.setId( matcher.group( 2 ) ); + segment.addToIndexes(); + foundSections = true; + } + if ( !foundSections ) { + Segment segment = new Segment( jCas ); + segment.setBegin( 0 ); + segment.setEnd( jCas.getDocumentText().length() ); + segment.setId( "SIMPLE_SEGMENT" ); + segment.addToIndexes(); + } + } + } + + /** + * Grabs the document time from the header + */ + public static class PittHeaderAnnotator extends JCasAnnotator_ImplBase { + + /** + * Grabs the document time from the header + * {@inheritDoc} + */ + @Override + public void process( final JCas jcas ) throws AnalysisEngineProcessException { + String docText = jcas.getDocumentText(); + int headerEnd = docText.indexOf("\n", docText.indexOf("[Report de-identified")); + Segment mainSegment = new Segment(jcas, headerEnd+1, docText.length()-1); + mainSegment.setId("SIMPLE_SEGMENT"); + mainSegment.addToIndexes(); + } + } + + static File getXMIFile( File xmiDirectory, File textFile ) { + String fileName = textFile.getName(); + if(!fileName.contains(".xmi")){ + fileName += ".xmi"; + } + return new File( xmiDirectory, fileName);// + ".xmi" + } + + static File getXMIFile( File xmiDirectory, JCas jCas ) throws AnalysisEngineProcessException { + return getXMIFile( xmiDirectory, new File( ViewUriUtil.getURI( jCas ).getPath() ) ); + } + + public static class XMIWriter extends JCasAnnotator_ImplBase { + + public static final String PARAM_XMI_DIRECTORY = "XMIDirectory"; + + @ConfigurationParameter( name = PARAM_XMI_DIRECTORY, mandatory = true ) + private File xmiDirectory; + + @Override + public void initialize( UimaContext context ) throws ResourceInitializationException { + super.initialize( context ); + if ( !this.xmiDirectory.exists() ) { + this.xmiDirectory.mkdirs(); + } + } + + @Override + public void process( JCas jCas ) throws AnalysisEngineProcessException { + File xmiFile = getXMIFile( this.xmiDirectory, jCas ); + try { + FileOutputStream outputStream = new FileOutputStream( xmiFile ); + try { + XmiCasSerializer serializer = new XmiCasSerializer( jCas.getTypeSystem() ); + ContentHandler handler = new XMLSerializer( outputStream, false ).getContentHandler(); + serializer.serialize( jCas.getCas(), handler ); + } finally { + outputStream.close(); + } + } catch ( SAXException e ) { + throw new AnalysisEngineProcessException( e ); + } catch ( IOException e ) { + throw new AnalysisEngineProcessException( e ); + } + } + } + + public static class XMIReader extends JCasAnnotator_ImplBase { + + public static final String PARAM_XMI_DIRECTORY = "XMIDirectory"; + + @ConfigurationParameter( name = PARAM_XMI_DIRECTORY, mandatory = true ) + private File xmiDirectory; + + @Override + public void process( JCas jCas ) throws AnalysisEngineProcessException { + File xmiFile = getXMIFile( this.xmiDirectory, jCas ); + try { + FileInputStream inputStream = new FileInputStream( xmiFile ); + try { + XmiCasDeserializer.deserialize( inputStream, jCas.getCas() ); + } finally { + inputStream.close(); + } + } catch ( SAXException e ) { + throw new AnalysisEngineProcessException( e ); + } catch ( IOException e ) { + throw new AnalysisEngineProcessException( e ); + } + } + } + + public static class TimexAnnotationCorrector extends JCasAnnotator_ImplBase { + @Override + public void process( JCas jCas ) throws AnalysisEngineProcessException { + JCas goldView, systemView; + try { + goldView = jCas.getView( GOLD_VIEW_NAME ); + systemView = jCas.getView( CAS.NAME_DEFAULT_SOFA ); + } catch ( CASException e ) { + e.printStackTrace(); + throw new AnalysisEngineProcessException(); + } + for ( TimeMention mention : JCasUtil.select( goldView, TimeMention.class ) ) { + // for each time expression, get the treebank node with the same span. + List<TreebankNode> nodes = JCasUtil.selectCovered( systemView, TreebankNode.class, mention ); + TreebankNode sameSpanNode = null; + for ( TreebankNode node : nodes ) { + if ( node.getBegin() == mention.getBegin() && node.getEnd() == mention.getEnd() ) { + sameSpanNode = node; + break; + } + } + if ( sameSpanNode != null ) { + // look at node at the position of the timex3. + if ( sameSpanNode.getNodeType().equals( "PP" ) ) { + // if it is a PP it should be moved down to the NP + int numChildren = sameSpanNode.getChildren().size(); + if ( numChildren == 2 && sameSpanNode.getChildren( 0 ).getNodeType().equals( "IN" ) && + sameSpanNode.getChildren( 1 ).getNodeType().equals( "NP" ) ) { + // move the time span to this node: + TreebankNode mentionNode = sameSpanNode.getChildren( numChildren - 1 ); + mention.setBegin( mentionNode.getBegin() ); + mention.setEnd( mentionNode.getEnd() ); + } + } + } else { + // if there is no matching tree span, see if the DT to the left would help. + // now adjust for missing DT to the left + List<TerminalTreebankNode> precedingPreterms = JCasUtil + .selectPreceding( systemView, TerminalTreebankNode.class, mention, 1 ); + if ( precedingPreterms != null && precedingPreterms.size() == 1 ) { + TerminalTreebankNode leftTerm = precedingPreterms.get( 0 ); + if ( leftTerm.getNodeType().equals( "DT" ) ) { + // now see if adding this would make it match a tree + List<TreebankNode> matchingNodes = JCasUtil + .selectCovered( systemView, TreebankNode.class, leftTerm.getBegin(), mention.getEnd() ); + for ( TreebankNode node : matchingNodes ) { + if ( node.getBegin() == leftTerm.getBegin() && node.getEnd() == mention.getEnd() ) { + sameSpanNode = node; + break; + } + } + if ( sameSpanNode != null ) { + // adding the DT to the left of th emention made it match a tree: + System.err.println( + "Adding DT: " + leftTerm.getCoveredText() + " to TIMEX: " + mention.getCoveredText() ); + mention.setBegin( leftTerm.getBegin() ); + } + } + } + } + } + } + } + + + public static class CopyFromGold extends JCasAnnotator_ImplBase { + + public static AnalysisEngineDescription getDescription( Class<?>... classes ) + throws ResourceInitializationException { + return AnalysisEngineFactory.createEngineDescription( + CopyFromGold.class, + CopyFromGold.PARAM_ANNOTATION_CLASSES, + classes ); + } + + public static final String PARAM_ANNOTATION_CLASSES = "AnnotationClasses"; + + @ConfigurationParameter( name = PARAM_ANNOTATION_CLASSES, mandatory = true ) + private Class<? extends TOP>[] annotationClasses; + + @Override + public void process( JCas jCas ) throws AnalysisEngineProcessException { + JCas goldView, systemView; + try { + goldView = jCas.getView( GOLD_VIEW_NAME ); + systemView = jCas.getView( CAS.NAME_DEFAULT_SOFA ); + } catch ( CASException e ) { + throw new AnalysisEngineProcessException( e ); + } + for ( Class<? extends TOP> annotationClass : this.annotationClasses ) { + for ( TOP annotation : Lists.newArrayList( JCasUtil.select( systemView, annotationClass ) ) ) { + if ( annotation.getClass().equals( annotationClass ) ) { + annotation.removeFromIndexes(); + } + } + } + CasCopier copier = new CasCopier( goldView.getCas(), systemView.getCas() ); + Feature sofaFeature = jCas.getTypeSystem().getFeatureByFullName( CAS.FEATURE_FULL_NAME_SOFA ); + for ( Class<? extends TOP> annotationClass : this.annotationClasses ) { + for ( TOP annotation : JCasUtil.select( goldView, annotationClass ) ) { + TOP copy = (TOP)copier.copyFs( annotation ); + if ( copy instanceof Annotation ) { + copy.setFeatureValue( sofaFeature, systemView.getSofa() ); + } + copy.addToIndexes( systemView ); + } + } + } + } + + public static class CopyFromSystem extends JCasAnnotator_ImplBase { + + public static AnalysisEngineDescription getDescription( Class<?>... classes ) + throws ResourceInitializationException { + return AnalysisEngineFactory.createEngineDescription( + CopyFromSystem.class, + CopyFromSystem.PARAM_ANNOTATION_CLASSES, + classes ); + } + + public static final String PARAM_ANNOTATION_CLASSES = "AnnotationClasses"; + + @ConfigurationParameter( name = PARAM_ANNOTATION_CLASSES, mandatory = true ) + private Class<? extends TOP>[] annotationClasses; + + @Override + public void process( JCas jCas ) throws AnalysisEngineProcessException { + JCas goldView, systemView; + try { + goldView = jCas.getView( GOLD_VIEW_NAME ); + systemView = jCas.getView( CAS.NAME_DEFAULT_SOFA ); + } catch ( CASException e ) { + throw new AnalysisEngineProcessException( e ); + } + for ( Class<? extends TOP> annotationClass : this.annotationClasses ) { + for ( TOP annotation : Lists.newArrayList( JCasUtil.select( goldView, annotationClass ) ) ) { + if ( annotation.getClass().equals( annotationClass ) ) { + annotation.removeFromIndexes(); + } + } + } + CasCopier copier = new CasCopier( systemView.getCas(), goldView.getCas() ); + Feature sofaFeature = jCas.getTypeSystem().getFeatureByFullName( CAS.FEATURE_FULL_NAME_SOFA ); + for ( Class<? extends TOP> annotationClass : this.annotationClasses ) { + for ( TOP annotation : JCasUtil.select( systemView, annotationClass ) ) { + TOP copy = (TOP)copier.copyFs( annotation ); + if ( copy instanceof Annotation ) { + copy.setFeatureValue( sofaFeature, goldView.getSofa() ); + } + copy.addToIndexes( goldView ); + } + } + } + } + + /* + * The following class overrides a ClearTK utility annotator class for reading + * a text file into a JCas. The code is copy/pasted so that one tiny modification + * can be made for this corpus -- replace a single odd character (0xc) with a + * space since it trips up xml output. + */ + public static class UriToDocumentTextAnnotatorCtakes extends UriToDocumentTextAnnotator { + + @Override + public void process( JCas jCas ) throws AnalysisEngineProcessException { + URI uri = ViewUriUtil.getURI( jCas ); + String content; + + try { + content = CharStreams.toString( new InputStreamReader( uri.toURL().openStream() ) ); + content = content.replace( (char)0xc, ' ' ); + jCas.setSofaDataString( content, "text/plain" ); + } catch ( MalformedURLException e ) { + throw new AnalysisEngineProcessException( e ); + } catch ( IOException e ) { + throw new AnalysisEngineProcessException( e ); + } + } + } + + public static class WriteI2B2XML extends JCasAnnotator_ImplBase { + public static final String PARAM_OUTPUT_DIR = "PARAM_OUTPUT_DIR"; + @ConfigurationParameter( mandatory = true, description = "Output directory to write xml files to.", name = PARAM_OUTPUT_DIR ) + protected String outputDir; + + @Override + public void process( JCas jcas ) throws AnalysisEngineProcessException { + try { + // get the output file name from the input file name and output directory. + File outDir = new File( outputDir ); + if ( !outDir.exists() ) { + outDir.mkdirs(); + } + File inFile = new File( ViewUriUtil.getURI( jcas ) ); + String outFile = inFile.getName().replace( ".txt", "" ); + + // build the xml + DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); + DocumentBuilder docBuilder = docFactory.newDocumentBuilder(); + Document doc = docBuilder.newDocument(); + Element rootElement = doc.createElement( "ClinicalNarrativeTemporalAnnotation" ); + Element textElement = doc.createElement( "TEXT" ); + Element tagsElement = doc.createElement( "TAGS" ); + textElement.setTextContent( jcas.getDocumentText() ); + rootElement.appendChild( textElement ); + rootElement.appendChild( tagsElement ); + doc.appendChild( rootElement ); + + Map<IdentifiedAnnotation, String> argToId = new HashMap<>(); + int id = 0; + for ( TimeMention timex : JCasUtil.select( jcas, TimeMention.class ) ) { + Element timexElement = doc.createElement( "TIMEX3" ); + String timexID = "T" + id; + id++; + argToId.put( timex, timexID ); + timexElement.setAttribute( "id", timexID ); + timexElement.setAttribute( "start", String.valueOf( timex.getBegin() + 1 ) ); + timexElement.setAttribute( "end", String.valueOf( timex.getEnd() + 1 ) ); + timexElement.setAttribute( "text", timex.getCoveredText() ); + timexElement.setAttribute( "type", "NA" ); + timexElement.setAttribute( "val", "NA" ); + timexElement.setAttribute( "mod", "NA" ); + tagsElement.appendChild( timexElement ); + } + + id = 0; + for ( EventMention event : JCasUtil.select( jcas, EventMention.class ) ) { + if ( event.getClass().equals( EventMention.class ) ) { + // this ensures we are only looking at THYME events and not ctakes-dictionary-lookup events + Element eventEl = doc.createElement( "EVENT" ); + String eventID = "E" + id; + id++; + argToId.put( event, eventID ); + eventEl.setAttribute( "id", eventID ); + eventEl.setAttribute( "start", String.valueOf( event.getBegin() + 1 ) ); + eventEl.setAttribute( "end", String.valueOf( event.getEnd() + 1 ) ); + eventEl.setAttribute( "text", event.getCoveredText() ); + eventEl.setAttribute( "modality", "NA" ); + eventEl.setAttribute( "polarity", "NA" ); + eventEl.setAttribute( "type", "NA" ); + tagsElement.appendChild( eventEl ); + } + } + + id = 0; + for ( TemporalTextRelation rel : JCasUtil.select( jcas, TemporalTextRelation.class ) ) { + Element linkEl = doc.createElement( "TLINK" ); + String linkID = "TL" + id; + id++; + linkEl.setAttribute( "id", linkID ); + Annotation arg1 = rel.getArg1().getArgument(); + linkEl.setAttribute( "fromID", argToId.get( arg1 ) ); + linkEl.setAttribute( "fromText", arg1.getCoveredText() ); + Annotation arg2 = rel.getArg2().getArgument(); + if ( arg2 != null ) { + linkEl.setAttribute( "toID", argToId.get( arg2 ) ); + linkEl.setAttribute( "toText", arg2.getCoveredText() ); + } else { + linkEl.setAttribute( "toID", "Discharge" ); + linkEl.setAttribute( "toText", "Discharge" ); + } + linkEl.setAttribute( "type", rel.getCategory() ); + tagsElement.appendChild( linkEl ); + } + + // boilerplate xml-writing code: + TransformerFactory transformerFactory = TransformerFactory.newInstance(); + Transformer transformer = transformerFactory.newTransformer(); + transformer.setOutputProperty( OutputKeys.INDENT, "yes" ); + transformer.setOutputProperty( OutputKeys.METHOD, "xml" ); + DOMSource source = new DOMSource( doc ); + StreamResult result = new StreamResult( new File( outputDir, outFile ) ); + transformer.transform( source, result ); + } catch ( ParserConfigurationException e ) { + e.printStackTrace(); + throw new AnalysisEngineProcessException( e ); + } catch ( TransformerConfigurationException e ) { + e.printStackTrace(); + throw new AnalysisEngineProcessException( e ); + } catch ( TransformerException e ) { + e.printStackTrace(); + throw new AnalysisEngineProcessException( e ); + } + + } + + } + + public static class WriteAnaforaXML extends JCasAnnotator_ImplBase { + public static final String PARAM_OUTPUT_DIR = "PARAM_OUTPUT_DIR";
[... 303 lines stripped ...]