Evaluation_ImplBase.java

clin Tue, 18 Oct 2016 11:58:29 -0700

Modified: 
ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java
URL: 
http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java?rev=1765496&r1=1765495&r2=1765496&view=diff
==============================================================================
--- 
ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java
 (original)
+++ 
ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java
 Tue Oct 18 18:57:29 2016
@@ -109,297 +109,297 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 public abstract class Evaluation_ImplBase<STATISTICS_TYPE> extends
-                                                           
org.cleartk.eval.Evaluation_ImplBase<Integer, STATISTICS_TYPE> {
+org.cleartk.eval.Evaluation_ImplBase<Integer, STATISTICS_TYPE> {
 
-   private static Logger LOGGER = Logger.getLogger( Evaluation_ImplBase.class 
);
+       static Logger LOGGER = Logger.getLogger( Evaluation_ImplBase.class );
 
-   private static final String LOOKUP_PATH = 
"/org/apache/ctakes/temporal/badEEContainNotes.txt";
+       private static final String LOOKUP_PATH = 
"/org/apache/ctakes/temporal/badEEContainNotes.txt";
 
-   private static boolean isTraining;
+       private static boolean isTraining;
 
-   public static HashSet<String> badNotes;
-
-   public static final String GOLD_VIEW_NAME = "GoldView";
-
-   public static final String PROB_VIEW_NAME = "ProbView";
-   
-   public enum XMLFormat {Knowtator, Anafora, I2B2}
-
-   public enum Subcorpus {Colon, Brain, DeepPhe}
-
-   public static interface Options {
-
-      @Option( longName = "text", defaultToNull = true )
-      public File getRawTextDirectory();
-
-      @Option( longName = "xml" )
-      public File getXMLDirectory();
-
-      @Option( longName = "format", defaultValue = "Anafora" )
-      public XMLFormat getXMLFormat();
-
-      @Option( longName = "subcorpus", defaultValue = "Colon" )
-      public Subcorpus getSubcorpus();
-
-      @Option( longName = "xmi" )
-      public File getXMIDirectory();
-
-      @Option( longName = "patients" )
-      public CommandLine.IntegerRanges getPatients();
-
-//      @Option( longName = "train-remainders", defaultValue = "0-2" )
-//      public CommandLine.IntegerRanges getTrainRemainders();
-//
-//      @Option( longName = "dev-remainders", defaultValue = "3" )
-//      public CommandLine.IntegerRanges getDevRemainders();
-//
-//      @Option( longName = "test-remainders", defaultValue = "4-5" )
-//      public CommandLine.IntegerRanges getTestRemainders();
-      
-      @Option( longName = "train-remainders", defaultValue = "0-3" )
-      public CommandLine.IntegerRanges getTrainRemainders();
-
-      @Option( longName = "dev-remainders", defaultValue = "4-5" )
-      public CommandLine.IntegerRanges getDevRemainders();
-
-      @Option( longName = "test-remainders", defaultValue = "6-7" )
-      public CommandLine.IntegerRanges getTestRemainders();
-
-      @Option( longName = "treebank", defaultToNull = true )
-      public File getTreebankDirectory();
-
-      @Option
-      public boolean getUseGoldTrees();
-
-      @Option
-      public boolean getGrid();
-
-      @Option
-      public boolean getPrintErrors();
-
-      @Option
-      public boolean getPrintOverlappingSpans();
-
-      @Option
-      public boolean getTest();
-
-      @Option( longName = "kernelParams", defaultToNull = true )
-      public String getKernelParams();
-
-      @Option( defaultToNull = true )
-      public String getI2B2Output();
-      
-      @Option( defaultToNull = true )
-      public String getAnaforaOutput();
-
-      @Option
-      public boolean getSkipTrain();
-      
-      @Option(longName = "skipWrite")
-      public boolean getSkipDataWriting();
-   }
-
-   public static List<Integer> getTrainItems( Options options ) {
-      List<Integer> patientSets = options.getPatients().getList();
-      List<Integer> trainItems = THYMEData.getPatientSets( patientSets, 
options.getTrainRemainders().getList() );
-      if ( options.getTest() ) {
-         trainItems.addAll( THYMEData.getPatientSets( patientSets, 
options.getDevRemainders().getList() ) );
-      }
-      return trainItems;
-   }
-
-   public static List<Integer> getTestItems( Options options ) {
-      List<Integer> patientSets = options.getPatients().getList();
-      List<Integer> testItems;
-      if ( options.getTest() ) {
-         testItems = THYMEData.getPatientSets( patientSets, 
options.getTestRemainders().getList() );
-      } else {
-         testItems = THYMEData.getPatientSets( patientSets, 
options.getDevRemainders().getList() );
-      }
-      return testItems;
-   }
-
-   protected File rawTextDirectory;
-
-   protected File xmlDirectory;
-
-   protected XMLFormat xmlFormat;
-
-   protected Subcorpus subcorpus;
-
-   protected File xmiDirectory;
-
-   private boolean xmiExists;
-
-   protected File treebankDirectory;
-
-   protected boolean printErrors = false;
-
-   protected boolean printOverlapping = false;
-
-   protected String i2b2Output = null;
-
-   protected String anaforaOutput = null; 
-   
-   protected String[] kernelParams;
-
-   public Evaluation_ImplBase(
-         File baseDirectory,
-         File rawTextDirectory,
-         File xmlDirectory,
-         XMLFormat xmlFormat,
-         Subcorpus subcorpus,
-         File xmiDirectory,
-         File treebankDirectory ) {
-      super( baseDirectory );
-      this.rawTextDirectory = rawTextDirectory;
-      this.xmlDirectory = xmlDirectory;
-      this.xmlFormat = xmlFormat;
-      this.subcorpus = subcorpus;
-      this.xmiDirectory = xmiDirectory;
-      this.xmiExists = this.xmiDirectory.exists() && 
this.xmiDirectory.listFiles().length > 0;
-      this.treebankDirectory = treebankDirectory;
-
-      this.isTraining = true;
-      this.badNotes = new HashSet<>();
-      URL url = TimeWordsExtractor.class.getResource( LOOKUP_PATH );
-      try ( BufferedReader br = new BufferedReader( new FileReader( 
url.getFile() ) ) ) {
-         String line;
-         while ( (line = br.readLine()) != null ) {
-            badNotes.add( line.trim() );
-         }
-      } catch ( FileNotFoundException e ) {
-         // TODO Auto-generated catch block
-         e.printStackTrace();
-      } catch ( IOException e ) {
-         // TODO Auto-generated catch block
-         e.printStackTrace();
-      }
-   }
-
-   public void setI2B2Output( String outDir ) {
-      i2b2Output = outDir;
-   }
-
-   public void prepareXMIsFor( List<Integer> patientSets ) throws Exception {
-      boolean needsXMIs = false;
-      for ( File textFile : this.getFilesFor( patientSets ) ) {
-         if ( !getXMIFile( this.xmiDirectory, textFile ).exists() ) {
-            needsXMIs = true;
-            break;
-         }
-      }
-      if ( needsXMIs ) {
-         CollectionReader reader = this.getCollectionReader( patientSets );
-         AnalysisEngine engine = 
this.getXMIWritingPreprocessorAggregateBuilder().createAggregate();
-         SimplePipeline.runPipeline( reader, engine );
-      }
-      this.xmiExists = true;
-   }
-
-   private List<File> getFilesFor( List<Integer> patientSets ) throws 
FileNotFoundException {
-      List<File> files = new ArrayList<>();
-      if ( this.xmlFormat == XMLFormat.Anafora ) {
-         Set<String> ids = new HashSet<>();
-         for ( Integer set : patientSets ) {
-            if ( this.subcorpus == Subcorpus.Colon ) {
-               ids.add( String.format( "ID%03d", set ) );
-            } else if ( this.subcorpus == Subcorpus.DeepPhe ) {
-               ids.add( String.format( "patient%02d", set ) );
-            } else {
-               ids.add( String.format( "doc%04d", set ) );
-            }
-         }
-         int filePrefixLen = 5; // Colon: "ID\d{3}"
-         if ( this.subcorpus == Subcorpus.Brain ) {
-            filePrefixLen = 7; // Brain: "doc\d{4}"
-         } else if ( this.subcorpus == Subcorpus.DeepPhe ) {
-            filePrefixLen = 9; // deepPhe: "patient\d{2}"
-         }
-         if ( this.subcorpus == Subcorpus.DeepPhe ) {
-            for ( File dir : this.xmlDirectory.listFiles() ) {
-               if ( dir.isDirectory() ) {
-                  if ( ids.contains( dir.getName().substring( 0, filePrefixLen 
) ) ) {
-                     File file = new File( dir, dir.getName() );
-                     if ( file.exists() ) {
-                        files.add( file );
-                     } else {
-                        LOGGER.warn( "Missing note: " + file );
-                     }
-                  }
-               }
-            }
-         } else {
-            for ( String section : THYMEData.SECTIONS ) {
-               File xmlSubdir = new File( this.xmlDirectory, section );
-               for ( File dir : xmlSubdir.listFiles() ) {
-                  if ( dir.isDirectory() ) {
-                     if ( ids.contains( dir.getName().substring( 0, 
filePrefixLen ) ) ) {
-                        File file = new File( dir, dir.getName() );
-                        if ( file.exists() ) {
-                           files.add( file );
-                        } else {
-                           LOGGER.warn( "Missing note: " + file );
-                        }
-                     }
-                  }
-               }
-            }
-         }
-      } else if ( this.xmlFormat == XMLFormat.I2B2 ) {
-         File trainDir = new File( this.xmlDirectory, "training" );
-         File testDir = new File( this.xmlDirectory, "test" );
-         for ( Integer pt : patientSets ) {
-            File xmlTrain = new File( trainDir, pt + ".xml" );
-            File train = new File( trainDir, pt + ".xml.txt" );
-            if ( train.exists() ) {
-               if ( xmlTrain.exists() ) {
-                  files.add( train );
-               } else {
-                  System.err.println( "Text file in training has no 
corresponding xml -- skipping: " + train );
-               }
-            }
-            File xmlTest = new File( testDir, pt + ".xml" );
-            File test = new File( testDir, pt + ".xml.txt" );
-            if ( xmlTest.exists() ) {
-               if ( test.exists() ) {
-                  files.add( test );
-               } else {
-                  throw new FileNotFoundException( "Could not find the test 
text file -- for cTAKES usage you must copy the text files into the xml 
directory for the test set." );
-               }
-            }
-            assert !(train.exists() && test.exists());
-         }
-      } else if ( xmlFormat == XMLFormat.Knowtator ) {
-         LOGGER.warn( "This is an old annotation format -- please upgrade to 
using anafora files." );
-         for ( Integer set : patientSets ) {
-            final int setNum = set;
-            for ( File file : rawTextDirectory.listFiles( new FilenameFilter() 
{
-               @Override
-               public boolean accept( File dir, String name ) {
-                  return name.contains( String.format( "ID%03d", setNum ) );
-               }
-            } ) ) {
-               // skip hidden files like .svn
-               if ( !file.isHidden() ) {
-                  files.add( file );
-               }
-            }
-         }
-      } else {
-         LOGGER.error( "Unknown data format -- please specify Anafora, i2b2, 
or Knowtator format." );
-      }
-      return files;
-   }
-
-   @Override
-   protected CollectionReader getCollectionReader( List<Integer> patientSets ) 
throws Exception {
-      List<File> collectedFiles = this.getFilesFor( patientSets );
-      Collections.sort(collectedFiles);
-//      for(File file : collectedFiles){
-//       System.err.println(file.getName());
-//      }
-      /**
+       public static HashSet<String> badNotes;
+
+       public static final String GOLD_VIEW_NAME = "GoldView";
+
+       public static final String PROB_VIEW_NAME = "ProbView";
+
+       public enum XMLFormat {Knowtator, Anafora, I2B2}
+
+       public enum Subcorpus {Colon, Brain, DeepPhe}
+
+       public static interface Options {
+
+               @Option( longName = "text", defaultToNull = true )
+               public File getRawTextDirectory();
+
+               @Option( longName = "xml" )
+               public File getXMLDirectory();
+
+               @Option( longName = "format", defaultValue = "Anafora" )
+               public XMLFormat getXMLFormat();
+
+               @Option( longName = "subcorpus", defaultValue = "Colon" )
+               public Subcorpus getSubcorpus();
+
+               @Option( longName = "xmi" )
+               public File getXMIDirectory();
+
+               @Option( longName = "patients" )
+               public CommandLine.IntegerRanges getPatients();
+
+               //      @Option( longName = "train-remainders", defaultValue = 
"0-2" )
+               //      public CommandLine.IntegerRanges getTrainRemainders();
+               //
+               //      @Option( longName = "dev-remainders", defaultValue = 
"3" )
+               //      public CommandLine.IntegerRanges getDevRemainders();
+               //
+               //      @Option( longName = "test-remainders", defaultValue = 
"4-5" )
+               //      public CommandLine.IntegerRanges getTestRemainders();
+
+               @Option( longName = "train-remainders", defaultValue = "0-3" )
+               public CommandLine.IntegerRanges getTrainRemainders();
+
+               @Option( longName = "dev-remainders", defaultValue = "4-5" )
+               public CommandLine.IntegerRanges getDevRemainders();
+
+               @Option( longName = "test-remainders", defaultValue = "6-7" )
+               public CommandLine.IntegerRanges getTestRemainders();
+
+               @Option( longName = "treebank", defaultToNull = true )
+               public File getTreebankDirectory();
+
+               @Option
+               public boolean getUseGoldTrees();
+
+               @Option
+               public boolean getGrid();
+
+               @Option
+               public boolean getPrintErrors();
+
+               @Option
+               public boolean getPrintOverlappingSpans();
+
+               @Option
+               public boolean getTest();
+
+               @Option( longName = "kernelParams", defaultToNull = true )
+               public String getKernelParams();
+
+               @Option( defaultToNull = true )
+               public String getI2B2Output();
+
+               @Option( defaultToNull = true )
+               public String getAnaforaOutput();
+
+               @Option
+               public boolean getSkipTrain();
+
+               @Option(longName = "skipWrite")
+               public boolean getSkipDataWriting();
+       }
+
+       public static List<Integer> getTrainItems( Options options ) {
+               List<Integer> patientSets = options.getPatients().getList();
+               List<Integer> trainItems = THYMEData.getPatientSets( 
patientSets, options.getTrainRemainders().getList() );
+               if ( options.getTest() ) {
+                       trainItems.addAll( THYMEData.getPatientSets( 
patientSets, options.getDevRemainders().getList() ) );
+               }
+               return trainItems;
+       }
+
+       public static List<Integer> getTestItems( Options options ) {
+               List<Integer> patientSets = options.getPatients().getList();
+               List<Integer> testItems;
+               if ( options.getTest() ) {
+                       testItems = THYMEData.getPatientSets( patientSets, 
options.getTestRemainders().getList() );
+               } else {
+                       testItems = THYMEData.getPatientSets( patientSets, 
options.getDevRemainders().getList() );
+               }
+               return testItems;
+       }
+
+       protected File rawTextDirectory;
+
+       protected File xmlDirectory;
+
+       protected XMLFormat xmlFormat;
+
+       protected Subcorpus subcorpus;
+
+       protected File xmiDirectory;
+
+       private boolean xmiExists;
+
+       protected File treebankDirectory;
+
+       protected boolean printErrors = false;
+
+       protected boolean printOverlapping = false;
+
+       protected String i2b2Output = null;
+
+       protected String anaforaOutput = null; 
+
+       protected String[] kernelParams;
+
+       public Evaluation_ImplBase(
+                       File baseDirectory,
+                       File rawTextDirectory,
+                       File xmlDirectory,
+                       XMLFormat xmlFormat,
+                       Subcorpus subcorpus,
+                       File xmiDirectory,
+                       File treebankDirectory ) {
+               super( baseDirectory );
+               this.rawTextDirectory = rawTextDirectory;
+               this.xmlDirectory = xmlDirectory;
+               this.xmlFormat = xmlFormat;
+               this.subcorpus = subcorpus;
+               this.xmiDirectory = xmiDirectory;
+               this.xmiExists = this.xmiDirectory.exists() && 
this.xmiDirectory.listFiles().length > 0;
+               this.treebankDirectory = treebankDirectory;
+
+               this.isTraining = true;
+               this.badNotes = new HashSet<>();
+               URL url = TimeWordsExtractor.class.getResource( LOOKUP_PATH );
+               try ( BufferedReader br = new BufferedReader( new FileReader( 
url.getFile() ) ) ) {
+                       String line;
+                       while ( (line = br.readLine()) != null ) {
+                               badNotes.add( line.trim() );
+                       }
+               } catch ( FileNotFoundException e ) {
+                       // TODO Auto-generated catch block
+                       e.printStackTrace();
+               } catch ( IOException e ) {
+                       // TODO Auto-generated catch block
+                       e.printStackTrace();
+               }
+       }
+
+       public void setI2B2Output( String outDir ) {
+               i2b2Output = outDir;
+       }
+
+       public void prepareXMIsFor( List<Integer> patientSets ) throws 
Exception {
+               boolean needsXMIs = false;
+               for ( File textFile : this.getFilesFor( patientSets ) ) {
+                       if ( !getXMIFile( this.xmiDirectory, textFile 
).exists() ) {
+                               needsXMIs = true;
+                               break;
+                       }
+               }
+               if ( needsXMIs ) {
+                       CollectionReader reader = this.getCollectionReader( 
patientSets );
+                       AnalysisEngine engine = 
this.getXMIWritingPreprocessorAggregateBuilder().createAggregate();
+                       SimplePipeline.runPipeline( reader, engine );
+               }
+               this.xmiExists = true;
+       }
+
+       private List<File> getFilesFor( List<Integer> patientSets ) throws 
FileNotFoundException {
+               List<File> files = new ArrayList<>();
+               if ( this.xmlFormat == XMLFormat.Anafora ) {
+                       Set<String> ids = new HashSet<>();
+                       for ( Integer set : patientSets ) {
+                               if ( this.subcorpus == Subcorpus.Colon ) {
+                                       ids.add( String.format( "ID%03d", set ) 
);
+                               } else if ( this.subcorpus == Subcorpus.DeepPhe 
) {
+                                       ids.add( String.format( "patient%02d", 
set ) );
+                               } else {
+                                       ids.add( String.format( "doc%04d", set 
) );
+                               }
+                       }
+                       int filePrefixLen = 5; // Colon: "ID\d{3}"
+                       if ( this.subcorpus == Subcorpus.Brain ) {
+                               filePrefixLen = 7; // Brain: "doc\d{4}"
+                       } else if ( this.subcorpus == Subcorpus.DeepPhe ) {
+                               filePrefixLen = 9; // deepPhe: "patient\d{2}"
+                       }
+                       if ( this.subcorpus == Subcorpus.DeepPhe ) {
+                               for ( File dir : this.xmlDirectory.listFiles() 
) {
+                                       if ( dir.isDirectory() ) {
+                                               if ( ids.contains( 
dir.getName().substring( 0, filePrefixLen ) ) ) {
+                                                       File file = new File( 
dir, dir.getName() );
+                                                       if ( file.exists() ) {
+                                                               files.add( file 
);
+                                                       } else {
+                                                               LOGGER.warn( 
"Missing note: " + file );
+                                                       }
+                                               }
+                                       }
+                               }
+                       } else {
+                               for ( String section : THYMEData.SECTIONS ) {
+                                       File xmlSubdir = new File( 
this.xmlDirectory, section );
+                                       for ( File dir : xmlSubdir.listFiles() 
) {
+                                               if ( dir.isDirectory() ) {
+                                                       if ( ids.contains( 
dir.getName().substring( 0, filePrefixLen ) ) ) {
+                                                               File file = new 
File( dir, dir.getName() );
+                                                               if ( 
file.exists() ) {
+                                                                       
files.add( file );
+                                                               } else {
+                                                                       
LOGGER.warn( "Missing note: " + file );
+                                                               }
+                                                       }
+                                               }
+                                       }
+                               }
+                       }
+               } else if ( this.xmlFormat == XMLFormat.I2B2 ) {
+                       File trainDir = new File( this.xmlDirectory, "training" 
);
+                       File testDir = new File( this.xmlDirectory, "test" );
+                       for ( Integer pt : patientSets ) {
+                               File xmlTrain = new File( trainDir, pt + ".xml" 
);
+                               File train = new File( trainDir, pt + 
".xml.txt" );
+                               if ( train.exists() ) {
+                                       if ( xmlTrain.exists() ) {
+                                               files.add( train );
+                                       } else {
+                                               System.err.println( "Text file 
in training has no corresponding xml -- skipping: " + train );
+                                       }
+                               }
+                               File xmlTest = new File( testDir, pt + ".xml" );
+                               File test = new File( testDir, pt + ".xml.txt" 
);
+                               if ( xmlTest.exists() ) {
+                                       if ( test.exists() ) {
+                                               files.add( test );
+                                       } else {
+                                               throw new 
FileNotFoundException( "Could not find the test text file -- for cTAKES usage 
you must copy the text files into the xml directory for the test set." );
+                                       }
+                               }
+                               assert !(train.exists() && test.exists());
+                       }
+               } else if ( xmlFormat == XMLFormat.Knowtator ) {
+                       LOGGER.warn( "This is an old annotation format -- 
please upgrade to using anafora files." );
+                       for ( Integer set : patientSets ) {
+                               final int setNum = set;
+                               for ( File file : rawTextDirectory.listFiles( 
new FilenameFilter() {
+                                       @Override
+                                       public boolean accept( File dir, String 
name ) {
+                                               return name.contains( 
String.format( "ID%03d", setNum ) );
+                                       }
+                               } ) ) {
+                                       // skip hidden files like .svn
+                                       if ( !file.isHidden() ) {
+                                               files.add( file );
+                                       }
+                               }
+                       }
+               } else {
+                       LOGGER.error( "Unknown data format -- please specify 
Anafora, i2b2, or Knowtator format." );
+               }
+               return files;
+       }
+
+       @Override
+       protected CollectionReader getCollectionReader( List<Integer> 
patientSets ) throws Exception {
+               List<File> collectedFiles = this.getFilesFor( patientSets );
+               Collections.sort(collectedFiles);
+               //      for(File file : collectedFiles){
+               //        System.err.println(file.getName());
+               //      }
+               /**
        if(isTraining){
        final Collection<File> filesToRemove = new HashSet<>();
        for ( File xmiFile : collectedFiles ) {
@@ -412,938 +412,940 @@ public abstract class Evaluation_ImplBas
        collectedFiles.removeAll( filesToRemove );
        }
        isTraining = false;
-       */
-      return UriCollectionReader.getCollectionReaderFromFiles( collectedFiles 
);
-   }
-
-   protected AggregateBuilder getPreprocessorAggregateBuilder() throws 
Exception {
-      return this.xmiExists
-             ? this.getXMIReadingPreprocessorAggregateBuilder()
-             : this.getXMIWritingPreprocessorAggregateBuilder();
-   }
-
-   protected AggregateBuilder getXMIReadingPreprocessorAggregateBuilder() 
throws UIMAException {
-      AggregateBuilder aggregateBuilder = new AggregateBuilder();
-      aggregateBuilder.add( UriToDocumentTextAnnotator.getDescription() );
-      aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
-            XMIReader.class,
-            XMIReader.PARAM_XMI_DIRECTORY,
-            this.xmiDirectory ) );
-      return aggregateBuilder;
-   }
-
-   protected AggregateBuilder getXMIWritingPreprocessorAggregateBuilder()
-         throws Exception {
-      AggregateBuilder aggregateBuilder = new AggregateBuilder();
-      aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( 
UriToDocumentTextAnnotatorCtakes.class ) );
-
-      // read manual annotations into gold view
-      aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
-            ViewCreatorAnnotator.class,
-            ViewCreatorAnnotator.PARAM_VIEW_NAME,
-            GOLD_VIEW_NAME ) );
-      aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
-            ViewTextCopierAnnotator.class,
-            ViewTextCopierAnnotator.PARAM_SOURCE_VIEW_NAME,
-            CAS.NAME_DEFAULT_SOFA,
-            ViewTextCopierAnnotator.PARAM_DESTINATION_VIEW_NAME,
-            GOLD_VIEW_NAME ) );
-      switch ( this.xmlFormat ) {
-         case Anafora:
-           if(this.subcorpus == Subcorpus.DeepPhe){
-            aggregateBuilder.add(
-                  
AnalysisEngineFactory.createEngineDescription(THYMEAnaforaXMLReader.class,
-                      THYMEAnaforaXMLReader.PARAM_ANAFORA_DIRECTORY,
-                      this.xmlDirectory,
-                      THYMEAnaforaXMLReader.PARAM_ANAFORA_XML_SUFFIXES,
-                      new String[]{} ),
-                  CAS.NAME_DEFAULT_SOFA,
-                  GOLD_VIEW_NAME );
-           }else{
-            aggregateBuilder.add(
-                  THYMEAnaforaXMLReader.getDescription( this.xmlDirectory ),
-                  CAS.NAME_DEFAULT_SOFA,
-                  GOLD_VIEW_NAME );
-           }
-            break;
-         case Knowtator:
-            aggregateBuilder.add(
-                  THYMEKnowtatorXMLReader.getDescription( this.xmlDirectory ),
-                  CAS.NAME_DEFAULT_SOFA,
-                  GOLD_VIEW_NAME );
-            break;
-         case I2B2:
-            aggregateBuilder.add(
-                  I2B2TemporalXMLReader.getDescription( this.xmlDirectory ),
-                  CAS.NAME_DEFAULT_SOFA,
-                  GOLD_VIEW_NAME );
-            break;
-      }
-
-      // identify segments
-      if(this.subcorpus == Subcorpus.DeepPhe){
-        
aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(PittHeaderAnnotator.class));
-      }else{
-        aggregateBuilder
-        .add( AnalysisEngineFactory.createEngineDescription( 
SegmentsFromBracketedSectionTagsAnnotator.class ) );
-      }
-      // identify sentences
-      aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
-            SentenceDetector.class,
-            SentenceDetector.SD_MODEL_FILE_PARAM,
-            "org/apache/ctakes/core/sentdetect/sd-med-model.zip" ) );
-//      aggregateBuilder.add(SentenceDetectorAnnotator.getDescription(FileLocator.locateFile("org/apache/ctakes/core/sentdetect/model.jar").getPath()));
-      
-      // identify tokens
-      aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( 
TokenizerAnnotatorPTB.class ) );
-      // merge some tokens
-      aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( 
ContextDependentTokenizerAnnotator.class ) );
-
-      // identify part-of-speech tags
-      aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
-            POSTagger.class,
-            TypeSystemDescriptionFactory.createTypeSystemDescription(),
-            TypePrioritiesFactory.createTypePriorities( Segment.class, 
Sentence.class, BaseToken.class ),
-            POSTagger.POS_MODEL_FILE_PARAM,
-            "org/apache/ctakes/postagger/models/mayo-pos.zip" ) );
-
-      // identify chunks
-      aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
-            Chunker.class,
-            Chunker.CHUNKER_MODEL_FILE_PARAM,
-            FileLocator.locateFile( "org/apache/ctakes/chunker/models/chunker-model.zip" ),
-            Chunker.CHUNKER_CREATOR_CLASS_PARAM,
-            DefaultChunkCreator.class ) );
-
-      // identify UMLS named entities
-
-      // adjust NP in NP NP to span both
-      aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
-            ChunkAdjuster.class,
-            ChunkAdjuster.PARAM_CHUNK_PATTERN,
-            new String[] { "NP", "NP" },
-            ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
-            1 ) );
-      // adjust NP in NP PP NP to span all three
-      aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
-            ChunkAdjuster.class,
-            ChunkAdjuster.PARAM_CHUNK_PATTERN,
-            new String[] { "NP", "PP", "NP" },
-            ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
-            2 ) );
-      // add lookup windows for each NP
-      aggregateBuilder
-            .add( AnalysisEngineFactory.createEngineDescription( 
CopyNPChunksToLookupWindowAnnotations.class ) );
-      // maximize lookup windows
-      aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
-            OverlapAnnotator.class,
-            "A_ObjectClass",
-            LookupWindowAnnotation.class,
-            "B_ObjectClass",
-            LookupWindowAnnotation.class,
-            "OverlapType",
-            "A_ENV_B",
-            "ActionType",
-            "DELETE",
-            "DeleteAction",
-            new String[] { "selector=B" } ) );
-      // add UMLS on top of lookup windows
-      aggregateBuilder.add( 
DefaultJCasTermAnnotator.createAnnotatorDescription() );
-
-      aggregateBuilder.add( LvgAnnotator.createAnnotatorDescription() );
-
-      // add dependency parser
-      aggregateBuilder.add( 
ClearNLPDependencyParserAE.createAnnotatorDescription() );
-
-      // add semantic role labeler
-      aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( 
ClearNLPSemanticRoleLabelerAE.class ) );
-
-      // add gold standard parses to gold view, and adjust gold view to correct a few annotation mis-steps
-      if ( this.treebankDirectory != null ) {
-         aggregateBuilder.add( THYMETreebankReader.getDescription( 
this.treebankDirectory ) );
-         aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( 
TimexAnnotationCorrector.class ) );
-      } else {
-         // add ctakes constituency parses to system view
-         aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( 
ConstituencyParser.class,
-               ConstituencyParser.PARAM_MODEL_FILENAME,
-               "org/apache/ctakes/constituency/parser/models/thyme.bin" ) );
-      }
-      // write out the CAS after all the above annotations
-      aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
-            XMIWriter.class,
-            XMIWriter.PARAM_XMI_DIRECTORY,
-            this.xmiDirectory ) );
-
-      return aggregateBuilder;
-   }
-
-   public static <T extends Annotation> List<T> selectExact( JCas jCas, 
Class<T> annotationClass, Segment segment ) {
-      List<T> annotations = Lists.newArrayList();
-      for ( T annotation : JCasUtil.selectCovered( jCas, annotationClass, 
segment ) ) {
-         if ( annotation.getClass().equals( annotationClass ) ) {
-            annotations.add( annotation );
-         }
-      }
-      return annotations;
-   }
-
-   public static class CopyNPChunksToLookupWindowAnnotations extends 
JCasAnnotator_ImplBase {
-
-      @Override
-      public void process( JCas jCas ) throws AnalysisEngineProcessException {
-         for ( Chunk chunk : JCasUtil.select( jCas, Chunk.class ) ) {
-            if ( chunk.getChunkType().equals( "NP" ) ) {
-               new LookupWindowAnnotation( jCas, chunk.getBegin(), 
chunk.getEnd() ).addToIndexes();
-            }
-         }
-      }
-   }
-
-   public static class RemoveEnclosedLookupWindows extends 
JCasAnnotator_ImplBase {
-
-      @Override
-      public void process( JCas jCas ) throws AnalysisEngineProcessException {
-         List<LookupWindowAnnotation> lws = new ArrayList<>( JCasUtil.select( 
jCas, LookupWindowAnnotation.class ) );
-         // we'll navigate backwards so that as we delete things we shorten the list from the back
-         for ( int i = lws.size() - 2; i >= 0; i-- ) {
-            LookupWindowAnnotation lw1 = lws.get( i );
-            LookupWindowAnnotation lw2 = lws.get( i + 1 );
-            if ( lw1.getBegin() <= lw2.getBegin() && lw1.getEnd() >= 
lw2.getEnd() ) {
-               /// lw1 envelops or encloses lw2
-               lws.remove( i + 1 );
-               lw2.removeFromIndexes();
-            }
-         }
-
-      }
-
-   }
-
-   public static class EntityMentionRemover extends JCasAnnotator_ImplBase {
-
-      @Override
-      public void process( JCas jCas ) throws AnalysisEngineProcessException {
-         for ( EntityMention mention : Lists.newArrayList( JCasUtil.select( 
jCas, EntityMention.class ) ) ) {
-            mention.removeFromIndexes();
-         }
-      }
-   }
-
-   public static class EventMentionRemover extends JCasAnnotator_ImplBase {
-
-      @Override
-      public void process( JCas jCas ) throws AnalysisEngineProcessException {
-         for ( EventMention mention : Lists.newArrayList( JCasUtil.select( 
jCas, EventMention.class ) ) ) {
-            mention.removeFromIndexes();
-         }
-      }
-   }
-
-   // replace this with SimpleSegmentWithTagsAnnotator if that code ever gets fixed
-   public static class SegmentsFromBracketedSectionTagsAnnotator extends 
JCasAnnotator_ImplBase {
-      private static Pattern SECTION_PATTERN = Pattern.compile(
-            "(\\[start section id=\"?(.*?)\"?\\]).*?(\\[end section id=\"?(.*?)\"?\\])",
-            Pattern.DOTALL );
-
-      @Override
-      public void process( JCas jCas ) throws AnalysisEngineProcessException {
-         boolean foundSections = false;
-         Matcher matcher = SECTION_PATTERN.matcher( jCas.getDocumentText() );
-         while ( matcher.find() ) {
-            Segment segment = new Segment( jCas );
-            segment.setBegin( matcher.start() + matcher.group( 1 ).length() );
-            segment.setEnd( matcher.end() - matcher.group( 3 ).length() );
-            segment.setId( matcher.group( 2 ) );
-            segment.addToIndexes();
-            foundSections = true;
-         }
-         if ( !foundSections ) {
-            Segment segment = new Segment( jCas );
-            segment.setBegin( 0 );
-            segment.setEnd( jCas.getDocumentText().length() );
-            segment.setId( "SIMPLE_SEGMENT" );
-            segment.addToIndexes();
-         }
-      }
-   }
-
-   /**
-    * Grabs the document time from the header
-    */
-   public static class PittHeaderAnnotator extends JCasAnnotator_ImplBase {
-
-     /**
-      * Grabs the document time from the header
-      * {@inheritDoc}
-      */
-     @Override
-     public void process( final JCas jcas ) throws 
AnalysisEngineProcessException {
-       String docText = jcas.getDocumentText();
-       int headerEnd = docText.indexOf("\n", docText.indexOf("[Report de-identified"));
-       Segment mainSegment = new Segment(jcas, headerEnd+1, 
docText.length()-1);
-       mainSegment.setId("SIMPLE_SEGMENT");
-       mainSegment.addToIndexes();
-     }
-   }
-
-   static File getXMIFile( File xmiDirectory, File textFile ) {
-          String fileName = textFile.getName();
-          if(!fileName.contains(".xmi")){
-                  fileName += ".xmi";
-          }
-      return new File( xmiDirectory, fileName);// + ".xmi" 
-   }
-
-   static File getXMIFile( File xmiDirectory, JCas jCas ) throws 
AnalysisEngineProcessException {
-      return getXMIFile( xmiDirectory, new File( ViewUriUtil.getURI( jCas 
).getPath() ) );
-   }
-
-   public static class XMIWriter extends JCasAnnotator_ImplBase {
-
-      public static final String PARAM_XMI_DIRECTORY = "XMIDirectory";
-
-      @ConfigurationParameter( name = PARAM_XMI_DIRECTORY, mandatory = true )
-      private File xmiDirectory;
-
-      @Override
-      public void initialize( UimaContext context ) throws 
ResourceInitializationException {
-         super.initialize( context );
-         if ( !this.xmiDirectory.exists() ) {
-            this.xmiDirectory.mkdirs();
-         }
-      }
-
-      @Override
-      public void process( JCas jCas ) throws AnalysisEngineProcessException {
-         File xmiFile = getXMIFile( this.xmiDirectory, jCas );
-         try {
-            FileOutputStream outputStream = new FileOutputStream( xmiFile );
-            try {
-               XmiCasSerializer serializer = new XmiCasSerializer( 
jCas.getTypeSystem() );
-               ContentHandler handler = new XMLSerializer( outputStream, false 
).getContentHandler();
-               serializer.serialize( jCas.getCas(), handler );
-            } finally {
-               outputStream.close();
-            }
-         } catch ( SAXException e ) {
-            throw new AnalysisEngineProcessException( e );
-         } catch ( IOException e ) {
-            throw new AnalysisEngineProcessException( e );
-         }
-      }
-   }
-
-   public static class XMIReader extends JCasAnnotator_ImplBase {
-
-      public static final String PARAM_XMI_DIRECTORY = "XMIDirectory";
-
-      @ConfigurationParameter( name = PARAM_XMI_DIRECTORY, mandatory = true )
-      private File xmiDirectory;
-
-      @Override
-      public void process( JCas jCas ) throws AnalysisEngineProcessException {
-         File xmiFile = getXMIFile( this.xmiDirectory, jCas );
-         try {
-            FileInputStream inputStream = new FileInputStream( xmiFile );
-            try {
-               XmiCasDeserializer.deserialize( inputStream, jCas.getCas() );
-            } finally {
-               inputStream.close();
-            }
-         } catch ( SAXException e ) {
-            throw new AnalysisEngineProcessException( e );
-         } catch ( IOException e ) {
-            throw new AnalysisEngineProcessException( e );
-         }
-      }
-   }
-
-   public static class TimexAnnotationCorrector extends JCasAnnotator_ImplBase 
{
-      @Override
-      public void process( JCas jCas ) throws AnalysisEngineProcessException {
-         JCas goldView, systemView;
-         try {
-            goldView = jCas.getView( GOLD_VIEW_NAME );
-            systemView = jCas.getView( CAS.NAME_DEFAULT_SOFA );
-         } catch ( CASException e ) {
-            e.printStackTrace();
-            throw new AnalysisEngineProcessException();
-         }
-         for ( TimeMention mention : JCasUtil.select( goldView, 
TimeMention.class ) ) {
-            // for each time expression, get the treebank node with the same span.
-            List<TreebankNode> nodes = JCasUtil.selectCovered( systemView, 
TreebankNode.class, mention );
-            TreebankNode sameSpanNode = null;
-            for ( TreebankNode node : nodes ) {
-               if ( node.getBegin() == mention.getBegin() && node.getEnd() == 
mention.getEnd() ) {
-                  sameSpanNode = node;
-                  break;
-               }
-            }
-            if ( sameSpanNode != null ) {
-               // look at node at the position of the timex3.
-               if ( sameSpanNode.getNodeType().equals( "PP" ) ) {
-                  // if it is a PP it should be moved down to the NP
-                  int numChildren = sameSpanNode.getChildren().size();
-                  if ( numChildren == 2 && sameSpanNode.getChildren( 0 
).getNodeType().equals( "IN" ) &&
-                       sameSpanNode.getChildren( 1 ).getNodeType().equals( 
"NP" ) ) {
-                     // move the time span to this node:
-                     TreebankNode mentionNode = sameSpanNode.getChildren( 
numChildren - 1 );
-                     mention.setBegin( mentionNode.getBegin() );
-                     mention.setEnd( mentionNode.getEnd() );
-                  }
-               }
-            } else {
-               // if there is no matching tree span, see if the DT to the left would help.
-               // now adjust for missing DT to the left
-               List<TerminalTreebankNode> precedingPreterms = JCasUtil
-                     .selectPreceding( systemView, TerminalTreebankNode.class, 
mention, 1 );
-               if ( precedingPreterms != null && precedingPreterms.size() == 1 
) {
-                  TerminalTreebankNode leftTerm = precedingPreterms.get( 0 );
-                  if ( leftTerm.getNodeType().equals( "DT" ) ) {
-                     // now see if adding this would make it match a tree
-                     List<TreebankNode> matchingNodes = JCasUtil
-                           .selectCovered( systemView, TreebankNode.class, 
leftTerm.getBegin(), mention.getEnd() );
-                     for ( TreebankNode node : matchingNodes ) {
-                        if ( node.getBegin() == leftTerm.getBegin() && 
node.getEnd() == mention.getEnd() ) {
-                           sameSpanNode = node;
-                           break;
-                        }
-                     }
-                     if ( sameSpanNode != null ) {
-                        // adding the DT to the left of th emention made it match a tree:
-                        System.err.println(
-                              "Adding DT: " + leftTerm.getCoveredText() + " to TIMEX: " + mention.getCoveredText() );
-                        mention.setBegin( leftTerm.getBegin() );
-                     }
-                  }
-               }
-            }
-         }
-      }
-   }
-
-
-   public static class CopyFromGold extends JCasAnnotator_ImplBase {
-
-      public static AnalysisEngineDescription getDescription( Class<?>... 
classes )
-            throws ResourceInitializationException {
-         return AnalysisEngineFactory.createEngineDescription(
-               CopyFromGold.class,
-               CopyFromGold.PARAM_ANNOTATION_CLASSES,
-               classes );
-      }
-
-      public static final String PARAM_ANNOTATION_CLASSES = "AnnotationClasses";
-
-      @ConfigurationParameter( name = PARAM_ANNOTATION_CLASSES, mandatory = 
true )
-      private Class<? extends TOP>[] annotationClasses;
-
-      @Override
-      public void process( JCas jCas ) throws AnalysisEngineProcessException {
-         JCas goldView, systemView;
-         try {
-            goldView = jCas.getView( GOLD_VIEW_NAME );
-            systemView = jCas.getView( CAS.NAME_DEFAULT_SOFA );
-         } catch ( CASException e ) {
-            throw new AnalysisEngineProcessException( e );
-         }
-         for ( Class<? extends TOP> annotationClass : this.annotationClasses ) 
{
-            for ( TOP annotation : Lists.newArrayList( JCasUtil.select( 
systemView, annotationClass ) ) ) {
-               if ( annotation.getClass().equals( annotationClass ) ) {
-                  annotation.removeFromIndexes();
-               }
-            }
-         }
-         CasCopier copier = new CasCopier( goldView.getCas(), 
systemView.getCas() );
-         Feature sofaFeature = jCas.getTypeSystem().getFeatureByFullName( 
CAS.FEATURE_FULL_NAME_SOFA );
-         for ( Class<? extends TOP> annotationClass : this.annotationClasses ) 
{
-            for ( TOP annotation : JCasUtil.select( goldView, annotationClass 
) ) {
-               TOP copy = (TOP)copier.copyFs( annotation );
-               if ( copy instanceof Annotation ) {
-                  copy.setFeatureValue( sofaFeature, systemView.getSofa() );
-               }
-               copy.addToIndexes( systemView );
-            }
-         }
-      }
-   }
-
-   public static class CopyFromSystem extends JCasAnnotator_ImplBase {
-
-      public static AnalysisEngineDescription getDescription( Class<?>... 
classes )
-            throws ResourceInitializationException {
-         return AnalysisEngineFactory.createEngineDescription(
-               CopyFromSystem.class,
-               CopyFromSystem.PARAM_ANNOTATION_CLASSES,
-               classes );
-      }
-
-      public static final String PARAM_ANNOTATION_CLASSES = "AnnotationClasses";
-
-      @ConfigurationParameter( name = PARAM_ANNOTATION_CLASSES, mandatory = 
true )
-      private Class<? extends TOP>[] annotationClasses;
-
-      @Override
-      public void process( JCas jCas ) throws AnalysisEngineProcessException {
-         JCas goldView, systemView;
-         try {
-            goldView = jCas.getView( GOLD_VIEW_NAME );
-            systemView = jCas.getView( CAS.NAME_DEFAULT_SOFA );
-         } catch ( CASException e ) {
-            throw new AnalysisEngineProcessException( e );
-         }
-         for ( Class<? extends TOP> annotationClass : this.annotationClasses ) 
{
-            for ( TOP annotation : Lists.newArrayList( JCasUtil.select( 
goldView, annotationClass ) ) ) {
-               if ( annotation.getClass().equals( annotationClass ) ) {
-                  annotation.removeFromIndexes();
-               }
-            }
-         }
-         CasCopier copier = new CasCopier( systemView.getCas(), 
goldView.getCas() );
-         Feature sofaFeature = jCas.getTypeSystem().getFeatureByFullName( 
CAS.FEATURE_FULL_NAME_SOFA );
-         for ( Class<? extends TOP> annotationClass : this.annotationClasses ) 
{
-            for ( TOP annotation : JCasUtil.select( systemView, 
annotationClass ) ) {
-               TOP copy = (TOP)copier.copyFs( annotation );
-               if ( copy instanceof Annotation ) {
-                  copy.setFeatureValue( sofaFeature, goldView.getSofa() );
-               }
-               copy.addToIndexes( goldView );
-            }
-         }
-      }
-   }
-
-   /*
-    * The following class overrides a ClearTK utility annotator class for reading
-    * a text file into a JCas. The code is copy/pasted so that one tiny modification
-    * can be made for this corpus -- replace a single odd character (0xc) with a
-    * space since it trips up xml output.
-    */
-   public static class UriToDocumentTextAnnotatorCtakes extends 
UriToDocumentTextAnnotator {
-
-      @Override
-      public void process( JCas jCas ) throws AnalysisEngineProcessException {
-         URI uri = ViewUriUtil.getURI( jCas );
-         String content;
-
-         try {
-            content = CharStreams.toString( new InputStreamReader( 
uri.toURL().openStream() ) );
-            content = content.replace( (char)0xc, ' ' );
-            jCas.setSofaDataString( content, "text/plain" );
-         } catch ( MalformedURLException e ) {
-            throw new AnalysisEngineProcessException( e );
-         } catch ( IOException e ) {
-            throw new AnalysisEngineProcessException( e );
-         }
-      }
-   }
-
-   public static class WriteI2B2XML extends JCasAnnotator_ImplBase {
-      public static final String PARAM_OUTPUT_DIR = "PARAM_OUTPUT_DIR";
-      @ConfigurationParameter( mandatory = true, description = "Output directory to write xml files to.", name = PARAM_OUTPUT_DIR )
-      protected String outputDir;
-
-      @Override
-      public void process( JCas jcas ) throws AnalysisEngineProcessException {
-         try {
-            // get the output file name from the input file name and output directory.
-            File outDir = new File( outputDir );
-            if ( !outDir.exists() ) {
-               outDir.mkdirs();
-            }
-            File inFile = new File( ViewUriUtil.getURI( jcas ) );
-            String outFile = inFile.getName().replace( ".txt", "" );
-
-            // build the xml
-            DocumentBuilderFactory docFactory = 
DocumentBuilderFactory.newInstance();
-            DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
-            Document doc = docBuilder.newDocument();
-            Element rootElement = doc.createElement( "ClinicalNarrativeTemporalAnnotation" );
-            Element textElement = doc.createElement( "TEXT" );
-            Element tagsElement = doc.createElement( "TAGS" );
-            textElement.setTextContent( jcas.getDocumentText() );
-            rootElement.appendChild( textElement );
-            rootElement.appendChild( tagsElement );
-            doc.appendChild( rootElement );
-
-            Map<IdentifiedAnnotation, String> argToId = new HashMap<>();
-            int id = 0;
-            for ( TimeMention timex : JCasUtil.select( jcas, TimeMention.class 
) ) {
-               Element timexElement = doc.createElement( "TIMEX3" );
-               String timexID = "T" + id;
-               id++;
-               argToId.put( timex, timexID );
-               timexElement.setAttribute( "id", timexID );
-               timexElement.setAttribute( "start", String.valueOf( 
timex.getBegin() + 1 ) );
-               timexElement.setAttribute( "end", String.valueOf( 
timex.getEnd() + 1 ) );
-               timexElement.setAttribute( "text", timex.getCoveredText() );
-               timexElement.setAttribute( "type", "NA" );
-               timexElement.setAttribute( "val", "NA" );
-               timexElement.setAttribute( "mod", "NA" );
-               tagsElement.appendChild( timexElement );
-            }
-
-            id = 0;
-            for ( EventMention event : JCasUtil.select( jcas, 
EventMention.class ) ) {
-               if ( event.getClass().equals( EventMention.class ) ) {
-                  // this ensures we are only looking at THYME events and not ctakes-dictionary-lookup events
-                  Element eventEl = doc.createElement( "EVENT" );
-                  String eventID = "E" + id;
-                  id++;
-                  argToId.put( event, eventID );
-                  eventEl.setAttribute( "id", eventID );
-                  eventEl.setAttribute( "start", String.valueOf( 
event.getBegin() + 1 ) );
-                  eventEl.setAttribute( "end", String.valueOf( event.getEnd() 
+ 1 ) );
-                  eventEl.setAttribute( "text", event.getCoveredText() );
-                  eventEl.setAttribute( "modality", "NA" );
-                  eventEl.setAttribute( "polarity", "NA" );
-                  eventEl.setAttribute( "type", "NA" );
-                  tagsElement.appendChild( eventEl );
-               }
-            }
-
-            id = 0;
-            for ( TemporalTextRelation rel : JCasUtil.select( jcas, 
TemporalTextRelation.class ) ) {
-               Element linkEl = doc.createElement( "TLINK" );
-               String linkID = "TL" + id;
-               id++;
-               linkEl.setAttribute( "id", linkID );
-               Annotation arg1 = rel.getArg1().getArgument();
-               linkEl.setAttribute( "fromID", argToId.get( arg1 ) );
-               linkEl.setAttribute( "fromText", arg1.getCoveredText() );
-               Annotation arg2 = rel.getArg2().getArgument();
-               if ( arg2 != null ) {
-                  linkEl.setAttribute( "toID", argToId.get( arg2 ) );
-                  linkEl.setAttribute( "toText", arg2.getCoveredText() );
-               } else {
-                  linkEl.setAttribute( "toID", "Discharge" );
-                  linkEl.setAttribute( "toText", "Discharge" );
-               }
-               linkEl.setAttribute( "type", rel.getCategory() );
-               tagsElement.appendChild( linkEl );
-            }
-
-            // boilerplate xml-writing code:
-            TransformerFactory transformerFactory = 
TransformerFactory.newInstance();
-            Transformer transformer = transformerFactory.newTransformer();
-            transformer.setOutputProperty( OutputKeys.INDENT, "yes" );
-            transformer.setOutputProperty( OutputKeys.METHOD, "xml" );
-            DOMSource source = new DOMSource( doc );
-            StreamResult result = new StreamResult( new File( outputDir, 
outFile ) );
-            transformer.transform( source, result );
-         } catch ( ParserConfigurationException e ) {
-            e.printStackTrace();
-            throw new AnalysisEngineProcessException( e );
-         } catch ( TransformerConfigurationException e ) {
-            e.printStackTrace();
-            throw new AnalysisEngineProcessException( e );
-         } catch ( TransformerException e ) {
-            e.printStackTrace();
-            throw new AnalysisEngineProcessException( e );
-         }
-
-      }
-
-   }
-
-   public static class WriteAnaforaXML extends JCasAnnotator_ImplBase {
-      public static final String PARAM_OUTPUT_DIR = "PARAM_OUTPUT_DIR";
-      @ConfigurationParameter( mandatory = true, description = "Output directory to write xml files to.", name = PARAM_OUTPUT_DIR )
-      protected String outputDir;
-      
-      public static final String PARAM_PROB_VIEW = "ProbView";
-      @ConfigurationParameter(name=PARAM_PROB_VIEW, mandatory=false)
-      public String probViewname = null;
-
-      @Override
-      public void process( JCas jcas ) throws AnalysisEngineProcessException {
-         try {
-            // get the output file name from the input file name and output directory.
-
-            File inFile = new File( ViewUriUtil.getURI( jcas ) );
-            String outFile = inFile.getName().replace( ".txt", "" );
-            File outDir = new File( outputDir, outFile );
-            if ( !outDir.exists() ) {
-               outDir.mkdirs();
-            }
-
-            
-            // get maps from ids to entities and relations:
-            JCas probView = (probViewname == null ? null : 
jcas.getView(probViewname));
-            Map<Integer, List<EventMention>> mentions = probViewname == null? 
null : getMentionIdMap(jcas, probView);
-            Map<String, List<TemporalTextRelation>> rels = probViewname == 
null ? null : getRelationIdMap(probView);
-            
-            // build the xml
-            DocumentBuilderFactory docFactory = 
DocumentBuilderFactory.newInstance();
-            DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
-            Document doc = docBuilder.newDocument();
-
-            Element rootElement = doc.createElement( "data" );
-
-            //info element
-            Element infoElement = doc.createElement( "info" );
-            Element saveTime = doc.createElement( "savetime" );
-            saveTime.setTextContent( "2015-0123-10:21" );
-            Element progress = doc.createElement( "progress" );
-            progress.setTextContent( "completed" );
-            infoElement.appendChild( saveTime );
-            infoElement.appendChild( progress );
-
-            //schema element
-            Element schema = doc.createElement( "schema" );
-            schema.setAttribute( "path", "./" );
-            schema.setAttribute( "protocol", "file" );
-            schema.setTextContent( "temporal-schema.xml" );
-
-            Element annoElement = doc.createElement( "annotations" );
-            Map<IdentifiedAnnotation, String> argToId = new HashMap<>();
-            int id = 1;
-            for ( EventMention event : JCasUtil.select( jcas, 
EventMention.class ) ) {
-               if ( event.getClass().equals( EventMention.class ) ) {
-                  // this ensures we are only looking at THYME events and not ctakes-dictionary-lookup events
-                  Element eventEl = doc.createElement( "entity" );
-                  String eventID = id + "@e@" + outFile + "@system";
-                  id++;
-                  argToId.put( event, eventID );
-                  Element idE = doc.createElement( "id" );
-                  idE.setTextContent( eventID );
-                  Element spanE = doc.createElement( "span" );
-                  spanE.setTextContent( String.valueOf( event.getBegin() ) + 
"," + String.valueOf( event.getEnd() ) );
-                  Element typeE = doc.createElement( "type" );
-                  typeE.setTextContent( "EVENT" );
-                  Element parentTE = doc.createElement( "parentsType" );
-                  parentTE.setTextContent( "TemporalEntities" );
-                  //add properties
-                  Element property = doc.createElement( "properties" );
-                  Element docTimeRE = doc.createElement( "DocTimeRel" );
-                  String dtrContent = null;
-                  if(probViewname == null){
-                    dtrContent = 
event.getEvent().getProperties().getDocTimeRel();
-                  }else{
-                    StringBuffer buff = new StringBuffer();
-                    for(EventMention probMention : 
mentions.get(event.getId())){
-                      
buff.append(probMention.getEvent().getProperties().getDocTimeRel());
-                      buff.append(':');
-                      buff.append(probMention.getConfidence());
-                      buff.append("::");
-                    }
-                    dtrContent = buff.substring(0, buff.length()-2);
-                  }
-                  docTimeRE.setTextContent( dtrContent );
-                  Element eventTypeE = doc.createElement( "Type" );
-                  eventTypeE.setTextContent( "N/A" );
-                  Element degreeE = doc.createElement( "Degree" );
-                  degreeE.setTextContent( "N/A" );
-                  Element polarityE = doc.createElement( "Polarity" );
-                  String polarity = "UNKNOWN";
-                  int polarityInt = event.getPolarity();
-                  if ( polarityInt == CONST.NE_POLARITY_NEGATION_ABSENT ) {
-                     polarity = "POS";
-                  } else if ( polarityInt == 
CONST.NE_POLARITY_NEGATION_PRESENT ) {
-                     polarity = "NEG";
-                  }
-                  polarityE.setTextContent( polarity );
-                  Element ctexModE = doc.createElement( "ContextualModality" );
-                  ctexModE.setTextContent( 
event.getEvent().getProperties().getContextualModality() );
-                  Element ctexAspE = doc.createElement( "ContextualAspect" );
-                  ctexAspE.setTextContent( 
event.getEvent().getProperties().getContextualAspect() );
-                  Element permE = doc.createElement( "Permanence" );
-                  permE.setTextContent( "UNDETERMINED" );
-                  property.appendChild( docTimeRE );
-                  property.appendChild( polarityE );
-                  property.appendChild( degreeE );
-                  property.appendChild( eventTypeE );
-                  property.appendChild( ctexModE );
-                  property.appendChild( ctexAspE );
-                  property.appendChild( permE );
-                  eventEl.appendChild( idE );
-                  eventEl.appendChild( spanE );
-                  eventEl.appendChild( typeE );
-                  eventEl.appendChild( parentTE );
-                  eventEl.appendChild( property );
-                  annoElement.appendChild( eventEl );
-               }
-            }
-            for ( TimeMention timex : JCasUtil.select( jcas, TimeMention.class 
) ) {
-               Element timexElement = doc.createElement( "entity" );
-               String timexID = id + "@e@" + outFile + "@system";
-               id++;//18@e@ID006_clinic_016@gold
-               argToId.put( timex, timexID );
-               Element idE = doc.createElement( "id" );
-               idE.setTextContent( timexID );
-               Element spanE = doc.createElement( "span" );
-               spanE.setTextContent( String.valueOf( timex.getBegin() ) + "," 
+ String.valueOf( timex.getEnd() ) );
-               Element typeE = doc.createElement( "type" );
-               Element parentTE = doc.createElement( "parentsType" );
-               parentTE.setTextContent( "TemporalEntities" );
-               //add properties
-               Element property = doc.createElement( "properties" );
-               String timeClass = timex.getTimeClass();
-               
-               //add normalized timex
-               String value = Utils.getTimexMLValue(timex.getCoveredText());
-               if(value != null){
-                  property.setTextContent( value );
-               }
-               
-               if ( timeClass!=null && (timeClass.equals( "DOCTIME" ) || 
timeClass.equals( "SECTIONTIME" ) ) ) {
-                  typeE.setTextContent( timeClass );
-                  property.setTextContent( "" );
-               } else {
-                  typeE.setTextContent( "TIMEX3" );
-                  Element classE = doc.createElement( "Class" );
-                  classE.setTextContent( timeClass );
-                  property.appendChild( classE );
-               }
-
-               timexElement.appendChild( idE );
-               timexElement.appendChild( spanE );
-               timexElement.appendChild( typeE );
-               timexElement.appendChild( property );
-               annoElement.appendChild( timexElement );
-            }
-
-
-            id = 1;
-            if(probViewname == null){
-              for ( TemporalTextRelation rel : JCasUtil.select( jcas, 
TemporalTextRelation.class ) ) {
-                Annotation arg1 = rel.getArg1().getArgument();
-                Annotation arg2 = rel.getArg2().getArgument();
-                String arg1Content = argToId.get( arg1 );
-                String arg2Content = argToId.get( arg2 );
-                String relContent = rel.getCategory();
-                annoElement.appendChild(addRelationElement(doc, id, 
relContent, arg1Content, arg2Content, outFile));
-                id++;
-              }
-            }else{
-              // need to keep track of which relations we've printed since 
they don't get grouped in the CAS
-              for(String key : rels.keySet()){
-                String arg1Content = null;
-                String arg2Content = null;
-                StringBuffer buff = new StringBuffer();
-                for(TemporalTextRelation probRel : rels.get(key)){
-                  buff.append(probRel.getCategory());
-                  buff.append(':');
-                  buff.append(probRel.getConfidence());
-                  buff.append("::");
-                  if(arg1Content == null){
-                    arg1Content = argToId.get(probRel.getArg1().getArgument());
-                    arg2Content = argToId.get(probRel.getArg2().getArgument());
-                  }
-                }
-                String relContent =  buff.substring(0, buff.length()-2);
-                annoElement.appendChild(addRelationElement(doc, id, 
relContent, arg1Content, arg2Content, outFile));
-                id++;
-              }
-            }
-            
-            rootElement.appendChild( infoElement );
-            rootElement.appendChild( schema );
-            rootElement.appendChild( annoElement );
-            doc.appendChild( rootElement );
-
-            // boilerplate xml-writing code:
-            TransformerFactory transformerFactory = 
TransformerFactory.newInstance();
-            Transformer transformer = transformerFactory.newTransformer();
-            transformer.setOutputProperty( OutputKeys.INDENT, "yes" );
-            transformer.setOutputProperty( OutputKeys.METHOD, "xml" );
-            DOMSource source = new DOMSource( doc );
-            StreamResult result = new StreamResult( new File( outDir, outFile 
+ ".xml" ) );
-            transformer.transform( source, result );
-         } catch ( ParserConfigurationException e ) {
-           e.printStackTrace();
-           throw new AnalysisEngineProcessException( e );
-         } catch ( TransformerConfigurationException e ) {
-           e.printStackTrace();
-           throw new AnalysisEngineProcessException( e );
-         } catch ( TransformerException e ) {
-           e.printStackTrace();
-           throw new AnalysisEngineProcessException( e );
-         } catch (CASException e) {
-           e.printStackTrace();
-           throw new AnalysisEngineProcessException( e );
-        }
-
-      }
-      
-      private static Element addRelationElement(Document doc, int id,  String 
relContent, String arg1Content, String arg2Content, String outFile){
-        Element linkEl = doc.createElement( "relation" );
-        String linkID = id + "@r@" + outFile + "@system";
-
-        Element idE = doc.createElement( "id" );
-        idE.setTextContent( linkID );
-        Element typeE = doc.createElement( "type" );
-        typeE.setTextContent( "TLINK" );
-        Element parentTE = doc.createElement( "parentsType" );
-        parentTE.setTextContent( "TemporalRelations" );
-        //add properties
-        Element property = doc.createElement( "properties" );
-
-        Element sourceE = doc.createElement( "Source" );
-        sourceE.setTextContent( arg1Content );
-        Element relTypeE = doc.createElement( "Type" );
-
-        relTypeE.setTextContent( relContent );
-        Element targetE = doc.createElement( "Target" );
-        targetE.setTextContent( arg2Content );
-
-        property.appendChild( sourceE );
-        property.appendChild( relTypeE );
-        property.appendChild( targetE );
-
-        linkEl.appendChild( idE );
-        linkEl.appendChild( typeE );
-        linkEl.appendChild( parentTE );
-        linkEl.appendChild( property );
-        return linkEl;        
-      }
-      
-      private static Map<Integer, List<EventMention>> getMentionIdMap(JCas 
jcas, JCas probView){
-        HashMap<Integer, List<EventMention>> map = new HashMap<>();
-        
-        for(EventMention mention : JCasUtil.select(jcas, EventMention.class)){
-          List<EventMention> variations = new ArrayList<>();
-          for(EventMention probMention : JCasUtil.select(probView, 
EventMention.class)){
-            if(mention.getId() == probMention.getId()){
-              variations.add(probMention);
-            }
-          }
-          map.put(mention.getId(), variations);
-        }
-        return map;
-      }
-      
-      private static Map<String, List<TemporalTextRelation>> 
getRelationIdMap(JCas probView){
-        HashMap<String, List<TemporalTextRelation>> map = new HashMap<>();
-        
-        for(TemporalTextRelation probRel : JCasUtil.select(probView, 
TemporalTextRelation.class)){
-          String idStr = getRelationId(probRel);
-          if(!map.containsKey(idStr)){
-            map.put(idStr, new ArrayList<TemporalTextRelation>());
-          }
-          List<TemporalTextRelation> variations = map.get(idStr);
-          variations.add(probRel);          
-        }
-        
-        return map;
-      }
-   }
-   public static String getRelationId(TemporalTextRelation rel){
-     StringBuffer buffer = new StringBuffer();
-     
if(rel.getArg1().getArgument().getClass().getSimpleName().equals("EventMention")){
-       buffer.append('e');
-     }else{
-       buffer.append('t');
-     }
-     
buffer.append(((IdentifiedAnnotation)rel.getArg1().getArgument()).getId());
-     buffer.append(':');
-     
if(rel.getArg2().getArgument().getClass().getSimpleName().equals("EventMention")){
-       buffer.append('e');
-     }else{
-       buffer.append('t');
-     }
-     
buffer.append(((IdentifiedAnnotation)rel.getArg2().getArgument()).getId());
-     return buffer.toString();     
-   }
+                */
+               return UriCollectionReader.getCollectionReaderFromFiles( 
collectedFiles );
+       }
+
+       /**
+        * Picks the preprocessing pipeline: the XMI-reading pipeline when
+        * {@code xmiExists} is set, the XMI-writing pipeline otherwise.
+        *
+        * @return the builder for the selected preprocessing pipeline
+        * @throws Exception if the selected pipeline cannot be assembled
+        */
+       protected AggregateBuilder getPreprocessorAggregateBuilder() throws Exception {
+               if ( this.xmiExists ) {
+                       return this.getXMIReadingPreprocessorAggregateBuilder();
+               }
+               return this.getXMIWritingPreprocessorAggregateBuilder();
+       }
+
+       /**
+        * Builds the pipeline used when XMI files are already available: it adds
+        * {@code UriToDocumentTextAnnotator} followed by an {@link XMIReader}
+        * configured with {@code this.xmiDirectory}.
+        *
+        * @return a builder holding those two engines, in that order
+        * @throws UIMAException if an engine description cannot be created
+        */
+       protected AggregateBuilder getXMIReadingPreprocessorAggregateBuilder() throws UIMAException {
+               AggregateBuilder builder = new AggregateBuilder();
+               builder.add( UriToDocumentTextAnnotator.getDescription() );
+               builder.add( AnalysisEngineFactory.createEngineDescription( XMIReader.class, XMIReader.PARAM_XMI_DIRECTORY, this.xmiDirectory ) );
+               return builder;
+       }
+
+       /**
+        * Assembles the full preprocessing pipeline and finishes by writing each CAS
+        * to an XMI file in {@code this.xmiDirectory}.
+        * Engines run in the order they are added below: document text, gold view
+        * creation and gold annotation reading (per {@code this.xmlFormat}), segments,
+        * sentences, tokens, part-of-speech tags, chunks, lookup windows, dictionary
+        * lookup, LVG, dependency parsing, semantic role labeling, and finally either
+        * the treebank reader or the constituency parser.
+        *
+        * @return the builder holding the engines in execution order
+        * @throws Exception if any engine description cannot be created
+        */
+       protected AggregateBuilder getXMIWritingPreprocessorAggregateBuilder()
+                       throws Exception {
+               AggregateBuilder aggregateBuilder = new AggregateBuilder();
+               aggregateBuilder.add( 
AnalysisEngineFactory.createEngineDescription( 
UriToDocumentTextAnnotatorCtakes.class ) );
+
+               // read manual annotations into gold view
+               aggregateBuilder.add( 
AnalysisEngineFactory.createEngineDescription(
+                               ViewCreatorAnnotator.class,
+                               ViewCreatorAnnotator.PARAM_VIEW_NAME,
+                               GOLD_VIEW_NAME ) );
+               aggregateBuilder.add( 
AnalysisEngineFactory.createEngineDescription(
+                               ViewTextCopierAnnotator.class,
+                               ViewTextCopierAnnotator.PARAM_SOURCE_VIEW_NAME,
+                               CAS.NAME_DEFAULT_SOFA,
+                               
ViewTextCopierAnnotator.PARAM_DESTINATION_VIEW_NAME,
+                               GOLD_VIEW_NAME ) );
+               // one gold-annotation reader per supported XML format; each is added with the
+               // (default sofa, gold view) name pair so that it operates on the gold view
+               switch ( this.xmlFormat ) {
+               case Anafora:
+                       // the DeepPhe subcorpus configures the Anafora reader with an empty suffix list
+                       if(this.subcorpus == Subcorpus.DeepPhe){
+                               aggregateBuilder.add(
+                                               
AnalysisEngineFactory.createEngineDescription(THYMEAnaforaXMLReader.class,
+                                                               
THYMEAnaforaXMLReader.PARAM_ANAFORA_DIRECTORY,
+                                                               
this.xmlDirectory,
+                                                               
THYMEAnaforaXMLReader.PARAM_ANAFORA_XML_SUFFIXES,
+                                                               new String[]{} 
),
+                                                               
CAS.NAME_DEFAULT_SOFA,
+                                                               GOLD_VIEW_NAME 
);
+                       }else{
+                               aggregateBuilder.add(
+                                               
THYMEAnaforaXMLReader.getDescription( this.xmlDirectory ),
+                                               CAS.NAME_DEFAULT_SOFA,
+                                               GOLD_VIEW_NAME );
+                       }
+                       break;
+               case Knowtator:
+                       aggregateBuilder.add(
+                                       THYMEKnowtatorXMLReader.getDescription( 
this.xmlDirectory ),
+                                       CAS.NAME_DEFAULT_SOFA,
+                                       GOLD_VIEW_NAME );
+                       break;
+               case I2B2:
+                       aggregateBuilder.add(
+                                       I2B2TemporalXMLReader.getDescription( 
this.xmlDirectory ),
+                                       CAS.NAME_DEFAULT_SOFA,
+                                       GOLD_VIEW_NAME );
+                       break;
+               }
+
+               // identify segments
+               if(this.subcorpus == Subcorpus.DeepPhe){
+                       
aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(PittHeaderAnnotator.class));
+               }else{
+                       aggregateBuilder
+                       .add( AnalysisEngineFactory.createEngineDescription( 
SegmentsFromBracketedSectionTagsAnnotator.class ) );
+               }
+               // identify sentences
+               aggregateBuilder.add( 
AnalysisEngineFactory.createEngineDescription(
+                               SentenceDetector.class,
+                               SentenceDetector.SD_MODEL_FILE_PARAM,
+                               
"org/apache/ctakes/core/sentdetect/sd-med-model.zip" ) );
+               //      
aggregateBuilder.add(SentenceDetectorAnnotator.getDescription(FileLocator.locateFile("org/apache/ctakes/core/sentdetect/model.jar").getPath()));
+
+               // identify tokens
+               aggregateBuilder.add( 
AnalysisEngineFactory.createEngineDescription( TokenizerAnnotatorPTB.class ) );
+               // merge some tokens
+               aggregateBuilder.add( 
AnalysisEngineFactory.createEngineDescription( 
ContextDependentTokenizerAnnotator.class ) );
+
+               // identify part-of-speech tags
+               aggregateBuilder.add( 
AnalysisEngineFactory.createEngineDescription(
+                               POSTagger.class,
+                               
TypeSystemDescriptionFactory.createTypeSystemDescription(),
+                               TypePrioritiesFactory.createTypePriorities( 
Segment.class, Sentence.class, BaseToken.class ),
+                               POSTagger.POS_MODEL_FILE_PARAM,
+                               
"org/apache/ctakes/postagger/models/mayo-pos.zip" ) );
+
+               // identify chunks
+               aggregateBuilder.add( 
AnalysisEngineFactory.createEngineDescription(
+                               Chunker.class,
+                               Chunker.CHUNKER_MODEL_FILE_PARAM,
+                               FileLocator.locateFile( 
"org/apache/ctakes/chunker/models/chunker-model.zip" ),
+                               Chunker.CHUNKER_CREATOR_CLASS_PARAM,
+                               DefaultChunkCreator.class ) );
+
+               // identify UMLS named entities (the steps from here through the term annotator)
+
+               // adjust NP in NP NP to span both
+               aggregateBuilder.add( 
AnalysisEngineFactory.createEngineDescription(
+                               ChunkAdjuster.class,
+                               ChunkAdjuster.PARAM_CHUNK_PATTERN,
+                               new String[] { "NP", "NP" },
+                               ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
+                               1 ) );
+               // adjust NP in NP PP NP to span all three
+               aggregateBuilder.add( 
AnalysisEngineFactory.createEngineDescription(
+                               ChunkAdjuster.class,
+                               ChunkAdjuster.PARAM_CHUNK_PATTERN,
+                               new String[] { "NP", "PP", "NP" },
+                               ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
+                               2 ) );
+               // add lookup windows for each NP
+               aggregateBuilder
+               .add( AnalysisEngineFactory.createEngineDescription( 
CopyNPChunksToLookupWindowAnnotations.class ) );
+               // maximize lookup windows
+               aggregateBuilder.add( 
AnalysisEngineFactory.createEngineDescription(
+                               OverlapAnnotator.class,
+                               "A_ObjectClass",
+                               LookupWindowAnnotation.class,
+                               "B_ObjectClass",
+                               LookupWindowAnnotation.class,
+                               "OverlapType",
+                               "A_ENV_B",
+                               "ActionType",
+                               "DELETE",
+                               "DeleteAction",
+                               new String[] { "selector=B" } ) );
+               // add UMLS on top of lookup windows
+               aggregateBuilder.add( 
DefaultJCasTermAnnotator.createAnnotatorDescription() );
+
+               aggregateBuilder.add( LvgAnnotator.createAnnotatorDescription() 
);
+
+               // add dependency parser
+               aggregateBuilder.add( 
ClearNLPDependencyParserAE.createAnnotatorDescription() );
+
+               // add semantic role labeler
+               aggregateBuilder.add( 
AnalysisEngineFactory.createEngineDescription( 
ClearNLPSemanticRoleLabelerAE.class ) );
+
+               // add gold standard parses to gold view, and adjust gold view 
to correct a few annotation mis-steps
+               if ( this.treebankDirectory != null ) {
+                       aggregateBuilder.add( 
THYMETreebankReader.getDescription( this.treebankDirectory ) );
+                       aggregateBuilder.add( 
AnalysisEngineFactory.createEngineDescription( TimexAnnotationCorrector.class ) 
);
+               } else {
+                       // add ctakes constituency parses to system view
+                       aggregateBuilder.add( 
AnalysisEngineFactory.createEngineDescription( ConstituencyParser.class,
+                                       ConstituencyParser.PARAM_MODEL_FILENAME,
+                                       
"org/apache/ctakes/constituency/parser/models/thyme.bin" ) );
+               }
+               // write out the CAS after all the above annotations
+               aggregateBuilder.add( 
AnalysisEngineFactory.createEngineDescription(
+                               XMIWriter.class,
+                               XMIWriter.PARAM_XMI_DIRECTORY,
+                               this.xmiDirectory ) );
+
+               return aggregateBuilder;
+       }
+
+       /**
+        * Selects the annotations covered by a segment whose runtime class is exactly
+        * {@code annotationClass}; instances of any other class are left out.
+        *
+        * @param jCas            the CAS view to search
+        * @param annotationClass the exact class every returned annotation must have
+        * @param segment         the segment that must cover the annotations
+        * @return a new list with the matching annotations, in the order
+        *         {@code JCasUtil.selectCovered} returns them
+        */
+       public static <T extends Annotation> List<T> selectExact( JCas jCas, Class<T> annotationClass, Segment segment ) {
+               List<T> exactMatches = Lists.newArrayList();
+               for ( T candidate : JCasUtil.selectCovered( jCas, annotationClass, segment ) ) {
+                       if ( !candidate.getClass().equals( annotationClass ) ) {
+                               continue;
+                       }
+                       exactMatches.add( candidate );
+               }
+               return exactMatches;
+       }
+
+       /**
+        * Adds a {@link LookupWindowAnnotation} over the span of every {@link Chunk}
+        * whose chunk type is "NP".
+        */
+       public static class CopyNPChunksToLookupWindowAnnotations extends JCasAnnotator_ImplBase {
+
+               /**
+                * Creates and indexes one lookup window per "NP" chunk in the given CAS.
+                *
+                * @param jCas the CAS whose chunks are inspected and to which the windows are added
+                */
+               @Override
+               public void process( JCas jCas ) throws AnalysisEngineProcessException {
+                       for ( Chunk chunk : JCasUtil.select( jCas, Chunk.class ) ) {
+                               // constant-first comparison: a chunk with no type set is skipped instead of throwing
+                               if ( "NP".equals( chunk.getChunkType() ) ) {
+                                       new LookupWindowAnnotation( jCas, chunk.getBegin(), chunk.getEnd() ).addToIndexes();
+                               }
+                       }
+               }
+       }
+
+       public static class RemoveEnclosedLookupWindows extends 
JCasAnnotator_ImplBase {
+
+               @Override
+               public void process( JCas jCas ) throws 
AnalysisEngineProcessException {
+                       List<LookupWindowAnnotation> lws = new ArrayList<>( 
JCasUtil.select( jCas, LookupWindowAnnotation.class ) );
+                       // we'll navigate backwards so that as we delete things 
we shorten the list from the back
+                       for ( int i = lws.size() - 2; i >= 0; i-- ) {
+                               LookupWindowAnnotation lw1 = lws.get( i );
+                               LookupWindowAnnotation lw2 = lws.get( i + 1 );
+                               if ( lw1.getBegin() <= lw2.getBegin() && 
lw1.getEnd() >= lw2.getEnd() ) {
+                                       /// lw1 envelops or encloses lw2
+                                       lws.remove( i + 1 );
+                                       lw2.removeFromIndexes();
+                               }
+                       }
+
+               }
+
+       }
+
+       /**
+        * Removes from the indexes every annotation that {@code JCasUtil.select}
+        * returns for {@link EntityMention}.
+        */
+       public static class EntityMentionRemover extends JCasAnnotator_ImplBase {
+
+               @Override
+               public void process( JCas jCas ) throws AnalysisEngineProcessException {
+                       // copy first so that removal does not interfere with the selection being iterated
+                       List<EntityMention> snapshot = Lists.newArrayList( JCasUtil.select( jCas, EntityMention.class ) );
+                       for ( EntityMention entityMention : snapshot ) {
+                               entityMention.removeFromIndexes();
+                       }
+               }
+       }
+
+       /**
+        * Removes from the indexes every annotation that {@code JCasUtil.select}
+        * returns for {@link EventMention}.
+        */
+       public static class EventMentionRemover extends JCasAnnotator_ImplBase {
+
+               @Override
+               public void process( JCas jCas ) throws AnalysisEngineProcessException {
+                       // copy first so that removal does not interfere with the selection being iterated
+                       List<EventMention> snapshot = Lists.newArrayList( JCasUtil.select( jCas, EventMention.class ) );
+                       for ( EventMention eventMention : snapshot ) {
+                               eventMention.removeFromIndexes();
+                       }
+               }
+       }
+
+       // replace this with SimpleSegmentWithTagsAnnotator if that code ever gets fixed
+       /**
+        * Creates one {@link Segment} for each "[start section id=...]" ... "[end section id=...]"
+        * pair found in the document text. The segment spans the text between the two tags and
+        * takes its id from the start tag. When no pair is found, a single segment with id
+        * "SIMPLE_SEGMENT" covering the whole document text is created instead.
+        */
+       public static class SegmentsFromBracketedSectionTagsAnnotator extends JCasAnnotator_ImplBase {
+               // group 1 = start tag, group 2 = id in the start tag, group 3 = end tag, group 4 = id in the end tag;
+               // DOTALL lets a section body span several lines
+               private static final Pattern SECTION_PATTERN = Pattern.compile(
+                               "(\\[start section id=\"?(.*?)\"?\\]).*?(\\[end section id=\"?(.*?)\"?\\])",
+                               Pattern.DOTALL );
+
+               @Override
+               public void process( JCas jCas ) throws AnalysisEngineProcessException {
+                       String documentText = jCas.getDocumentText();
+                       Matcher matcher = SECTION_PATTERN.matcher( documentText );
+                       boolean foundSections = false;
+                       while ( matcher.find() ) {
+                               Segment segment = new Segment( jCas );
+                               // the start tag opens the match and the end tag closes it, so the section
+                               // body runs from the end of group 1 to the start of group 3
+                               segment.setBegin( matcher.end( 1 ) );
+                               segment.setEnd( matcher.start( 3 ) );
+                               segment.setId( matcher.group( 2 ) );
+                               segment.addToIndexes();
+                               foundSections = true;
+                       }
+                       if ( !foundSections ) {
+                               // no section tags at all: treat the whole document as one segment
+                               Segment segment = new Segment( jCas );
+                               segment.setBegin( 0 );
+                               segment.setEnd( documentText.length() );
+                               segment.setId( "SIMPLE_SEGMENT" );
+                               segment.addToIndexes();
+                       }
+               }
+       }
+
+       /**
+        * Creates a single segment for the text that follows the report header line.
+        */
+       public static class PittHeaderAnnotator extends JCasAnnotator_ImplBase {
+
+               /**
+                * Adds one {@link Segment} with id "SIMPLE_SEGMENT" that begins right after the first
+                * newline found at or after the "[Report de-identified" marker.
+                * NOTE(review): the segment ends at {@code length() - 1}, which leaves the last
+                * character of the document outside the segment -- confirm this is intended.
+                * {@inheritDoc}
+                */
+               @Override
+               public void process( final JCas jcas ) throws AnalysisEngineProcessException {
+                       final String text = jcas.getDocumentText();
+                       final int markerStart = text.indexOf("[Report de-identified");
+                       final int headerEnd = text.indexOf("\n", markerStart);
+                       final Segment mainSegment = new Segment(jcas, headerEnd+1, text.length()-1);
+                       mainSegment.setId("SIMPLE_SEGMENT");
+                       mainSegment.addToIndexes();
+               }
+       }
+
+       /**
+        * Maps a text file to its XMI file inside {@code xmiDirectory}.
+        * The XMI file uses the text file's name, with ".xmi" appended unless the
+        * name already ends with that extension.
+        *
+        * @param xmiDirectory directory that holds the XMI files
+        * @param textFile     file whose name determines the XMI file name
+        * @return the XMI file location; nothing is created or checked on disk
+        */
+       static File getXMIFile( File xmiDirectory, File textFile ) {
+               String fileName = textFile.getName();
+               // test the suffix rather than containment: "report.xmi.txt" still needs the extension
+               if ( !fileName.endsWith( ".xmi" ) ) {
+                       fileName += ".xmi";
+               }
+               return new File( xmiDirectory, fileName );
+       }
+
+       /**
+        * Resolves the XMI file for a CAS: takes the path of the URI that
+        * {@code ViewUriUtil.getURI} reports for the CAS and delegates to
+        * {@link #getXMIFile(File, File)}.
+        *
+        * @param xmiDirectory directory that holds the XMI files
+        * @param jCas         CAS whose URI identifies the source document
+        * @return the XMI file location for that document
+        */
+       static File getXMIFile( File xmiDirectory, JCas jCas ) throws AnalysisEngineProcessException {
+               String uriPath = ViewUriUtil.getURI( jCas ).getPath();
+               File sourceFile = new File( uriPath );
+               return getXMIFile( xmiDirectory, sourceFile );
+       }
+
+       /**
+        * Serializes each processed CAS to an XMI file inside the configured directory.
+        * The file name is derived from the CAS URI via {@code getXMIFile}.
+        */
+       public static class XMIWriter extends JCasAnnotator_ImplBase {
+
+               public static final String PARAM_XMI_DIRECTORY = "XMIDirectory";
+
+               @ConfigurationParameter( name = PARAM_XMI_DIRECTORY, mandatory = true )
+               private File xmiDirectory;
+
+               /**
+                * Creates the XMI directory when it does not exist yet.
+                *
+                * @throws ResourceInitializationException if the directory is missing and cannot be created
+                */
+               @Override
+               public void initialize( UimaContext context ) throws ResourceInitializationException {
+                       super.initialize( context );
+                       // mkdirs() reports failure only through its return value; the final isDirectory()
+                       // check keeps a concurrent creation from being mistaken for a failure
+                       if ( !this.xmiDirectory.exists() && !this.xmiDirectory.mkdirs() && !this.xmiDirectory.isDirectory() ) {
+                               throw new ResourceInitializationException(
+                                               new IOException( "Could not create XMI directory: " + this.xmiDirectory ) );
+                       }
+               }
+
+               /**
+                * Writes the CAS to its XMI file, replacing any existing file of that name.
+                *
+                * @throws AnalysisEngineProcessException wrapping any SAX or I/O failure during serialization
+                */
+               @Override
+               public void process( JCas jCas ) throws AnalysisEngineProcessException {
+                       File xmiFile = getXMIFile( this.xmiDirectory, jCas );
+                       // try-with-resources closes the stream and keeps the primary failure if close() fails as well
+                       try ( FileOutputStream outputStream = new FileOutputStream( xmiFile ) ) {
+                               XmiCasSerializer serializer = new XmiCasSerializer( jCas.getTypeSystem() );
+                               ContentHandler handler = new XMLSerializer( outputStream, false ).getContentHandler();
+                               serializer.serialize( jCas.getCas(), handler );
+                       } catch ( SAXException | IOException e ) {
+                               throw new AnalysisEngineProcessException( e );
+                       }
+               }
+       }
+
+       /**
+        * Loads the XMI file that belongs to each CAS (located via {@code getXMIFile})
+        * and deserializes it into that CAS.
+        */
+       public static class XMIReader extends JCasAnnotator_ImplBase {
+
+               public static final String PARAM_XMI_DIRECTORY = "XMIDirectory";
+
+               @ConfigurationParameter( name = PARAM_XMI_DIRECTORY, mandatory = true )
+               private File xmiDirectory;
+
+               /**
+                * Deserializes the XMI file for the given CAS into it.
+                *
+                * @throws AnalysisEngineProcessException wrapping any SAX or I/O failure, including a missing file
+                */
+               @Override
+               public void process( JCas jCas ) throws AnalysisEngineProcessException {
+                       File xmiFile = getXMIFile( this.xmiDirectory, jCas );
+                       // try-with-resources closes the stream and keeps the primary failure if close() fails as well
+                       try ( FileInputStream inputStream = new FileInputStream( xmiFile ) ) {
+                               XmiCasDeserializer.deserialize( inputStream, jCas.getCas() );
+                       } catch ( SAXException | IOException e ) {
+                               throw new AnalysisEngineProcessException( e );
+                       }
+               }
+       }
+
+       public static class TimexAnnotationCorrector extends 
JCasAnnotator_ImplBase {
+               @Override
+               public void process( JCas jCas ) throws 
AnalysisEngineProcessException {
+                       JCas goldView, systemView;
+                       try {
+                               goldView = jCas.getView( GOLD_VIEW_NAME );
+                               systemView = jCas.getView( 
CAS.NAME_DEFAULT_SOFA );
+                       } catch ( CASException e ) {
+                               e.printStackTrace();
+                               throw new AnalysisEngineProcessException();
+                       }
+                       for ( TimeMention mention : JCasUtil.select( goldView, 
TimeMention.class ) ) {
+                               // for each time expression, get the treebank 
node with the same span.
+                               List<TreebankNode> nodes = 
JCasUtil.selectCovered( systemView, TreebankNode.class, mention );
+                               TreebankNode sameSpanNode = null;
+                               for ( TreebankNode node : nodes ) {
+                                       if ( node.getBegin() == 
mention.getBegin() && node.getEnd() == mention.getEnd() ) {
+                                               sameSpanNode = node;
+                                               break;
+                                       }
+                               }
+                               if ( sameSpanNode != null ) {
+                                       // look at node at the position of the 
timex3.
+                                       if ( sameSpanNode.getNodeType().equals( 
"PP" ) ) {
+                                               // if it is a PP it should be 
moved down to the NP
+                                               int numChildren = 
sameSpanNode.getChildren().size();
+                                               if ( numChildren == 2 && 
sameSpanNode.getChildren( 0 ).getNodeType().equals( "IN" ) &&
+                                                               
sameSpanNode.getChildren( 1 ).getNodeType().equals( "NP" ) ) {
+                                                       // move the time span 
to this node:
+                                                       TreebankNode 
mentionNode = sameSpanNode.getChildren( numChildren - 1 );
+                                                       mention.setBegin( 
mentionNode.getBegin() );
+                                                       mention.setEnd( 
mentionNode.getEnd() );
+                                               }
+                                       }
+                               } else {
+                                       // if there is no matching tree span, 
see if the DT to the left would help.
+                                       // now adjust for missing DT to the left
+                                       List<TerminalTreebankNode> 
precedingPreterms = JCasUtil
+                                                       .selectPreceding( 
systemView, TerminalTreebankNode.class, mention, 1 );
+                                       if ( precedingPreterms != null && 
precedingPreterms.size() == 1 ) {
+                                               TerminalTreebankNode leftTerm = 
precedingPreterms.get( 0 );
+                                               if ( 
leftTerm.getNodeType().equals( "DT" ) ) {
+                                                       // now see if adding 
this would make it match a tree
+                                                       List<TreebankNode> 
matchingNodes = JCasUtil
+                                                                       
.selectCovered( systemView, TreebankNode.class, leftTerm.getBegin(), 
mention.getEnd() );
+                                                       for ( TreebankNode node 
: matchingNodes ) {
+                                                               if ( 
node.getBegin() == leftTerm.getBegin() && node.getEnd() == mention.getEnd() ) {
+                                                                       
sameSpanNode = node;
+                                                                       break;
+                                                               }
+                                                       }
+                                                       if ( sameSpanNode != 
null ) {
+                                                               // adding the 
DT to the left of th emention made it match a tree:
+                                                                       
System.err.println(
+                                                                               
        "Adding DT: " + leftTerm.getCoveredText() + " to TIMEX: " + 
mention.getCoveredText() );
+                                                                       
mention.setBegin( leftTerm.getBegin() );
+                                                       }
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+
+
+       /**
+        * Replaces the system-view annotations of the configured types with copies of
+        * the gold-view annotations of those types.
+        */
+       public static class CopyFromGold extends JCasAnnotator_ImplBase {
+
+               public static final String PARAM_ANNOTATION_CLASSES = "AnnotationClasses";
+
+               @ConfigurationParameter( name = PARAM_ANNOTATION_CLASSES, mandatory = true )
+               private Class<? extends TOP>[] annotationClasses;
+
+               /**
+                * Creates a description of this annotator configured for the given types.
+                *
+                * @param classes the annotation types to copy from the gold view
+                * @return the engine description
+                * @throws ResourceInitializationException if the description cannot be created
+                */
+               public static AnalysisEngineDescription getDescription( Class<?>... classes )
+                               throws ResourceInitializationException {
+                       return AnalysisEngineFactory.createEngineDescription(
+                                       CopyFromGold.class, PARAM_ANNOTATION_CLASSES, classes );
+               }
+
+               /**
+                * First removes the system-view annotations whose class is exactly one of the
+                * configured types, then copies every gold-view annotation selected for those
+                * types into the system view.
+                *
+                * @param jCas CAS that holds both the gold view and the default view
+                * @throws AnalysisEngineProcessException if one of the views cannot be obtained
+                */
+               @Override
+               public void process( JCas jCas ) throws AnalysisEngineProcessException {
+                       JCas source;
+                       JCas target;
+                       try {
+                               source = jCas.getView( GOLD_VIEW_NAME );
+                               target = jCas.getView( CAS.NAME_DEFAULT_SOFA );
+                       } catch ( CASException e ) {
+                               throw new AnalysisEngineProcessException( e );
+                       }
+
+                       // phase 1: clear the target of annotations of exactly the configured types
+                       for ( Class<? extends TOP> type : this.annotationClasses ) {
+                               // work on a snapshot, the index is modified while we go
+                               List<? extends TOP> existing = Lists.newArrayList( JCasUtil.select( target, type ) );
+                               for ( TOP candidate : existing ) {
+                                       if ( !type.equals( candidate.getClass() ) ) {
+                                               continue;
+                                       }
+                                       candidate.removeFromIndexes();
+                               }
+                       }
+
+                       // phase 2: copy from the source and attach the copies to the target sofa
+                       CasCopier copier = new CasCopier( source.getCas(), target.getCas() );
+                       Feature sofaFeature = jCas.getTypeSystem().getFeatureByFullName( CAS.FEATURE_FULL_NAME_SOFA );
+                       for ( Class<? extends TOP> type : this.annotationClasses ) {
+                               for ( TOP original : JCasUtil.select( source, type ) ) {
+                                       TOP duplicate = (TOP)copier.copyFs( original );
+                                       if ( duplicate instanceof Annotation ) {
+                                               duplicate.setFeatureValue( sofaFeature, target.getSofa() );
+                                       }
+                                       duplicate.addToIndexes( target );
+                               }
+                       }
+               }
+       }
+
+       /**
+        * Replaces the gold-view annotations of the configured types with copies of
+        * the system-view annotations of those types.
+        */
+       public static class CopyFromSystem extends JCasAnnotator_ImplBase {
+
+               public static final String PARAM_ANNOTATION_CLASSES = "AnnotationClasses";
+
+               @ConfigurationParameter( name = PARAM_ANNOTATION_CLASSES, mandatory = true )
+               private Class<? extends TOP>[] annotationClasses;
+
+               /**
+                * Creates a description of this annotator configured for the given types.
+                *
+                * @param classes the annotation types to copy from the system view
+                * @return the engine description
+                * @throws ResourceInitializationException if the description cannot be created
+                */
+               public static AnalysisEngineDescription getDescription( Class<?>... classes )
+                               throws ResourceInitializationException {
+                       return AnalysisEngineFactory.createEngineDescription(
+                                       CopyFromSystem.class, PARAM_ANNOTATION_CLASSES, classes );
+               }
+
+               /**
+                * First removes the gold-view annotations whose class is exactly one of the
+                * configured types, then copies every system-view annotation selected for
+                * those types into the gold view.
+                *
+                * @param jCas CAS that holds both the gold view and the default view
+                * @throws AnalysisEngineProcessException if one of the views cannot be obtained
+                */
+               @Override
+               public void process( JCas jCas ) throws AnalysisEngineProcessException {
+                       JCas gold;
+                       JCas system;
+                       try {
+                               gold = jCas.getView( GOLD_VIEW_NAME );
+                               system = jCas.getView( CAS.NAME_DEFAULT_SOFA );
+                       } catch ( CASException e ) {
+                               throw new AnalysisEngineProcessException( e );
+                       }
+
+                       // step 1: empty the gold view of annotations of exactly the configured types
+                       for ( Class<? extends TOP> wanted : this.annotationClasses ) {
+                               // snapshot first, because removal changes the index being read
+                               List<? extends TOP> goldItems = Lists.newArrayList( JCasUtil.select( gold, wanted ) );
+                               for ( TOP item : goldItems ) {
+                                       if ( !wanted.equals( item.getClass() ) ) {
+                                               continue;
+                                       }
+                                       item.removeFromIndexes();
+                               }
+                       }
+
+                       // step 2: copy the system output over and attach the copies to the gold sofa
+                       CasCopier copier = new CasCopier( system.getCas(), gold.getCas() );
+                       Feature sofaFeature = jCas.getTypeSystem().getFeatureByFullName( CAS.FEATURE_FULL_NAME_SOFA );
+                       for ( Class<? extends TOP> wanted : this.annotationClasses ) {
+                               for ( TOP item : JCasUtil.select( system, wanted ) ) {
+                                       TOP clone = (TOP)copier.copyFs( item );
+                                       if ( clone instanceof Annotation ) {
+                                               clone.setFeatureValue( sofaFeature, gold.getSofa() );
+                                       }
+                                       clone.addToIndexes( gold );
+                               }
+                       }
+               }
+       }
+
+       /**
+        * The following class overrides a ClearTK utility annotator class for reading
+        * a text file into a JCas. The code is copy/pasted so that one tiny modification
+        * can be made for this corpus -- replace a single odd character (0xc) with a
+        * space since it trips up xml output.
+        */
+       public static class UriToDocumentTextAnnotatorCtakes extends UriToDocumentTextAnnotator {
+
+               /**
+                * Reads the content behind the URI of the view, replaces every 0xc character
+                * with a space and stores the result as the "text/plain" sofa data of the CAS.
+                *
+                * @param jCas the CAS whose URI is read and whose sofa data is set
+                * @throws AnalysisEngineProcessException if the URI cannot be opened or read
+                */
+               @Override
+               public void process( JCas jCas ) throws AnalysisEngineProcessException {
+                       URI uri = ViewUriUtil.getURI( jCas );
+                       String content;
+
+                       // the reader (and the URL stream under it) is closed once the text is read
+                       // NOTE(review): the reader decodes with the platform default charset -- confirm
+                       // that this matches the encoding of the corpus files
+                       try ( InputStreamReader reader = new InputStreamReader( uri.toURL().openStream() ) ) {
+                               content = CharStreams.toString( reader );
+                               content = content.replace( (char)0xc, ' ' );
+                               jCas.setSofaDataString( content, "text/plain" );
+                       } catch ( IOException e ) {
+                               // MalformedURLException is an IOException, so both failures end up here
+                               throw new AnalysisEngineProcessException( e );
+                       }
+               }
+       }
+
+       /**
+        * Writes one XML file per document to the configured output directory. The root
+        * element holds the document text in a TEXT element and, in a TAGS element, one
+        * TIMEX3 per time mention, one EVENT per event mention and one TLINK per
+        * temporal relation.
+        */
+       public static class WriteI2B2XML extends JCasAnnotator_ImplBase {
+               public static final String PARAM_OUTPUT_DIR = "PARAM_OUTPUT_DIR";
+               @ConfigurationParameter( mandatory = true, description = "Output directory to write xml files to.", name = PARAM_OUTPUT_DIR )
+               protected String outputDir;
+
+               /**
+                * Builds the XML document for the CAS and writes it to the output directory.
+                * The file name is the name of the input file with ".txt" removed.
+                *
+                * @param jcas the CAS to export
+                * @throws AnalysisEngineProcessException if the XML cannot be built or written
+                */
+               @Override
+               public void process( JCas jcas ) throws AnalysisEngineProcessException {
+                       try {
+                               // get the output file name from the input file name and output directory.
+                               File outDir = new File( outputDir );
+                               if ( !outDir.exists() ) {
+                                       outDir.mkdirs();
+                               }
+                               File inFile = new File( ViewUriUtil.getURI( jcas ) );
+                               String outFile = inFile.getName().replace( ".txt", "" );
+
+                               // build the xml
+                               DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
+                               DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
+                               Document doc = docBuilder.newDocument();
+                               Element rootElement = doc.createElement( "ClinicalNarrativeTemporalAnnotation" );
+                               Element textElement = doc.createElement( "TEXT" );
+                               Element tagsElement = doc.createElement( "TAGS" );
+                               textElement.setTextContent( jcas.getDocumentText() );
+                               rootElement.appendChild( textElement );
+                               rootElement.appendChild( tagsElement );
+                               doc.appendChild( rootElement );
+
+                               // ids handed out to times and events are looked up again for the links
+                               Map<IdentifiedAnnotation, String> argToId = new HashMap<>();
+                               appendTimexElements( jcas, doc, tagsElement, argToId );
+                               appendEventElements( jcas, doc, tagsElement, argToId );
+                               appendLinkElements( jcas, doc, tagsElement, argToId );
+
+                               // boilerplate xml-writing code:
+                               TransformerFactory transformerFactory = TransformerFactory.newInstance();
+                               Transformer transformer = transformerFactory.newTransformer();
+                               transformer.setOutputProperty( OutputKeys.INDENT, "yes" );
+                               transformer.setOutputProperty( OutputKeys.METHOD, "xml" );
+                               DOMSource source = new DOMSource( doc );
+                               StreamResult result = new StreamResult( new File( outputDir, outFile ) );
+                               transformer.transform( source, result );
+                       } catch ( ParserConfigurationException | TransformerException e ) {
+                               // TransformerConfigurationException is a TransformerException; the cause is
+                               // carried by the rethrown exception, so it is not printed here as well
+                               throw new AnalysisEngineProcessException( e );
+                       }
+               }
+
+               /**
+                * Appends one TIMEX3 element per time mention (ids "T0", "T1", ...) and records
+                * the id of each mention. Offsets are written as the CAS offsets plus one.
+                */
+               private static void appendTimexElements( JCas jcas, Document doc, Element tagsElement,
+                               Map<IdentifiedAnnotation, String> argToId ) {
+                       int id = 0;
+                       for ( TimeMention timex : JCasUtil.select( jcas, TimeMention.class ) ) {
+                               Element timexElement = doc.createElement( "TIMEX3" );
+                               String timexID = "T" + id;
+                               id++;
+                               argToId.put( timex, timexID );
+                               timexElement.setAttribute( "id", timexID );
+                               timexElement.setAttribute( "start", String.valueOf( timex.getBegin() + 1 ) );
+                               timexElement.setAttribute( "end", String.valueOf( timex.getEnd() + 1 ) );
+                               timexElement.setAttribute( "text", timex.getCoveredText() );
+                               timexElement.setAttribute( "type", "NA" );
+                               timexElement.setAttribute( "val", "NA" );
+                               timexElement.setAttribute( "mod", "NA" );
+                               tagsElement.appendChild( timexElement );
+                       }
+               }
+
+               /**
+                * Appends one EVENT element per event mention (ids "E0", "E1", ...) and records
+                * the id of each mention. Offsets are written as the CAS offsets plus one.
+                */
+               private static void appendEventElements( JCas jcas, Document doc, Element tagsElement,
+                               Map<IdentifiedAnnotation, String> argToId ) {
+                       int id = 0;
+                       for ( EventMention event : JCasUtil.select( jcas, EventMention.class ) ) {
+                               if ( event.getClass().equals( EventMention.class ) ) {
+                                       // this ensures we are only looking at THYME events and not ctakes-dictionary-lookup events
+                                       Element eventEl = doc.createElement( "EVENT" );
+                                       String eventID = "E" + id;
+                                       id++;
+                                       argToId.put( event, eventID );
+                                       eventEl.setAttribute( "id", eventID );
+                                       eventEl.setAttribute( "start", String.valueOf( event.getBegin() + 1 ) );
+                                       eventEl.setAttribute( "end", String.valueOf( event.getEnd() + 1 ) );
+                                       eventEl.setAttribute( "text", event.getCoveredText() );
+                                       eventEl.setAttribute( "modality", "NA" );
+                                       eventEl.setAttribute( "polarity", "NA" );
+                                       eventEl.setAttribute( "type", "NA" );
+                                       tagsElement.appendChild( eventEl );
+                               }
+                       }
+               }
+
+               /**
+                * Appends one TLINK element per temporal relation (ids "TL0", "TL1", ...). The
+                * argument ids are the ones recorded for times and events; a relation without
+                * a second argument is linked to "Discharge".
+                */
+               private static void appendLinkElements( JCas jcas, Document doc, Element tagsElement,
+                               Map<IdentifiedAnnotation, String> argToId ) {
+                       int id = 0;
+                       for ( TemporalTextRelation rel : JCasUtil.select( jcas, TemporalTextRelation.class ) ) {
+                               Element linkEl = doc.createElement( "TLINK" );
+                               String linkID = "TL" + id;
+                               id++;
+                               linkEl.setAttribute( "id", linkID );
+                               Annotation arg1 = rel.getArg1().getArgument();
+                               linkEl.setAttribute( "fromID", argToId.get( arg1 ) );
+                               linkEl.setAttribute( "fromText", arg1.getCoveredText() );
+                               Annotation arg2 = rel.getArg2().getArgument();
+                               if ( arg2 != null ) {
+                                       linkEl.setAttribute( "toID", argToId.get( arg2 ) );
+                                       linkEl.setAttribute( "toText", arg2.getCoveredText() );
+                               } else {
+                                       linkEl.setAttribute( "toID", "Discharge" );
+                                       linkEl.setAttribute( "toText", "Discharge" );
+                               }
+                               linkEl.setAttribute( "type", rel.getCategory() );
+                               tagsElement.appendChild( linkEl );
+                       }
+               }
+
+       }
+
+       public static class WriteAnaforaXML extends JCasAnnotator_ImplBase {
+               public static final String PARAM_OUTPUT_DIR = 
"PARAM_OUTPUT_DIR";


[... 303 lines stripped ...]

svn commit: r1765496 [2/2] - /ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java

Reply via email to