RE: Read file name in an annotator

Debbie Zhang Mon, 14 Jul 2014 02:03:30 -0700

Hi Ravi,

Thank you very much for the sample code. However, in my case, the PEAR file 
will be deployed to a different system. Therefore, I have no access to 
"file.getAbsoluteFile().toURL().toString()".


I searched the uima-user mailing list archives and found an old post which was 
sent by Marshall Schor last year:
http://mail-archives.apache.org/mod_mbox/uima-user/201205.mbox/%[email protected]%3E
 Within this post, cTAKES was suggested. I downloaded cTAKES and tried to use 
org.apache.ctakes.typesystem.type.structured.DocumentID, which is defined by 
cTAKES. However, I can't get it working.

typeSystemDescriptor.xml:
<?xml version="1.0" encoding="UTF-8"?>
<typeSystemDescription xmlns="http://uima.apache.org/resourceSpecifier">
<name>typeSystemDescriptor</name>
<description/>
<version>1.0</version>
<vendor/>
<imports>
<import name="org.apache.ctakes.typesystem.types.TypeSystem"/>
</imports>
<types>
<typeDescription>
<name>uima.TestThirdPartyLib</name>
<description/>
<supertypeName>uima.tcas.Annotation</supertypeName>
</typeDescription>
</types>
</typeSystemDescription>

TestThirdPartyLib.xml (my annotation which uses 
org.apache.ctakes.typesystem.type.structured.DocumentID as input) 
<?xml version="1.0" encoding="UTF-8"?>
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
<primitive>true</primitive>
<annotatorImplementationName>annotators.TestThirdPartyLibDescriptor</annotatorImplementationName>
<analysisEngineMetaData>
<name>TestThirdPartyLibDescriptor</name>
<description/>
<version>1.0</version>
<vendor/>
<configurationParameters/>
<configurationParameterSettings/>
<typeSystemDescription>
<imports>
<import name="org.apache.ctakes.typesystem.types.TypeSystem"/>
</imports>
<types>
<typeDescription>
<name>uima.TestThirdPartyLib</name>
<description/>
<supertypeName>uima.tcas.Annotation</supertypeName>
</typeDescription>
</types>
</typeSystemDescription>
<typePriorities/>
<fsIndexCollection/>
<capabilities>
<capability>
<inputs>
<type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.structured.DocumentID</type>
</inputs>
<outputs>
<type allAnnotatorFeatures="true">uima.TestThirdPartyLib</type>
</outputs>
<languagesSupported/>
</capability>
</capabilities>
<operationalProperties>
<modifiesCas>true</modifiesCas>
<multipleDeploymentAllowed>true</multipleDeploymentAllowed>
<outputsNewCASes>false</outputsNewCASes>
</operationalProperties>
</analysisEngineMetaData>
<resourceManagerConfiguration/>
</analysisEngineDescription>

TestThirdPartyLibDescriptor.java:
package annotators;

import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.LinkedList;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import uima.TestThirdPartyLib;
import org.apache.ctakes.typesystem.type.structured.DocumentID;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.AnalysisComponent;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.JFSIndexRepository;
import org.apache.uima.jcas.cas.TOP;
import org.apache.uima.resource.ResourceInitializationException;

/**
 * Test annotation
 */
public class TestThirdPartyLibDescriptor extends JCasAnnotator_ImplBase
{
        /**
         * @see AnalysisComponent#initialize(UimaContext)
         */
        public void initialize(UimaContext aContext) throws 
ResourceInitializationException {
                super.initialize(aContext);
  }
  /**
   * @see JCasAnnotator_ImplBase#process(JCas)
   */
  public void process(JCas aJCas) {
                
          String docText = aJCas.getDocumentText();
          test(aJCas);
          
          System.out.println("Say something");
  }
  private void test(JCas aJCas)
  {  
         //System.out.println("Full text:*"+aJCas.getDocumentText()+"*");
                        
         JFSIndexRepository indexes = aJCas.getJFSIndexRepository();
         FSIterator<TOP> documentIDIterator = 
indexes.getAllIndexedFS(DocumentID.type);
         while (documentIDIterator.isValid()) {
                 DocumentID documentIDAnnotation = (DocumentID) 
documentIDIterator.next();
                 String documentID = documentIDAnnotation.getDocumentID();
                 System.out.println("DocumentID: "+documentID);
         }
                
         //create an annotation 
         TestThirdPartyLib annotation = new TestThirdPartyLib(aJCas);
         //annotation.setBegin(la.begin());
         //annotation.setEnd(la.end());
         annotation.addToIndexes();
  }
}

TestMain.java

import uima.*;

import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.jcas.JCas;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.resource.ResourceSpecifier;
import org.apache.uima.util.XMLInputSource;

import java.io.File;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.regex.Pattern;

public class TestMain {
        
        static String readFile(File infile) throws Exception
        {
                //read file
        BufferedReader reader = new BufferedReader(
                new FileReader(infile));
        StringBuffer fileData = new StringBuffer();
        char[] buf = new char[1024];
        int numRead=0;
        while((numRead=reader.read(buf)) != -1){
                String readData = String.valueOf(buf, 0, numRead);
                fileData.append(readData);
            }
        reader.close();
        
        return fileData.toString();
        }

        public static void main(String[] args) throws Exception
        {       
                try {
                System.out.println("Say something");
                File aeFile = new File("desc/TestThirdPartyLibDescriptor.xml");
                XMLInputSource in = new XMLInputSource(aeFile);
                ResourceSpecifier specifier =
                UIMAFramework.getXMLParser().parseResourceSpecifier(in);
                AnalysisEngine ae = 
UIMAFramework.produceAnalysisEngine(specifier);

                JCas jcas = ae.newJCas();
                File inputFileFolder = new File("data");
                int count = 0;
                for (final File fileEntry : inputFileFolder.listFiles()) {
                if (fileEntry.isDirectory()) {
                    continue;
                } else 
                {       
                        //if (fileEntry.getName().indexOf(filename)!=-1)
                        {
                                //System.out.println(count+": 
"+fileEntry.getName());
                                String filecontent = 
TestMain.readFile(fileEntry);
                        
                                //analyze a document
                                jcas.setDocumentText(filecontent);
                                ae.process(jcas);
                                                        
                                jcas.reset();                   
                                count += 1;
                                //break;
                        }
                }
                }
        } catch(Exception e) {
                e.printStackTrace();
        }
        }
}

It seems silly to use cTAKES just to get the file name. 
However, I really need the file name, as it is the only way to identify a 
file. Can anyone tell me what I did wrong, such that 
org.apache.ctakes.typesystem.type.structured.DocumentID doesn't work?

Any help and suggestions will be greatly appreciated! Thank you!

Regards,

Debbie Zhang
 
> -----Original Message-----
> From: Ravindra [mailto:[email protected]]
> Sent: Thursday, 10 July 2014 9:39 PM
> To: [email protected]
> Cc: [email protected]
> Subject: Re: Read file name in an annotator
> 
> Maybe this helps -
> 
>     // Also store location of source document in CAS. This information
> is critical
>     // if CAS Consumers will need to know where the original document
> contents are located.
>     // For example, the Semantic Search CAS Indexer writes this
> information into the
>     // search index that it creates, which allows applications that use
> the search index to
>     // locate the documents that satisfy their semantic queries.
>     SourceDocumentInformation srcDocInfo = new
> SourceDocumentInformation(jcas);
>     srcDocInfo.setUri(file.getAbsoluteFile().toURL().toString());
>     srcDocInfo.setOffsetInSource(0);
>     srcDocInfo.setDocumentSize((int) file.length());
>     srcDocInfo.setLastSegment(mCurrentIndex == mFiles.size());
>     srcDocInfo.addToIndexes();
> 
> 
> followed by
>    // retrieve the filename of the input file from the CAS
>     FSIterator it =
> jcas.getAnnotationIndex(SourceDocumentInformation.type).iterator();
>     File outFile = null;
>     if (it.hasNext()) {
>       SourceDocumentInformation fileLoc = (SourceDocumentInformation)
> it.next();
>       File inFile;
>       try {
>         inFile = new File(new URL(fileLoc.getUri()).getPath());
>         String outFileName = inFile.getName();
>         if (fileLoc.getOffsetInSource() > 0) {
>           outFileName += ("_" + fileLoc.getOffsetInSource());
>         }
>         outFileName += ".xmi";
>         outFile = new File(mOutputDir, outFileName);
>         modelFileName = mOutputDir.getAbsolutePath() + "/" +
> inFile.getName() + ".ecore";
>       } catch (MalformedURLException e1) {
>         // invalid URL, use default processing below
>       }
>     }
> 
> look for SourceDocumentInformation in the examples
> 
> 
> --
> Ravi.
> *''We do not inherit the earth from our ancestors, we borrow it from
> our children.'' PROTECT IT !*
> 
> 
> On Thu, Jul 10, 2014 at 4:49 PM, Debbie Zhang <[email protected]>
> wrote:
> 
> > Thanks Thomas. May I ask if there is any sample code of UIMA readers
> > that can provide file name information for developing annotation? I
> > was looking on the internet today, but couldn't find one. Thanks
> again
> > for your help - much appreciated!
> >
> > Regards,
> >
> > Debbie Zhang
> >
> > > -----Original Message-----
> > > From: Thomas Ginter [mailto:[email protected]]
> > > Sent: Thursday, 10 July 2014 5:00 AM
> > > To: [email protected]
> > > Subject: Re: Read file name in an annotator
> > >
> > > Hi Debbie,
> > >
> > > The file name is not provided by default in UIMA although I believe
> > > the UIMA FileReader does populate a SourceDocumentInformation
> > > annotation with this information.  Our group has a set of readers
> > > that populate our own annotation type to provide location data and
> > > other meta- information for each record (CAS) being processed.  In
> > > short you will be better off writing your reader to provide that
> information for you.
> > >
> > > Thanks,
> > >
> > > Thomas Ginter
> > > 801-448-7676
> > > [email protected]
> > >
> > >
> > >
> > >
> > > On Jul 9, 2014, at 5:41, Debbie Zhang <[email protected]>
> wrote:
> > >
> > > > Hi,
> > > >
> > > > Can anyone tell me how to read the file name in an annotator
> using
> > > the
> > > > > JCas? It seems the DocumentAnnotation doesn't contain file name.
> > > > Thank you!
> > > >
> > > > Best regards,
> > > >
> > > > Debbie Zhang
> >
> >
> >

<?xml version="1.0" encoding="UTF-8"?>
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
  <frameworkImplementation>org.apache.uima.java</frameworkImplementation>
  <primitive>true</primitive>  <annotatorImplementationName>annotators.TestThirdPartyLibDescriptor</annotatorImplementationName>
  <analysisEngineMetaData>
    <name>TestThirdPartyLibDescriptor</name>
    <description/>
    <version>1.0</version>
    <vendor/>
    <configurationParameters/>
    <configurationParameterSettings/>
    <typeSystemDescription>
      <imports>
        <import name="org.apache.ctakes.typesystem.types.TypeSystem"/>
      </imports>
      <types>
        <typeDescription>
          <name>uima.TestThirdPartyLib</name>
          <description/>
          <supertypeName>uima.tcas.Annotation</supertypeName>
        </typeDescription>
      </types>
    </typeSystemDescription>
    <typePriorities/>
    <fsIndexCollection/>
    <capabilities>
      <capability>
        <inputs>
          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.structured.DocumentID</type>
        </inputs>
        <outputs>
          <type allAnnotatorFeatures="true">uima.TestThirdPartyLib</type>
        </outputs>
        <languagesSupported/>
      </capability>
    </capabilities>
  <operationalProperties>
      <modifiesCas>true</modifiesCas>
      <multipleDeploymentAllowed>true</multipleDeploymentAllowed>
      <outputsNewCASes>false</outputsNewCASes>
    </operationalProperties>
  </analysisEngineMetaData>
  <resourceManagerConfiguration/>
</analysisEngineDescription>

<?xml version="1.0" encoding="utf-8"?>
<typeSystemDescription xmlns="http://uima.apache.org/resourceSpecifier">
  <name>typeSystemDescriptor</name>
  <description />
  <version>1.0</version>
  <vendor />
  <imports>
    <import name="org.apache.ctakes.typesystem.types.TypeSystem" />
  </imports>
  <types>
    <typeDescription>
      <name>uima.TestThirdPartyLib</name>
      <description />
      <supertypeName>uima.tcas.Annotation</supertypeName>
    </typeDescription>
  </types>
</typeSystemDescription>

RE: Read file name in an annotator

Reply via email to