Hi Ravi, Thank you very much for the sample code. However, in my case, the PEAR file will be deployed to a different system. Therefore, I have no access to "file.getAbsoluteFile().toURL().toString()".
I searched the uima-user mailing list archives and found an old post which was sent by Marshall Schor last year: http://mail-archives.apache.org/mod_mbox/uima-user/201205.mbox/%[email protected]%3E Within this post, CTakes was suggested. I downloaded CTakes. I tried to use org.apache.ctakes.typesystem.type.structured.DocumentID defined by cTakes. However, I can't get it working. typeSystemDescriptor.xml: <?xml version="1.0" encoding="UTF-8"?> -<typeSystemDescription xmlns="http://uima.apache.org/resourceSpecifier"> <name>typeSystemDescriptor</name> <description/> <version>1.0</version> <vendor/> -<imports> <import name="org.apache.ctakes.typesystem.types.TypeSystem"/> </imports> -<types> -<typeDescription> <name>uima.TestThirdPartyLib</name> <description/> <supertypeName>uima.tcas.Annotation</supertypeName> </typeDescription> </types> </typeSystemDescription> TestThirdPartyLib.xml (my annotation which uses org.apache.ctakes.typesystem.type.structured.DocumentID as input) <?xml version="1.0" encoding="UTF-8"?> -<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier"> <frameworkImplementation>org.apache.uima.java</frameworkImplementation> <primitive>true</primitive> <annotatorImplementationName>annotators.TestThirdPartyLibDescriptor</annotatorImplementationName> -<analysisEngineMetaData> <name>TestThirdPartyLibDescriptor</name> <description/> <version>1.0</version> <vendor/> <configurationParameters/> <configurationParameterSettings/> -<typeSystemDescription> -<imports> <import name="org.apache.ctakes.typesystem.types.TypeSystem"/> </imports> -<types> -<typeDescription> <name>uima.TestThirdPartyLib</name> <description/> <supertypeName>uima.tcas.Annotation</supertypeName> </typeDescription> </types> </typeSystemDescription> <typePriorities/> <fsIndexCollection/> -<capabilities> -<capability> -<inputs> <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.structured.DocumentID</type> </inputs> -<outputs> <type 
allAnnotatorFeatures="true">uima.TestThirdPartyLib</type> </outputs> <languagesSupported/> </capability> </capabilities> -<operationalProperties> <modifiesCas>true</modifiesCas> <multipleDeploymentAllowed>true</multipleDeploymentAllowed> <outputsNewCASes>false</outputsNewCASes> </operationalProperties> </analysisEngineMetaData> <resourceManagerConfiguration/> </analysisEngineDescription> TestThirdPartyLibDescriptor.java: package annotators; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.LinkedList; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import uima.TestThirdPartyLib; import org.apache.ctakes.typesystem.type.structured.DocumentID; import org.apache.uima.UimaContext; import org.apache.uima.analysis_component.AnalysisComponent; import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; import org.apache.uima.cas.FSIterator; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.JFSIndexRepository; import org.apache.uima.jcas.cas.TOP; import org.apache.uima.resource.ResourceInitializationException; /** * Test annotation */ public class TestThirdPartyLibDescriptor extends JCasAnnotator_ImplBase { /** * @see AnalysisComponent#initialize(UimaContext) */ public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); } /** * @see JCasAnnotator_ImplBase#process(JCas) */ public void process(JCas aJCas) { String docText = aJCas.getDocumentText(); test(aJCas); System.out.println("Say something"); } private void test(JCas aJCas) { //System.out.println("Full text:*"+aJCas.getDocumentText()+"*"); JFSIndexRepository indexes = aJCas.getJFSIndexRepository(); FSIterator<TOP> documentIDIterator = indexes.getAllIndexedFS(DocumentID.type); while (documentIDIterator.isValid()) { DocumentID documentIDAnnotation = (DocumentID) documentIDIterator.next(); String documentID = documentIDAnnotation.getDocumentID(); 
System.out.println("DocumentID: "+documentID); } //create an annotation TestThirdPartyLib annotation = new TestThirdPartyLib(aJCas); //annotation.setBegin(la.begin()); //annotation.setEnd(la.end()); annotation.addToIndexes(); } } TestMain.java import uima.*; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.jcas.JCas; import org.apache.uima.cas.FSIndex; import org.apache.uima.cas.FSIterator; import org.apache.uima.cas.text.AnnotationFS; import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.XMLInputSource; import java.io.File; import java.io.BufferedReader; import java.io.FileReader; import java.io.InputStream; import java.io.InputStreamReader; import java.util.regex.Pattern; public class TestMain { static String readFile(File infile) throws Exception { //read file BufferedReader reader = new BufferedReader( new FileReader(infile)); StringBuffer fileData = new StringBuffer(); char[] buf = new char[1024]; int numRead=0; while((numRead=reader.read(buf)) != -1){ String readData = String.valueOf(buf, 0, numRead); fileData.append(readData); } reader.close(); return fileData.toString(); } public static void main(String[] args) throws Exception { try { System.out.println("Say something"); File aeFile = new File("desc/TestThirdPartyLibDescriptor.xml"); XMLInputSource in = new XMLInputSource(aeFile); ResourceSpecifier specifier = UIMAFramework.getXMLParser().parseResourceSpecifier(in); AnalysisEngine ae = UIMAFramework.produceAnalysisEngine(specifier); JCas jcas = ae.newJCas(); File inputFileFolder = new File("data"); int count = 0; for (final File fileEntry : inputFileFolder.listFiles()) { if (fileEntry.isDirectory()) { continue; } else { //if (fileEntry.getName().indexOf(filename)!=-1) { //System.out.println(count+": "+fileEntry.getName()); String filecontent = TestMain.readFile(fileEntry); //analyze a document jcas.setDocumentText(filecontent); ae.process(jcas); jcas.reset(); count 
+= 1; //break; } } } } catch(Exception e) { e.printStackTrace(); } } } It seems silly to use cTakes just to get the file name. However, I really need to get the file name as it is the only way to identify a file. Can anyone tell me what I did wrong, such that org.apache.ctakes.typesystem.type.structured.DocumentID doesn't work? Any help and suggestions will be greatly appreciated! Thank you! Regards, Debbie Zhang > -----Original Message----- > From: Ravindra [mailto:[email protected]] > Sent: Thursday, 10 July 2014 9:39 PM > To: [email protected] > Cc: [email protected] > Subject: Re: Read file name in an annotator > > May this help - > > // Also store location of source document in CAS. This information > is critical > // if CAS Consumers will need to know where the original document > contents are located. > // For example, the Semantic Search CAS Indexer writes this > information into the > // search index that it creates, which allows applications that use > the search index to > // locate the documents that satisfy their semantic queries. 
> SourceDocumentInformation srcDocInfo = new > SourceDocumentInformation(jcas); > srcDocInfo.setUri(file.getAbsoluteFile().toURL().toString()); > srcDocInfo.setOffsetInSource(0); > srcDocInfo.setDocumentSize((int) file.length()); > srcDocInfo.setLastSegment(mCurrentIndex == mFiles.size()); > srcDocInfo.addToIndexes(); > > > followed by > // retrieve the filename of the input file from the CAS > FSIterator it = > jcas.getAnnotationIndex(SourceDocumentInformation.type).iterator(); > File outFile = null; > if (it.hasNext()) { > SourceDocumentInformation fileLoc = (SourceDocumentInformation) > it.next(); > File inFile; > try { > inFile = new File(new URL(fileLoc.getUri()).getPath()); > String outFileName = inFile.getName(); > if (fileLoc.getOffsetInSource() > 0) { > outFileName += ("_" + fileLoc.getOffsetInSource()); > } > outFileName += ".xmi"; > outFile = new File(mOutputDir, outFileName); > modelFileName = mOutputDir.getAbsolutePath() + "/" + > inFile.getName() + ".ecore"; > } catch (MalformedURLException e1) { > // invalid URL, use default processing below > } > } > > look for SourceDocumentInformation in the examples > > > -- > Ravi. > *''We do not inherit the earth from our ancestors, we borrow it from > our children.'' PROTECT IT !* > > > On Thu, Jul 10, 2014 at 4:49 PM, Debbie Zhang <[email protected]> > wrote: > > > Thanks Thomas. May I ask if there is any sample code of UIMA readers > > that can provide file name information for developing annotation? I > > was looking on the internet today, but couldn't find one. Thanks > again > > for your help - much appreciated! 
> > > > Regards, > > > > Debbie Zhang > > > > > -----Original Message----- > > > From: Thomas Ginter [mailto:[email protected]] > > > Sent: Thursday, 10 July 2014 5:00 AM > > > To: [email protected] > > > Subject: Re: Read file name in an annotator > > > > > > Hi Debbie, > > > > > > The file name is not provided by default in UIMA although I believe > > > the UIMA FileReader does populate a SourceDocumentInformation > > > annotation with this information. Our group has a set of readers > > > that populate our own annotation type to provide location data and > > > other meta-information for each record (CAS) being processed. In > > > short you will be better off writing your reader to provide that > information for you. > > > > > > Thanks, > > > > > > Thomas Ginter > > > 801-448-7676 > > > [email protected] > > > > > > > > > > > > > > > On Jul 9, 2014, at 5:41, Debbie Zhang <[email protected]> > wrote: > > > > > > > Hi, > > > > > > > > Can anyone tell me how to read the file name in an annotator > using > > > the > > > > JCas? It seems the DocumentAnnotation doesn't contain the file name. > > > > Thank you! > > > > > > > > Best regards, > > > > > > > > Debbie Zhang > > > > > >
<?xml version="1.0" encoding="UTF-8"?> <analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier"> <frameworkImplementation>org.apache.uima.java</frameworkImplementation> <primitive>true</primitive> <annotatorImplementationName>annotators.TestThirdPartyLibDescriptor</annotatorImplementationName> <analysisEngineMetaData> <name>TestThirdPartyLibDescriptor</name> <description/> <version>1.0</version> <vendor/> <configurationParameters/> <configurationParameterSettings/> <typeSystemDescription> <imports> <import name="org.apache.ctakes.typesystem.types.TypeSystem"/> </imports> <types> <typeDescription> <name>uima.TestThirdPartyLib</name> <description/> <supertypeName>uima.tcas.Annotation</supertypeName> </typeDescription> </types> </typeSystemDescription> <typePriorities/> <fsIndexCollection/> <capabilities> <capability> <inputs> <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.structured.DocumentID</type> </inputs> <outputs> <type allAnnotatorFeatures="true">uima.TestThirdPartyLib</type> </outputs> <languagesSupported/> </capability> </capabilities> <operationalProperties> <modifiesCas>true</modifiesCas> <multipleDeploymentAllowed>true</multipleDeploymentAllowed> <outputsNewCASes>false</outputsNewCASes> </operationalProperties> </analysisEngineMetaData> <resourceManagerConfiguration/> </analysisEngineDescription>
<?xml version="1.0" encoding="utf-8"?> <typeSystemDescription xmlns="http://uima.apache.org/resourceSpecifier"> <name>typeSystemDescriptor</name> <description /> <version>1.0</version> <vendor /> <imports> <import name="org.apache.ctakes.typesystem.types.TypeSystem" /> </imports> <types> <typeDescription> <name>uima.TestThirdPartyLib</name> <description /> <supertypeName>uima.tcas.Annotation</supertypeName> </typeDescription> </types> </typeSystemDescription>
