Hi, I tried to create a review on Review Board, but I got an error: "The specified diff file could not be parsed. Line 8: No valid separator after the filename was found in the diff header". If the problem doesn't have a solution, I can try to make a GitHub pull request tomorrow. I have attached the diff file in the meantime. Waiting for your suggestions. Thanks, Radu.
2015-04-22 3:27 GMT+03:00 Chris Mattmann <[email protected]>: > Yep and if you want to try Github, you can also do: > > http://github.com/apache/oodt/#contributing > > Pull request is fine after opening JIRA issue, or Review > Board as Lewis mentioned. > > Cheers! > Chris > > ------------------------ > Chris Mattmann > [email protected] > > > > > -----Original Message----- > From: Lewis John Mcgibbney <[email protected]> > Reply-To: <[email protected]> > Date: Tuesday, April 21, 2015 at 6:47 PM > To: "[email protected]" <[email protected]> > Subject: Re: GSoC 2015 - Replace OODT's XMLPRC with Avro's RPC > > >Hey Radu, > > > >Good work :) > > > >On Tue, Apr 21, 2015 at 3:02 PM, Radu Manole <[email protected]> > >wrote: > > > >> Hi Chris, > >> I made a Tika extractor for the file manager. How should I post the > >>diff? A > >> github pull request or attach a diff file to the jira item? > >> > > > >Either or. Typically people here quite like review boards as well. Can you > >plea open one and link to it from the OODT Jira issue? > >http://reviews.apache.org > > > > > >> Also, is there a code conventions page? > > > > > > > https://cwiki.apache.org/confluence/display/OODT/Getting+started+with+Apac > >he+OODT#GettingstartedwithApacheOODT-StepThree:UsingtheJIRAandDeveloping\ > >Thanks > >Lewis > > >
commit 8caf57f0a70df241306a7b2da6d27c72f675b90b Author: Radu Manole <[email protected]> Date: Wed Apr 22 23:29:58 2015 +0300 Fix for OODT-829 - AutoDetect Tika Extractor. diff --git a/filemgr/pom.xml b/filemgr/pom.xml index 4299e75..c89e8fe 100644 --- a/filemgr/pom.xml +++ b/filemgr/pom.xml @@ -151,6 +151,11 @@ </repositories> <dependencies> <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parsers</artifactId> + <version>1.7</version> + </dependency> + <dependency> <groupId>org.apache.oodt</groupId> <artifactId>oodt-commons</artifactId> <version>${project.parent.version}</version> diff --git a/filemgr/src/main/java/org/apache/oodt/cas/filemgr/metadata/extractors/examples/TikaAutoDetectExtractor.java b/filemgr/src/main/java/org/apache/oodt/cas/filemgr/metadata/extractors/examples/TikaAutoDetectExtractor.java new file mode 100644 index 0000000..0a28ea0 --- /dev/null +++ b/filemgr/src/main/java/org/apache/oodt/cas/filemgr/metadata/extractors/examples/TikaAutoDetectExtractor.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.oodt.cas.filemgr.metadata.extractors.examples;
+
+//JDK imports
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Arrays;
+
+//OODT imports
+import org.apache.oodt.cas.filemgr.metadata.extractors.AbstractFilemgrMetExtractor;
+import org.apache.oodt.cas.filemgr.structs.Product;
+import org.apache.oodt.cas.metadata.Metadata;
+import org.apache.oodt.cas.metadata.exceptions.MetExtractionException;
+
+//TIKA imports
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.*;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+
+/**
+ * Example file manager metadata extractor that hands a product's file to
+ * Tika's {@link AutoDetectParser} and copies every metadata field Tika
+ * reports into an OODT {@link Metadata} object.
+ */
+public class TikaAutoDetectExtractor extends AbstractFilemgrMetExtractor {
+
+  /**
+   * Intentionally empty: this extractor performs no configuration.
+   */
+  public void doConfigure() {
+  }
+
+  /**
+   * Builds a new metadata object from the supplied metadata followed by the
+   * metadata Tika extracts from the product's file.
+   *
+   * @param product the product whose file is parsed
+   * @param met metadata already known for the product; merged in first
+   * @return a new {@link Metadata} instance holding both sets of fields
+   * @throws MetExtractionException if the file cannot be found, read or parsed
+   */
+  public Metadata doExtract(Product product, Metadata met) throws MetExtractionException {
+    Metadata outMetadata = new Metadata();
+
+    merge(met, outMetadata);
+    Metadata tikaMetadata = getMetadataFromTika(product);
+    merge(tikaMetadata, outMetadata);
+
+    return outMetadata;
+  }
+
+  /**
+   * Parses the product's file with Tika and returns the extracted fields as
+   * OODT metadata. The document content itself is discarded (a no-op
+   * {@link DefaultHandler} receives the SAX events); only metadata is kept.
+   *
+   * @param product the product whose file is parsed
+   * @return the Tika metadata converted to OODT {@link Metadata}
+   * @throws MetExtractionException if the file cannot be found, read or parsed
+   */
+  private Metadata getMetadataFromTika(Product product) throws MetExtractionException {
+    try {
+      File file = getProductFile(product);
+      FileInputStream inputStream = new FileInputStream(file);
+      try {
+        org.apache.tika.metadata.Metadata tikaMetadata = new org.apache.tika.metadata.Metadata();
+        Parser parser = new AutoDetectParser();
+        parser.parse(inputStream, new DefaultHandler(), tikaMetadata, new ParseContext());
+        return transform(tikaMetadata);
+      } finally {
+        // Tika consumes the stream but leaves closing it to the caller;
+        // without this the file handle leaks on every extraction.
+        inputStream.close();
+      }
+
+    } catch (FileNotFoundException e) {
+      throw new MetExtractionException(
+          "Unable to find file: Reason: " + e.getMessage());
+    } catch (TikaException e) {
+      throw new MetExtractionException(
+          "Unable to parse the document: Reason: " + e.getMessage());
+    } catch (SAXException e) {
+      throw new MetExtractionException(
+          " Unable to process the SAX events : Reason: " + e.getMessage());
+    } catch (IOException e) {
+      throw new MetExtractionException(
+          "Unable to read the document stream: Reason: " + e.getMessage());
+    }
+  }
+
+  /**
+   * Copies every field name and all of its values from a Tika metadata
+   * object into a new OODT {@link Metadata} object.
+   *
+   * @param tikaMetadata the metadata produced by Tika
+   * @return a new OODT {@link Metadata} with the same names and values
+   */
+  private Metadata transform(org.apache.tika.metadata.Metadata tikaMetadata){
+    Metadata metadata = new Metadata();
+
+    String[] names = tikaMetadata.names();
+    for (String name : names){
+      metadata.addMetadata(name, Arrays.asList(tikaMetadata.getValues(name)));
+    }
+
+    return metadata;
+  }
+}
diff --git a/filemgr/src/test/java/org/apache/oodt/cas/filemgr/metadata/extractors/examples/TestTikaAutoDetectExtractor.java b/filemgr/src/test/java/org/apache/oodt/cas/filemgr/metadata/extractors/examples/TestTikaAutoDetectExtractor.java
new file mode 100644
index 0000000..8fc2d31
--- /dev/null
+++ b/filemgr/src/test/java/org/apache/oodt/cas/filemgr/metadata/extractors/examples/TestTikaAutoDetectExtractor.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.oodt.cas.filemgr.metadata.extractors.examples;
+
+//JDK imports
+import java.net.URL;
+
+//Junit imports
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+// JUnit static imports
+import static junit.framework.Assert.*;
+
+//OODT imports
+import org.apache.oodt.cas.filemgr.structs.Product;
+import org.apache.oodt.cas.filemgr.structs.Reference;
+import org.apache.oodt.cas.metadata.Metadata;
+import org.apache.oodt.cas.metadata.exceptions.MetExtractionException;
+
+
+/**
+ * Exercises {@link TikaAutoDetectExtractor} against the classpath resource
+ * {@code /test.txt}.
+ */
+@RunWith(JUnit4.class)
+public class TestTikaAutoDetectExtractor {
+
+  /**
+   * Extracts metadata for a flat product that references {@code /test.txt}
+   * and checks that Tika produced at least one field, including an
+   * {@code X-Parsed-By} entry that is not the empty parser.
+   *
+   * @throws MetExtractionException if the extractor fails on the test file
+   */
+  @Test
+  public void test() throws MetExtractionException {
+    TikaAutoDetectExtractor tikaExtractor = new TikaAutoDetectExtractor();
+
+    Metadata emptyMetadata = new Metadata();
+    Reference ref = new Reference();
+    URL file = this.getClass().getResource("/test.txt");
+    // Fail with a clear message instead of a NullPointerException when the
+    // fixture is missing from the test classpath.
+    assertNotNull("Test resource /test.txt was not found on the classpath", file);
+    ref.setOrigReference(file.toString());
+    ref.setDataStoreReference(file.toString());
+
+    Product product = new Product();
+    product.getProductReferences().add(ref);
+    product.setProductStructure(Product.STRUCTURE_FLAT);
+
+    Metadata outputMetadata = tikaExtractor.doExtract(product, emptyMetadata);
+
+    assertNotNull(outputMetadata);
+    assertTrue(outputMetadata.getAllKeys().size() > 0);
+    assertTrue(outputMetadata.containsKey("X-Parsed-By"));
+    // Compare by value: the previous '==' compared object identity, so the
+    // assertion could never fail.
+    assertFalse("org.apache.tika.parser.EmptyParser".equals(
+        outputMetadata.getMetadata("X-Parsed-By")));
+  }
+}
+
+
+
