Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/geoinfo/GeographicInformationParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/geoinfo/GeographicInformationParser.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/geoinfo/GeographicInformationParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/geoinfo/GeographicInformationParser.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,391 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.parser.geoinfo; + +import org.apache.sis.internal.util.CheckedArrayList; +import org.apache.sis.internal.util.CheckedHashSet; +import org.apache.sis.metadata.iso.DefaultMetadata; +import org.apache.sis.metadata.iso.DefaultMetadataScope; +import org.apache.sis.metadata.iso.constraint.DefaultLegalConstraints; +import org.apache.sis.metadata.iso.extent.DefaultGeographicBoundingBox; +import org.apache.sis.metadata.iso.extent.DefaultGeographicDescription; +import org.apache.sis.metadata.iso.identification.DefaultDataIdentification; +import org.apache.sis.storage.DataStore; +import org.apache.sis.storage.DataStoreException; +import org.apache.sis.storage.DataStores; +import org.apache.sis.storage.UnsupportedStorageException; +import org.apache.sis.util.collection.CodeListSet; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.opengis.metadata.Identifier; +import org.opengis.metadata.citation.Citation; +import org.opengis.metadata.citation.CitationDate; +import org.opengis.metadata.citation.OnlineResource; +import org.opengis.metadata.citation.ResponsibleParty; +import org.opengis.metadata.constraint.Restriction; +import org.opengis.metadata.distribution.DigitalTransferOptions; +import org.opengis.metadata.distribution.Distribution; +import org.opengis.metadata.distribution.Distributor; +import org.opengis.metadata.distribution.Format; +import org.opengis.metadata.extent.Extent; +import org.opengis.metadata.extent.GeographicExtent; +import org.opengis.metadata.identification.Identification; +import org.opengis.metadata.identification.Keywords; +import org.opengis.metadata.identification.Progress; +import 
org.opengis.metadata.identification.TopicCategory; +import org.opengis.util.InternationalString; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.util.*; + + +public class GeographicInformationParser extends AbstractParser{ + + public static final String geoInfoType="text/iso19139+xml"; + private final Set<MediaType> SUPPORTED_TYPES = + Collections.singleton(MediaType.text("iso19139+xml")); + + + @Override + public Set<MediaType> getSupportedTypes(ParseContext parseContext) { + return SUPPORTED_TYPES; + } + + @Override + public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException { + metadata.set(Metadata.CONTENT_TYPE,geoInfoType); + DataStore dataStore= null; + DefaultMetadata defaultMetadata=null; + XHTMLContentHandler xhtmlContentHandler=new XHTMLContentHandler(contentHandler,metadata); + + try { + TemporaryResources tmp = new TemporaryResources(); + TikaInputStream tikaInputStream=TikaInputStream.get(inputStream,tmp); + File file= tikaInputStream.getFile(); + dataStore = DataStores.open(file); + defaultMetadata=new DefaultMetadata(dataStore.getMetadata()); + if(defaultMetadata!=null) + extract(xhtmlContentHandler, metadata, defaultMetadata); + + }catch (UnsupportedStorageException e) { + throw new TikaException("UnsupportedStorageException",e); + } + catch (DataStoreException e) { + throw new TikaException("DataStoreException",e); + } + } + + private void extract(XHTMLContentHandler xhtmlContentHandler,Metadata metadata, DefaultMetadata defaultMetadata) throws SAXException{ + try { + getMetaDataCharacterSet(metadata, defaultMetadata); + getMetaDataContact(metadata, defaultMetadata); + getMetaDataIdentificationInfo(metadata, defaultMetadata); + getMetaDataDistributionInfo(metadata, defaultMetadata); + 
getMetaDataDateInfo(metadata, defaultMetadata); + getMetaDataResourceScope(metadata, defaultMetadata); + getMetaDataParentMetaDataTitle(metadata, defaultMetadata); + getMetaDataIdetifierCode(metadata, defaultMetadata); + getMetaDataStandard(metadata, defaultMetadata); + extractContent(xhtmlContentHandler, defaultMetadata); + } + catch(Exception e){ + e.printStackTrace(); + } + } + + private void extractContent(XHTMLContentHandler xhtmlContentHandler, DefaultMetadata defaultMetadata) throws SAXException{ + xhtmlContentHandler.startDocument(); + xhtmlContentHandler.newline(); + + xhtmlContentHandler.newline(); + ArrayList<Identification> identifications= (ArrayList<Identification>) defaultMetadata.getIdentificationInfo(); + for(Identification i:identifications) { + xhtmlContentHandler.startElement("h1"); + xhtmlContentHandler.characters(i.getCitation().getTitle().toString()); + xhtmlContentHandler.endElement("h1"); + xhtmlContentHandler.newline(); + + ArrayList<ResponsibleParty> responsiblePartyArrayList = (ArrayList<ResponsibleParty>) i.getCitation().getCitedResponsibleParties(); + for (ResponsibleParty r : responsiblePartyArrayList) { + xhtmlContentHandler.startElement("h3"); + xhtmlContentHandler.newline(); + xhtmlContentHandler.characters("CitedResponsiblePartyRole " + r.getRole().toString()); + xhtmlContentHandler.characters("CitedResponsiblePartyName " + r.getIndividualName().toString()); + xhtmlContentHandler.endElement("h3"); + xhtmlContentHandler.newline(); + } + + xhtmlContentHandler.startElement("p"); + xhtmlContentHandler.newline(); + xhtmlContentHandler.characters("IdentificationInfoAbstract " + i.getAbstract().toString()); + xhtmlContentHandler.endElement("p"); + xhtmlContentHandler.newline(); + Collection<Extent> extentList=((DefaultDataIdentification) i).getExtents(); + for(Extent e:extentList){ + ArrayList<GeographicExtent> geoElements= (ArrayList<GeographicExtent>) e.getGeographicElements(); + for(GeographicExtent g:geoElements) { + + if (g 
instanceof DefaultGeographicBoundingBox) { + xhtmlContentHandler.startElement("tr"); + xhtmlContentHandler.startElement("td"); + xhtmlContentHandler.characters("GeographicElementWestBoundLatitude"); + xhtmlContentHandler.endElement("td"); + xhtmlContentHandler.startElement("td"); + xhtmlContentHandler.characters(String.valueOf(((DefaultGeographicBoundingBox) g).getWestBoundLongitude())); + xhtmlContentHandler.endElement("td"); + xhtmlContentHandler.endElement("tr"); + xhtmlContentHandler.startElement("tr"); + xhtmlContentHandler.startElement("td"); + xhtmlContentHandler.characters("GeographicElementEastBoundLatitude"); + xhtmlContentHandler.endElement("td"); + xhtmlContentHandler.startElement("td"); + xhtmlContentHandler.characters(String.valueOf(((DefaultGeographicBoundingBox) g).getEastBoundLongitude())); + xhtmlContentHandler.endElement("td"); + xhtmlContentHandler.endElement("tr"); + xhtmlContentHandler.startElement("tr"); + xhtmlContentHandler.startElement("td"); + xhtmlContentHandler.characters("GeographicElementNorthBoundLatitude"); + xhtmlContentHandler.endElement("td"); + xhtmlContentHandler.startElement("td"); + xhtmlContentHandler.characters(String.valueOf(((DefaultGeographicBoundingBox) g).getNorthBoundLatitude())); + xhtmlContentHandler.endElement("td"); + xhtmlContentHandler.endElement("tr"); + xhtmlContentHandler.startElement("tr"); + xhtmlContentHandler.startElement("td"); + xhtmlContentHandler.characters("GeographicElementSouthBoundLatitude"); + xhtmlContentHandler.endElement("td"); + xhtmlContentHandler.startElement("td"); + xhtmlContentHandler.characters(String.valueOf(((DefaultGeographicBoundingBox) g).getSouthBoundLatitude())); + xhtmlContentHandler.endElement("td"); + xhtmlContentHandler.endElement("tr"); + } + } + } + } + xhtmlContentHandler.newline(); + xhtmlContentHandler.endDocument(); + } + + private void getMetaDataCharacterSet(Metadata metadata, DefaultMetadata defaultMetaData){ + CheckedHashSet<Charset> charSetList= 
(CheckedHashSet<Charset>) defaultMetaData.getCharacterSets(); + for(Charset c:charSetList){ + metadata.add("CharacterSet",c.name()); + } + } + + + private void getMetaDataContact(Metadata metadata, DefaultMetadata defaultMetaData){ + CheckedArrayList<ResponsibleParty> contactSet= (CheckedArrayList<ResponsibleParty>) defaultMetaData.getContacts(); + for(ResponsibleParty rparty:contactSet){ + if(rparty.getRole()!=null) + metadata.add("ContactRole",rparty.getRole().name()); + if(rparty.getOrganisationName()!=null) + metadata.add("ContactPartyName-",rparty.getOrganisationName().toString()); + } + } + + private void getMetaDataIdentificationInfo(Metadata metadata, DefaultMetadata defaultMetaData){ + ArrayList<Identification> identifications= (ArrayList<Identification>) defaultMetaData.getIdentificationInfo(); + for(Identification i:identifications){ + DefaultDataIdentification defaultDataIdentification= (DefaultDataIdentification) i; + if(i.getCitation()!=null && i.getCitation().getTitle()!=null) + metadata.add("IdentificationInfoCitationTitle ",i.getCitation().getTitle().toString()); + + ArrayList<CitationDate> dateArrayList= (ArrayList<CitationDate>) i.getCitation().getDates(); + for (CitationDate d:dateArrayList){ + if(d.getDateType()!=null) + metadata.add("CitationDate ",d.getDateType().name()+"-->"+d.getDate()); + } + ArrayList<ResponsibleParty> responsiblePartyArrayList= (ArrayList<ResponsibleParty>) i.getCitation().getCitedResponsibleParties(); + for(ResponsibleParty r:responsiblePartyArrayList){ + if(r.getRole()!=null) + metadata.add("CitedResponsiblePartyRole ",r.getRole().toString()); + if(r.getIndividualName()!=null) + metadata.add("CitedResponsiblePartyName ",r.getIndividualName().toString()); + if(r.getOrganisationName()!=null) + metadata.add("CitedResponsiblePartyOrganizationName ", r.getOrganisationName().toString()); + if(r.getPositionName()!=null) + metadata.add("CitedResponsiblePartyPositionName ",r.getPositionName().toString()); + + 
if(r.getContactInfo()!=null){ + for(String s:r.getContactInfo().getAddress().getElectronicMailAddresses()) { + metadata.add("CitedResponsiblePartyEMail ",s.toString()); + } + } + } + if(i.getAbstract()!=null) + metadata.add("IdentificationInfoAbstract ",i.getAbstract().toString()); + for(Progress p:i.getStatus()) { + metadata.add("IdentificationInfoStatus ",p.name()); + } + ArrayList<Format> formatArrayList= (ArrayList<Format>) i.getResourceFormats(); + for(Format f:formatArrayList){ + if(f.getName()!=null) + metadata.add("ResourceFormatSpecificationAlternativeTitle ",f.getName().toString()); + } + CheckedHashSet<Locale> localeCheckedHashSet= (CheckedHashSet<Locale>) defaultDataIdentification.getLanguages(); + for(Locale l:localeCheckedHashSet){ + metadata.add("IdentificationInfoLanguage-->",l.getDisplayLanguage(Locale.ENGLISH)); + } + CodeListSet<TopicCategory> categoryList= (CodeListSet<TopicCategory>) defaultDataIdentification.getTopicCategories(); + for(TopicCategory t:categoryList){ + metadata.add("IdentificationInfoTopicCategory-->",t.name()); + } + ArrayList<Keywords> keywordList= (ArrayList<Keywords>) i.getDescriptiveKeywords(); + int j=1; + for(Keywords k:keywordList){ + j++; + ArrayList<InternationalString> stringList= (ArrayList<InternationalString>) k.getKeywords(); + for(InternationalString s:stringList){ + metadata.add("Keywords "+j ,s.toString()); + } + if(k.getType()!=null) + metadata.add("KeywordsType "+j,k.getType().name()); + if(k.getThesaurusName()!=null && k.getThesaurusName().getTitle()!=null) + metadata.add("ThesaurusNameTitle "+j,k.getThesaurusName().getTitle().toString()); + if(k.getThesaurusName()!=null && k.getThesaurusName().getAlternateTitles()!=null) + metadata.add("ThesaurusNameAlternativeTitle "+j,k.getThesaurusName().getAlternateTitles().toString()); + + ArrayList<CitationDate>citationDates= (ArrayList<CitationDate>) k.getThesaurusName().getDates(); + for(CitationDate cd:citationDates) { + if(cd.getDateType()!=null) + 
metadata.add("ThesaurusNameDate ",cd.getDateType().name() +"-->" + cd.getDate()); + } + } + ArrayList<DefaultLegalConstraints> constraintList= (ArrayList<DefaultLegalConstraints>) i.getResourceConstraints(); + + for(DefaultLegalConstraints c:constraintList){ + for(Restriction r:c.getAccessConstraints()){ + metadata.add("AccessContraints ",r.name()); + } + for(InternationalString s:c.getOtherConstraints()){ + metadata.add("OtherConstraints ",s.toString()); + } + for(Restriction r:c.getUseConstraints()) { + metadata.add("UserConstraints ",r.name()); + } + + } + Collection<Extent> extentList=((DefaultDataIdentification) i).getExtents(); + for(Extent e:extentList){ + ArrayList<GeographicExtent> geoElements= (ArrayList<GeographicExtent>) e.getGeographicElements(); + for(GeographicExtent g:geoElements){ + + if(g instanceof DefaultGeographicDescription){ + if(((DefaultGeographicDescription) g).getGeographicIdentifier()!=null && ((DefaultGeographicDescription) g).getGeographicIdentifier().getCode()!=null ) + metadata.add("GeographicIdentifierCode ",((DefaultGeographicDescription) g).getGeographicIdentifier().getCode().toString()); + if(((DefaultGeographicDescription) g).getGeographicIdentifier()!=null && ((DefaultGeographicDescription) g).getGeographicIdentifier().getAuthority()!=null && ((DefaultGeographicDescription) g).getGeographicIdentifier().getAuthority().getTitle()!=null ) + metadata.add("GeographicIdentifierAuthorityTitle ",((DefaultGeographicDescription) g).getGeographicIdentifier().getAuthority().getTitle().toString()); + + for(InternationalString s:((DefaultGeographicDescription) g).getGeographicIdentifier().getAuthority().getAlternateTitles()) { + metadata.add("GeographicIdentifierAuthorityAlternativeTitle ",s.toString()); + } + for(CitationDate cd:((DefaultGeographicDescription) g).getGeographicIdentifier().getAuthority().getDates()){ + if(cd.getDateType()!=null && cd.getDate()!=null) + metadata.add("GeographicIdentifierAuthorityDate 
",cd.getDateType().name()+" "+cd.getDate().toString()); + } + } + } + } + } + } + + private void getMetaDataDistributionInfo(Metadata metadata, DefaultMetadata defaultMetaData){ + Distribution distribution=defaultMetaData.getDistributionInfo(); + ArrayList<Format> distributionFormat= (ArrayList<Format>) distribution.getDistributionFormats(); + for(Format f:distributionFormat){ + if(f.getName()!=null) + metadata.add("DistributionFormatSpecificationAlternativeTitle ",f.getName().toString()); + } + ArrayList<Distributor> distributorList= (ArrayList<Distributor>) distribution.getDistributors(); + for(Distributor d:distributorList){ + if(d!=null && d.getDistributorContact()!=null && d.getDistributorContact().getRole()!=null) + metadata.add("Distributor Contact ",d.getDistributorContact().getRole().name()); + if(d!=null && d.getDistributorContact()!=null && d.getDistributorContact().getOrganisationName()!=null) + metadata.add("Distributor Organization Name ",d.getDistributorContact().getOrganisationName().toString()); + } + ArrayList<DigitalTransferOptions> transferOptionsList= (ArrayList<DigitalTransferOptions>) distribution.getTransferOptions(); + for(DigitalTransferOptions d:transferOptionsList){ + ArrayList<OnlineResource> onlineResourceList= (ArrayList<OnlineResource>) d.getOnLines(); + for(OnlineResource or:onlineResourceList){ + if(or.getLinkage()!=null) + metadata.add("TransferOptionsOnlineLinkage ",or.getLinkage().toString()); + if(or.getProtocol()!=null) + metadata.add("TransferOptionsOnlineProtocol ",or.getProtocol()); + if(or.getApplicationProfile()!=null) + metadata.add("TransferOptionsOnlineProfile ",or.getApplicationProfile()); + if(or.getName()!=null) + metadata.add("TransferOptionsOnlineName ",or.getName()); + if(or.getDescription()!=null) + metadata.add("TransferOptionsOnlineDescription ",or.getDescription().toString()); + if(or.getFunction()!=null) + metadata.add("TransferOptionsOnlineFunction ",or.getFunction().name()); + + } + } + } + + private void 
getMetaDataDateInfo(Metadata metadata, DefaultMetadata defaultMetaData){ + ArrayList<CitationDate> citationDateList= (ArrayList<CitationDate>) defaultMetaData.getDateInfo(); + for(CitationDate c:citationDateList){ + if(c.getDateType()!=null) + metadata.add("DateInfo ",c.getDateType().name()+" "+c.getDate()); + } + } + + private void getMetaDataResourceScope(Metadata metadata, DefaultMetadata defaultMetaData){ + ArrayList<DefaultMetadataScope> scopeList= (ArrayList<DefaultMetadataScope>) defaultMetaData.getMetadataScopes(); + for(DefaultMetadataScope d:scopeList){ + if(d.getResourceScope()!=null) + metadata.add("MetaDataResourceScope ",d.getResourceScope().name()); + } + } + + private void getMetaDataParentMetaDataTitle(Metadata metadata, DefaultMetadata defaultMetaData){ + Citation parentMetaData=defaultMetaData.getParentMetadata(); + if(parentMetaData!=null && parentMetaData.getTitle()!=null) + metadata.add("ParentMetaDataTitle",parentMetaData.getTitle().toString()); + } + + private void getMetaDataIdetifierCode(Metadata metadata, DefaultMetadata defaultMetaData){ + Identifier identifier= defaultMetaData.getMetadataIdentifier(); + if(identifier!=null) + metadata.add("MetaDataIdentifierCode",identifier.getCode()); + } + + private void getMetaDataStandard(Metadata metadata, DefaultMetadata defaultMetaData){ + ArrayList<Citation> citationList= (ArrayList<Citation>) defaultMetaData.getMetadataStandards(); + for(Citation c:citationList){ + if(c.getTitle()!=null) + metadata.add("MetaDataStandardTitle ",c.getTitle().toString()); + if(c.getEdition()!=null) + metadata.add("MetaDataStandardEdition ",c.getEdition().toString()); + } + } +}
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/grib/GribParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/grib/GribParser.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/grib/GribParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/grib/GribParser.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.parser.grib; + +import java.io.IOException; +import java.io.InputStream; +import java.io.File; +import java.util.Collections; +import java.util.Set; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import ucar.nc2.Attribute; +import ucar.nc2.Dimension; +import ucar.nc2.NetcdfFile; +import ucar.nc2.Variable; +import ucar.nc2.dataset.NetcdfDataset; + +public class GribParser extends AbstractParser { + + private static final long serialVersionUID = 7855458954474247655L; + + public static final String GRIB_MIME_TYPE = "application/x-grib2"; + + private final Set<MediaType> SUPPORTED_TYPES = + Collections.singleton(MediaType.application("x-grib2")); + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + public void parse(InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) throws IOException, + SAXException, TikaException { + + //Set MIME type as grib2 + metadata.set(Metadata.CONTENT_TYPE, GRIB_MIME_TYPE); + + TikaInputStream tis = TikaInputStream.get(stream, new TemporaryResources()); + File gribFile = tis.getFile(); + + try { + NetcdfFile ncFile = NetcdfDataset.openFile(gribFile.getAbsolutePath(), null); + + // first parse out the set of global attributes + for (Attribute attr : ncFile.getGlobalAttributes()) { + Property property = resolveMetadataKey(attr.getFullName()); + if (attr.getDataType().isString()) { + metadata.add(property, attr.getStringValue()); + } else if 
(attr.getDataType().isNumeric()) { + int value = attr.getNumericValue().intValue(); + metadata.add(property, String.valueOf(value)); + } + } + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + + xhtml.startDocument(); + + xhtml.newline(); + xhtml.startElement("ul"); + xhtml.characters("dimensions:"); + xhtml.newline(); + + for (Dimension dim : ncFile.getDimensions()){ + xhtml.element("li", dim.getFullName() + "=" + String.valueOf(dim.getLength()) + ";"); + xhtml.newline(); + } + + xhtml.startElement("ul"); + xhtml.characters("variables:"); + xhtml.newline(); + + for (Variable var : ncFile.getVariables()){ + xhtml.element("p", String.valueOf(var.getDataType()) + var.getNameAndDimensions() + ";"); + for(Attribute element : var.getAttributes()){ + xhtml.element("li", " :" + element + ";"); + xhtml.newline(); + } + } + xhtml.endElement("ul"); + xhtml.endElement("ul"); + xhtml.endDocument(); + + } catch (IOException e) { + throw new TikaException("NetCDF parse error", e); + } + } + + private Property resolveMetadataKey(String localName) { + if ("title".equals(localName)) { + return TikaCoreProperties.TITLE; + } + return Property.internalText(localName); + } + +} \ No newline at end of file Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/hdf/HDFParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/hdf/HDFParser.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/hdf/HDFParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/hdf/HDFParser.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or 
more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.hdf; + +//JDK imports +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.Set; + +import org.apache.commons.io.IOUtils; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.netcdf.NetCDFParser; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import ucar.nc2.Attribute; +import ucar.nc2.Group; +import ucar.nc2.NetcdfFile; + +/** + * + * Since the {@link NetCDFParser} depends on the <a + * href="http://www.unidata.ucar.edu/software/netcdf-java" >NetCDF-Java</a> API, + * we are able to use it to parse HDF files as well. See <a href= + * "http://www.unidata.ucar.edu/software/netcdf-java/formats/FileTypes.html" + * >this link</a> for more information. 
+ */ +public class HDFParser extends AbstractParser { + + /** Serial version UID */ + private static final long serialVersionUID = 1091208208003437549L; + + private static final Set<MediaType> SUPPORTED_TYPES = + Collections.singleton(MediaType.application("x-hdf")); + + /* + * (non-Javadoc) + * + * @see + * org.apache.tika.parser.netcdf.NetCDFParser#getSupportedTypes(org.apache + * .tika.parser.ParseContext) + */ + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + /* + * (non-Javadoc) + * + * @see + * org.apache.tika.parser.netcdf.NetCDFParser#parse(java.io.InputStream, + * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata, + * org.apache.tika.parser.ParseContext) + */ + public void parse(InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) throws IOException, + SAXException, TikaException { + ByteArrayOutputStream os = new ByteArrayOutputStream(); + IOUtils.copy(stream, os); + + String name = metadata.get(Metadata.RESOURCE_NAME_KEY); + if (name == null) { + name = ""; + } + try { + NetcdfFile ncFile = NetcdfFile.openInMemory(name, os.toByteArray()); + unravelStringMet(ncFile, null, metadata); + } catch (IOException e) { + throw new TikaException("HDF parse error", e); + } + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + xhtml.endDocument(); + } + + protected void unravelStringMet(NetcdfFile ncFile, Group group, Metadata met) { + if (group == null) { + group = ncFile.getRootGroup(); + } + + // get file type + met.set("File-Type-Description", ncFile.getFileTypeDescription()); + // unravel its string attrs + for (Attribute attribute : group.getAttributes()) { + if (attribute.isString()) { + met.add(attribute.getFullName(), attribute.getStringValue()); + } else { + // try and cast its value to a string + met.add(attribute.getFullName(), String.valueOf(attribute + .getNumericValue())); + } + } + + for (Group g : 
group.getGroups()) { + unravelStringMet(ncFile, g, met); + } + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/isatab/ISATabUtils.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/isatab/ISATabUtils.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/isatab/ISATabUtils.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/isatab/ISATabUtils.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,209 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.isatab; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Locale; +import java.util.Map; + +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.CSVParser; +import org.apache.commons.csv.CSVRecord; +import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.tika.config.ServiceLoader; +import org.apache.tika.detect.AutoDetectReader; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.SAXException; +
/**
 * Static helpers for reading the tab-delimited files of an ISA-Tab archive.
 * The investigation file is copied into a {@link Metadata} object, while study
 * and assay files are rendered as XHTML tables.
 */
public class ISATabUtils {

    private static final ServiceLoader LOADER = new ServiceLoader(ISATabUtils.class.getClassLoader());

    /**
     * INVESTIGATION
     */

    // Section headers of the investigation file whose rows are always copied to the metadata.
    private static final String[] sections = {
        "ONTOLOGY SOURCE REFERENCE",
        "INVESTIGATION",
        "INVESTIGATION PUBLICATIONS",
        "INVESTIGATION CONTACTS"
    };

    // Header row that opens the description of one study.
    private static final String studySectionField = "STUDY";

    // Field of the STUDY section that names the study file.
    private static final String studyFileNameField = "Study File Name";

    /**
     * Reads an investigation file and copies its fields into the metadata.
     * The character encoding is detected automatically and the given stream is
     * left open.
     *
     * @param stream        investigation file content
     * @param handler       XHTML handler (not written to by this method)
     * @param metadata      receives the extracted fields
     * @param context       parse context, consulted for a {@link ServiceLoader}
     * @param studyFileName name of the study file whose STUDY section should be
     *                      extracted as well, or {@code null} to extract only the
     *                      investigation-level sections
     */
    public static void parseInvestigation(InputStream stream, XHTMLContentHandler handler, Metadata metadata,
            ParseContext context, String studyFileName) throws IOException, TikaException, SAXException {
        // The shield keeps the caller's stream open when the reader is closed.
        try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream),
                metadata, context.get(ServiceLoader.class, LOADER))) {
            extractMetadata(reader, metadata, studyFileName);
        }
    }

    /**
     * Reads an investigation file and copies only its investigation-level
     * sections into the metadata.
     *
     * @see #parseInvestigation(InputStream, XHTMLContentHandler, Metadata, ParseContext, String)
     */
    public static void parseInvestigation(InputStream stream, XHTMLContentHandler handler, Metadata metadata,
            ParseContext context) throws IOException, TikaException, SAXException {
        parseInvestigation(stream, handler, metadata, context, null);
    }

    /**
     * Renders a study file as an XHTML table: the first row becomes the table
     * head, every following row a body row.
     *
     * @param stream   study file content; left open
     * @param xhtml    receives the table
     * @param metadata passed to the encoding detector
     * @param context  parse context, consulted for a {@link ServiceLoader}
     */
    public static void parseStudy(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata,
            ParseContext context) throws IOException, TikaException, SAXException {
        writeTable(stream, xhtml, metadata, context);
    }

    /**
     * Renders an assay file as an XHTML table: the first row becomes the table
     * head, every following row a body row.
     *
     * @param stream   assay file content; left open
     * @param xhtml    receives the table
     * @param metadata passed to the encoding detector
     * @param context  parse context, consulted for a {@link ServiceLoader}
     */
    public static void parseAssay(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata,
            ParseContext context) throws IOException, TikaException, SAXException {
        writeTable(stream, xhtml, metadata, context);
    }

    // Shared implementation of parseStudy/parseAssay: one tab-delimited file -> one <table>.
    private static void writeTable(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata,
            ParseContext context) throws IOException, TikaException, SAXException {
        TikaInputStream tis = TikaInputStream.get(stream);

        // Automatically detect the character encoding
        try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(tis),
                metadata, context.get(ServiceLoader.class, LOADER));
             CSVParser csvParser = new CSVParser(reader, CSVFormat.TDF)) {
            Iterator<CSVRecord> rows = csvParser.iterator();

            xhtml.startElement("table");

            xhtml.startElement("thead");
            if (rows.hasNext()) {
                writeCells(rows.next(), "th", xhtml);
            }
            xhtml.endElement("thead");

            xhtml.startElement("tbody");
            while (rows.hasNext()) {
                xhtml.startElement("tr");
                writeCells(rows.next(), "td", xhtml);
                xhtml.endElement("tr");
            }
            xhtml.endElement("tbody");

            xhtml.endElement("table");
        }
    }

    // Emits every column of the record as one element with the given name.
    private static void writeCells(CSVRecord record, String cellElement, XHTMLContentHandler xhtml)
            throws SAXException {
        for (int i = 0; i < record.size(); i++) {
            xhtml.startElement(cellElement);
            xhtml.characters(record.get(i));
            xhtml.endElement(cellElement);
        }
    }

    /**
     * Walks the investigation file row by row. A row with a single, all-uppercase
     * column is treated as a section header. Rows of the investigation-level
     * sections are always copied. Rows of a STUDY section are only copied when
     * its "Study File Name" equals {@code studyFileName}; in that case the rows
     * of the sections that follow are copied too, up to the next STUDY section.
     */
    private static void extractMetadata(Reader reader, Metadata metadata, String studyFileName) throws IOException {
        boolean investigationSection = false;
        boolean studySection = false;
        boolean studyTarget = false;

        // Fields of the STUDY section currently being read, kept until we know whether it is the target.
        Map<String, String> studyFields = new HashMap<String, String>();

        try (CSVParser csvParser = new CSVParser(reader, CSVFormat.TDF)) {
            for (CSVRecord record : csvParser) {
                String field = record.get(0);
                boolean sectionHeader = (record.size() == 1)
                        && field.toUpperCase(Locale.ENGLISH).equals(field);

                if (sectionHeader) {
                    investigationSection = Arrays.asList(sections).contains(field);
                    studySection = (studyFileName != null) && field.equals(studySectionField);
                    if (studySection && !studyTarget) {
                        // Do not let fields of a previous, non-matching study leak into this one.
                        studyFields.clear();
                    }
                } else if (investigationSection) {
                    addMetadata(field, record, metadata);
                } else if (studySection) {
                    if (studyTarget) {
                        // A second STUDY section starts: the target study is complete.
                        break;
                    }
                    if (record.size() <= 1) {
                        // Field name without a value; nothing to record.
                        continue;
                    }
                    String value = record.get(1);
                    studyFields.put(field, value);
                    studyTarget = field.equals(studyFileNameField) && value.equals(studyFileName);
                    if (studyTarget) {
                        mapStudyToMetadata(studyFields, metadata);
                        studySection = false;
                    }
                } else if (studyTarget) {
                    addMetadata(field, record, metadata);
                }
            }
        }
    }

    // Adds every value column (index 1 and up) of the record under the given field name.
    private static void addMetadata(String field, CSVRecord record, Metadata metadata) {
        if ((record == null) || (record.size() <= 1)) {
            return;
        }

        for (int i = 1; i < record.size(); i++) {
            metadata.add(field, record.get(i));
        }
    }

    // Copies the buffered STUDY fields into the metadata.
    private static void mapStudyToMetadata(Map<String, String> map, Metadata metadata) {
        for (Map.Entry<String, String> entry : map.entrySet()) {
            metadata.add(entry.getKey(), entry.getValue());
        }
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.isatab; + +import java.io.File; +import java.io.FilenameFilter; +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.Set; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +
/**
 * Parser for ISA-Tab archives. The stream handed to
 * {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)} is a study
 * file; the investigation file ({@code i_*.txt}) and the assay files it refers
 * to are looked up in the archive folder.
 */
public class ISArchiveParser implements Parser {

    /**
     * Serial version UID
     */
    private static final long serialVersionUID = 3640809327541300229L;

    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.singleton(MediaType.application("x-isatab"));

    // Metadata key under which the investigation file lists the assay files of a study.
    private static final String studyAssayFileNameField = "Study Assay File Name";

    // Archive folder (with trailing separator), or null to use the folder of the parsed study file.
    private final String location;

    /**
     * Default constructor. The archive folder is taken from the location of
     * each parsed study file.
     */
    public ISArchiveParser() {
        this(null);
    }

    /**
     * Constructor that accepts the pathname of ISArchive folder.
     *
     * @param location pathname of ISArchive folder including ISA-Tab files
     */
    public ISArchiveParser(String location) {
        if (location != null && !location.endsWith(File.separator)) {
            location += File.separator;
        }
        this.location = location;
    }

    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Parses a study file together with the investigation and assay files of
     * its archive. All per-document state is kept in locals so that one parser
     * instance can be reused for files in different folders.
     *
     * @param stream   the study file
     * @param handler  receives the XHTML output
     * @param metadata receives the investigation and study fields
     * @param context  parse context
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
            ParseContext context) throws IOException, SAXException, TikaException {

        TikaInputStream tis = TikaInputStream.get(stream);
        File studyFile = tis.getFile();

        String baseDir = this.location;
        if (baseDir == null) {
            // Resolve against the absolute path so a bare file name still has a parent folder.
            baseDir = studyFile.getAbsoluteFile().getParent() + File.separator;
        }
        String studyFileName = studyFile.getName();

        String[] investigationList = new File(baseDir).list(new FilenameFilter() {

            @Override
            public boolean accept(File dir, String name) {
                return name.matches("i_.+\\.txt");
            }
        });

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        parseInvestigation(baseDir, investigationList, studyFileName, xhtml, metadata, context);
        // Read from tis rather than the raw stream: getFile() may already have drained the latter.
        parseStudy(tis, xhtml, metadata, context);
        parseAssay(baseDir, xhtml, metadata, context);

        xhtml.endDocument();
    }

    // Loads the single investigation file of the archive into the metadata and writes its heading.
    private void parseInvestigation(String baseDir, String[] investigationList, String studyFileName,
            XHTMLContentHandler xhtml, Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        if ((investigationList == null) || (investigationList.length != 1)) {
            // TODO warning: no investigation file, or more than one candidate
            return;
        }

        String investigation = investigationList[0]; // TODO add to metadata?
        try (InputStream stream = TikaInputStream.get(new File(baseDir + investigation))) {
            ISATabUtils.parseInvestigation(stream, xhtml, metadata, context, studyFileName);
        }

        xhtml.element("h1", "INVESTIGATION " + metadata.get("Investigation Identifier"));
    }

    // Writes the study heading followed by the study table.
    private void parseStudy(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata,
            ParseContext context) throws IOException, SAXException, TikaException {
        xhtml.element("h2", "STUDY " + metadata.get("Study Identifier"));

        ISATabUtils.parseStudy(stream, xhtml, metadata, context);
    }

    // Writes one <div> with heading and table per assay file listed in the metadata.
    private void parseAssay(String baseDir, XHTMLContentHandler xhtml, Metadata metadata,
            ParseContext context) throws IOException, SAXException, TikaException {
        for (String assayFileName : metadata.getValues(studyAssayFileNameField)) {
            xhtml.startElement("div");
            xhtml.element("h3", "ASSAY " + assayFileName);
            // NOTE(review): assayFileName comes from the investigation file and is not checked
            // against "../" segments, so it can point outside the archive folder.
            try (InputStream stream = TikaInputStream.get(new File(baseDir + assayFileName))) {
                ISATabUtils.parseAssay(stream, xhtml, metadata, context);
            }
            xhtml.endElement("div");
        }
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/mat/MatParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/mat/MatParser.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/mat/MatParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/mat/MatParser.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.mat; + +//JDK imports +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.Set; +import java.util.Map; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.mime.MediaType; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +//JMatIO imports +import com.jmatio.io.MatFileHeader; +import com.jmatio.io.MatFileReader; +import com.jmatio.types.MLArray; +import com.jmatio.types.MLStructure; + +import static java.nio.charset.StandardCharsets.UTF_8; + + +public class MatParser extends AbstractParser { + + public static final String MATLAB_MIME_TYPE = + "application/x-matlab-data"; + + private final Set<MediaType> SUPPORTED_TYPES = + Collections.singleton(MediaType.application("x-matlab-data")); + + public Set<MediaType> getSupportedTypes(ParseContext context){ + return SUPPORTED_TYPES; + } + + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + + //Set MIME type as Matlab + metadata.set(Metadata.CONTENT_TYPE, MATLAB_MIME_TYPE); + + try { + 
// Use TIS so we can spool a temp file for parsing. + TikaInputStream tis = TikaInputStream.get(stream); + + //Extract information from header file + MatFileReader mfr = new MatFileReader(tis.getFile()); //input .mat file + MatFileHeader hdr = mfr.getMatFileHeader(); //.mat header information + + // Example header: "MATLAB 5.0 MAT-file, Platform: MACI64, Created on: Sun Mar 2 23:41:57 2014" + String[] parts = hdr.getDescription().split(","); // Break header information into its parts + + if (parts[2].contains("Created")) { + int lastIndex1 = parts[2].lastIndexOf("Created on:"); + String dateCreated = parts[2].substring(lastIndex1 + "Created on:".length()).trim(); + metadata.set("createdOn", dateCreated); + } + + if (parts[1].contains("Platform")) { + int lastIndex2 = parts[1].lastIndexOf("Platform:"); + String platform = parts[1].substring(lastIndex2 + "Platform:".length()).trim(); + metadata.set("platform" , platform); + } + + if (parts[0].contains("MATLAB")) { + metadata.set("fileType", parts[0]); + } + + // Get endian indicator from header file + String endianBytes = new String(hdr.getEndianIndicator(), UTF_8); // Retrieve endian bytes and convert to string + String endianCode = String.valueOf(endianBytes.toCharArray()); // Convert bytes to characters to string + metadata.set("endian", endianCode); + + //Text output + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + xhtml.newline(); + //Loop through each variable + for (Map.Entry<String, MLArray> entry : mfr.getContent().entrySet()) { + String varName = entry.getKey(); + MLArray varData = entry.getValue(); + + xhtml.element("p", varName + ":" + String.valueOf(varData)); + + // If the variable is a structure, extract variable info from structure + if (varData.isStruct()){ + MLStructure mlStructure = (MLStructure) mfr.getMLArray(varName); + xhtml.startElement("ul"); + xhtml.newline(); + for (MLArray element : mlStructure.getAllFields()){ + xhtml.startElement("li"); 
+ xhtml.characters(String.valueOf(element)); + + // If there is an embedded structure, extract variable info. + if (element.isStruct()){ + xhtml.startElement("ul"); + // Should this actually be a recursive call? + xhtml.element("li", element.contentToString()); + xhtml.endElement("ul"); + } + + xhtml.endElement("li"); + } + xhtml.endElement("ul"); + } + } + xhtml.endDocument(); + } catch (IOException e) { + throw new TikaException("Error parsing Matlab file with MatParser", e); + } + } +} \ No newline at end of file Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.netcdf; + +//JDK imports + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.Set; +import java.util.List; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import ucar.nc2.Attribute; +import ucar.nc2.NetcdfFile; +import ucar.nc2.Variable; +import ucar.nc2.Dimension; + +/** + * A {@link Parser} for <a + * href="http://www.unidata.ucar.edu/software/netcdf/index.html">NetCDF</a> + * files using the UCAR, MIT-licensed <a + * href="http://www.unidata.ucar.edu/software/netcdf-java/">NetCDF for Java</a> + * API. 
+ */ +public class NetCDFParser extends AbstractParser { + + /** + * Serial version UID + */ + private static final long serialVersionUID = -5940938274907708665L; + + private final Set<MediaType> SUPPORTED_TYPES = + Collections.singleton(MediaType.application("x-netcdf")); + + /* + * (non-Javadoc) + * + * @see + * org.apache.tika.parser.Parser#getSupportedTypes(org.apache.tika.parser + * .ParseContext) + */ + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + /* + * (non-Javadoc) + * + * @see org.apache.tika.parser.Parser#parse(java.io.InputStream, + * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata, + * org.apache.tika.parser.ParseContext) + */ + public void parse(InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) throws IOException, + SAXException, TikaException { + + TikaInputStream tis = TikaInputStream.get(stream, new TemporaryResources()); + try { + NetcdfFile ncFile = NetcdfFile.open(tis.getFile().getAbsolutePath()); + metadata.set("File-Type-Description", ncFile.getFileTypeDescription()); + // first parse out the set of global attributes + for (Attribute attr : ncFile.getGlobalAttributes()) { + Property property = resolveMetadataKey(attr.getFullName()); + if (attr.getDataType().isString()) { + metadata.add(property, attr.getStringValue()); + } else if (attr.getDataType().isNumeric()) { + int value = attr.getNumericValue().intValue(); + metadata.add(property, String.valueOf(value)); + } + } + + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + xhtml.newline(); + xhtml.element("h1", "dimensions"); + xhtml.startElement("ul"); + xhtml.newline(); + for (Dimension dim : ncFile.getDimensions()) { + xhtml.element("li", dim.getFullName() + " = " + dim.getLength()); + } + xhtml.endElement("ul"); + + xhtml.element("h1", "variables"); + xhtml.startElement("ul"); + xhtml.newline(); + for (Variable var : 
ncFile.getVariables()) { + xhtml.startElement("li"); + xhtml.characters(var.getDataType() + " " + var.getNameAndDimensions()); + xhtml.newline(); + List<Attribute> attributes = var.getAttributes(); + if (!attributes.isEmpty()) { + xhtml.startElement("ul"); + for (Attribute element : attributes) { + xhtml.element("li", element.toString()); + } + xhtml.endElement("ul"); + } + xhtml.endElement("li"); + } + xhtml.endElement("ul"); + + xhtml.endDocument(); + + } catch (IOException e) { + throw new TikaException("NetCDF parse error", e); + } + } + + private Property resolveMetadataKey(String localName) { + if ("title".equals(localName)) { + return TikaCoreProperties.TITLE; + } + return Property.internalText(localName); + } +} \ No newline at end of file Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/pot/PooledTimeSeriesParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/pot/PooledTimeSeriesParser.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/pot/PooledTimeSeriesParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/pot/PooledTimeSeriesParser.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,232 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.pot; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.external.ExternalParser; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.BufferedReader; +import java.util.logging.Logger; +import org.apache.commons.exec.CommandLine; +import org.apache.commons.exec.DefaultExecutor; +import org.apache.commons.exec.ExecuteWatchdog; +import org.apache.commons.exec.PumpStreamHandler; +import org.apache.commons.exec.environment.EnvironmentUtils; +import org.xml.sax.helpers.AttributesImpl; + +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +import static java.nio.charset.StandardCharsets.UTF_8; + +
/**
 * Parser that runs the external {@code pooled-time-series} program on a video
 * and exposes the two matrices it writes (optical flow and oriented gradient
 * histograms) as XHTML tables plus size metadata.
 */
public class PooledTimeSeriesParser extends AbstractParser {

    private static final long serialVersionUID = -2855917932512164988L;
    private static final Set<MediaType> SUPPORTED_TYPES = Collections
            .unmodifiableSet(new HashSet<MediaType>(Arrays.asList(new MediaType[] {
                MediaType.video("avi"), MediaType.video("mp4")
                // TODO: Add all supported video types
            })));

    private static final Logger LOG = Logger.getLogger(PooledTimeSeriesParser.class.getName());

    /**
     * Checks whether the {@code pooled-time-series} executable can be run.
     *
     * @return the result of {@link ExternalParser#check(String[], int...)} for
     *         {@code pooled-time-series --help}
     */
    public boolean isAvailable() {
        return ExternalParser.check(
                new String[] { "pooled-time-series", "--help" }, -1);
    }

    /**
     * Returns the set of media types supported by this parser when used with the
     * given parse context.
     *
     * @param context
     *          parse context
     * @return immutable set of media types
     * @since Apache Tika 0.7
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Parses a document stream into a sequence of XHTML SAX events. Fills in
     * related document metadata in the given metadata object.
     * <p>
     * The given document stream is consumed but not closed by this method. The
     * responsibility to close the stream remains on the caller.
     * <p>
     * If the external program is not installed a warning is logged and nothing
     * is written.
     *
     * @param stream
     *          the document stream (input)
     * @param handler
     *          handler for the XHTML SAX events (output)
     * @param metadata
     *          document metadata (input and output)
     * @param context
     *          parse context
     * @throws IOException
     *           if the document stream could not be read
     * @throws SAXException
     *           if the SAX events could not be processed
     * @throws TikaException
     *           if the document could not be parsed
     * @since Apache Tika 0.5
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context) throws IOException,
            SAXException, TikaException {

        if (!isAvailable()) {
            LOG.warning(
                    "PooledTimeSeries not installed!");
            return;
        }

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);

        TemporaryResources tmp = new TemporaryResources();
        try {
            TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
            File input = tikaStream.getFile();
            computePoT(input);

            // One reader per result file, shared by the header and the table pass: a second
            // reader on the same stream would miss whatever the first one had buffered.
            // NOTE(review): the generated *.of.txt / *.hog.txt files are left on disk, as before.
            try (BufferedReader ofReader = openResult(input, ".of.txt");
                 BufferedReader ogReader = openResult(input, ".hog.txt")) {
                extractHeaderOutput(ofReader, metadata, "of");
                extractHeaderOutput(ogReader, metadata, "og");
                xhtml.startDocument();
                doExtract(ofReader, xhtml, "Histogram of Optical Flows (HOF)",
                        metadata.get("of_frames"), metadata.get("of_vecSize"));
                doExtract(ogReader, xhtml, "Histogram of Oriented Gradients (HOG)",
                        metadata.get("og_frames"), metadata.get("og_vecSize"));
                xhtml.endDocument();
            }
        } finally {
            tmp.dispose();
        }
    }

    // Opens the result file the external program wrote next to the input, named input path + suffix.
    private static BufferedReader openResult(File input, String suffix) throws IOException {
        File result = new File(input.getAbsoluteFile() + suffix);
        return new BufferedReader(new InputStreamReader(new FileInputStream(result), UTF_8));
    }

    /**
     * Runs {@code pooled-time-series -f <input>} with a 60 second watchdog and
     * an expected exit value of 0.
     *
     * @param input the video file to analyse
     * @return what the program wrote to its output stream, decoded as UTF-8
     */
    private String computePoT(File input)
            throws IOException, TikaException {

        CommandLine cmdLine = new CommandLine("pooled-time-series");
        ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
        cmdLine.addArgument("-f");
        cmdLine.addArgument(input.getAbsolutePath());
        LOG.fine("Executing: " + cmdLine);
        DefaultExecutor exec = new DefaultExecutor();
        exec.setExitValue(0);
        ExecuteWatchdog watchdog = new ExecuteWatchdog(60000);
        exec.setWatchdog(watchdog);
        PumpStreamHandler streamHandler = new PumpStreamHandler(outputStream);
        exec.setStreamHandler(streamHandler);
        exec.execute(cmdLine, EnvironmentUtils.getProcEnvironment());
        return outputStream.toString("UTF-8");
    }

    /**
     * Writes the remaining lines of a result file as an XHTML table, one row
     * per line and one cell per space-separated value.
     *
     * @param reader
     *          reader positioned after the header line of the result file
     * @param xhtml
     *          XHTML content handler
     * @param tableTitle
     *          The name of the matrix/table to display.
     * @param frames
     *          Number of frames read from the video.
     * @param vecSize
     *          Size of the OF or HOG vector.
     * @throws SAXException
     *           if the XHTML SAX events could not be handled
     * @throws IOException
     *           if an input error occurred
     */
    private void doExtract(BufferedReader reader, XHTMLContentHandler xhtml,
            String tableTitle, String frames, String vecSize) throws SAXException,
            IOException {
        AttributesImpl attributes = new AttributesImpl();
        attributes.addAttribute("", "", "rows", "CDATA", frames);
        attributes.addAttribute("", "", "cols", "CDATA", vecSize);

        xhtml.startElement("h3");
        xhtml.characters(tableTitle);
        xhtml.endElement("h3");
        xhtml.startElement("table", attributes);
        String line;
        while ((line = reader.readLine()) != null) {
            xhtml.startElement("tr");
            for (String val : line.split(" ")) {
                xhtml.startElement("td");
                xhtml.characters(val);
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("table");
    }

    /**
     * Reads the first line of a result file, expected to be
     * {@code "<frames> <vecSize>"}, and stores both values under
     * {@code <prefix>_frames} and {@code <prefix>_vecSize}.
     *
     * @throws TikaException if the header line is missing or has fewer than two values
     */
    private void extractHeaderOutput(BufferedReader reader, Metadata metadata,
            String prefix) throws IOException, TikaException {
        String line = reader.readLine();
        if (line == null) {
            throw new TikaException("PooledTimeSeries output is empty");
        }
        String[] firstLine = line.split(" ");
        if (firstLine.length < 2) {
            throw new TikaException("Unexpected PooledTimeSeries header line: " + line);
        }
        String frames = firstLine[0];
        String vecSize = firstLine[1];

        if (prefix == null) {
            prefix = "";
        }
        metadata.add(prefix + "_frames", frames);
        metadata.add(prefix + "_vecSize", vecSize);
    }

}
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Sat Jan 16 18:23:01 2016 @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +#org.apache.tika.parser.ctakes.CTAKESParser +org.apache.tika.parser.dif.DIFParser +org.apache.tika.parser.gdal.GDALParser +org.apache.tika.parser.geo.topic.GeoParser +org.apache.tika.parser.geoinfo.GeographicInformationParser +org.apache.tika.parser.grib.GribParser +org.apache.tika.parser.hdf.HDFParser +org.apache.tika.parser.isatab.ISArchiveParser +org.apache.tika.parser.mat.MatParser +org.apache.tika.parser.netcdf.NetCDFParser +org.apache.tika.parser.pot.PooledTimeSeriesParser +#org.apache.tika.parser.envi.EnviHeaderParser Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/test/java/org/apache/tika/parser/dif/DIFParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/test/java/org/apache/tika/parser/dif/DIFParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/test/java/org/apache/tika/parser/dif/DIFParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/test/java/org/apache/tika/parser/dif/DIFParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,54 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.dif; + +import org.apache.tika.TikaTest; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +import java.io.InputStream; + +import static org.junit.Assert.assertEquals; + +public class DIFParserTest extends TikaTest { + + @Test + public void testDifMetadata() throws Exception { + Parser parser = new DIFParser(); + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = DIFParser.class.getResourceAsStream( + "/test-documents/Zamora2010.dif")) { + parser.parse(stream, handler, metadata, new ParseContext()); + } + + assertEquals(metadata.get("DIF-Entry_ID"),"00794186-48f9-11e3-9dcb-00c0f03d5b7c"); + assertEquals(metadata.get("DIF-Metadata_Name"),"ACADIS IDN DIF"); + + String content = handler.toString(); + assertContains("Title: Zamora 2010 Using Sediment Geochemistry", content); + assertContains("Southernmost_Latitude : 78.833", content); + assertContains("Northernmost_Latitude : 79.016", content); + assertContains("Westernmost_Longitude : 11.64", content); + assertContains("Easternmost_Longitude : 13.34", content); + } +} Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/test/java/org/apache/tika/parser/envi/EnviHeaderParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/test/java/org/apache/tika/parser/envi/EnviHeaderParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/test/java/org/apache/tika/parser/envi/EnviHeaderParserTest.java (added) +++ 
tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/test/java/org/apache/tika/parser/envi/EnviHeaderParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.envi; + +import static org.apache.tika.TikaTest.assertContains; +import static org.junit.Assert.assertNotNull; + +import java.io.InputStream; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.ToXMLContentHandler; +import org.junit.Test; + +/** + * Test cases to exercise the {@link EnviHeaderParser}. 
+ */ +public class EnviHeaderParserTest { + @Test + public void testParseGlobalMetadata() throws Exception { + if (System.getProperty("java.version").startsWith("1.5")) { + return; + } + + Parser parser = new EnviHeaderParser(); + ToXMLContentHandler handler = new ToXMLContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = EnviHeaderParser.class.getResourceAsStream( + "/test-documents/envi_test_header.hdr")) { + assertNotNull("Test ENVI file not found", stream); + parser.parse(stream, handler, metadata, new ParseContext()); + } + + // Check content of test file + String content = handler.toString(); + assertContains("<body><p>ENVI</p>", content); + assertContains("<p>samples = 2400</p>", content); + assertContains("<p>lines = 2400</p>", content); + assertContains("<p>map info = {Sinusoidal, 1.5000, 1.5000, -10007091.3643, 5559289.2856, 4.6331271653e+02, 4.6331271653e+02, , units=Meters}</p>", content); + assertContains("content=\"application/envi.hdr\"", content); + assertContains("projection info = {16, 6371007.2, 0.000000, 0.0, 0.0, Sinusoidal, units=Meters}", content); + } +} Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/test/java/org/apache/tika/parser/gdal/TestGDALParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/test/java/org/apache/tika/parser/gdal/TestGDALParser.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/test/java/org/apache/tika/parser/gdal/TestGDALParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/test/java/org/apache/tika/parser/gdal/TestGDALParser.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,181 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.gdal; + +//JDK imports + +import java.io.IOException; +import java.io.InputStream; + + +//Tika imports +import org.apache.tika.TikaTest; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.external.ExternalParser; +import org.apache.tika.sax.BodyContentHandler; + +//Junit imports +import org.junit.Test; +import org.xml.sax.SAXException; + +import static org.junit.Assert.fail; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assume.assumeTrue; + +/** + * Test harness for the GDAL parser. + */ +public class TestGDALParser extends TikaTest { + + private boolean canRun() { + String[] checkCmd = {"gdalinfo"}; + // If GDAL is not on the path, do not run the test. 
+ return ExternalParser.check(checkCmd); + } + + @Test + public void testParseBasicInfo() { + assumeTrue(canRun()); + final String expectedDriver = "netCDF/Network Common Data Format"; + final String expectedUpperRight = "512.0, 0.0"; + final String expectedUpperLeft = "0.0, 0.0"; + final String expectedLowerLeft = "0.0, 512.0"; + final String expectedLowerRight = "512.0, 512.0"; + final String expectedCoordinateSystem = "`'"; + final String expectedSize = "512, 512"; + + GDALParser parser = new GDALParser(); + InputStream stream = TestGDALParser.class + .getResourceAsStream("/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc"); + Metadata met = new Metadata(); + BodyContentHandler handler = new BodyContentHandler(); + try { + parser.parse(stream, handler, met, new ParseContext()); + } catch (Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + + assertNotNull(met); + assertNotNull(met.get("Driver")); + assertEquals(expectedDriver, met.get("Driver")); + assumeTrue(met.get("Files") != null); + assertNotNull(met.get("Coordinate System")); + assertEquals(expectedCoordinateSystem, met.get("Coordinate System")); + assertNotNull(met.get("Size")); + assertEquals(expectedSize, met.get("Size")); + assertNotNull(met.get("Upper Right")); + assertEquals(expectedUpperRight, met.get("Upper Right")); + assertNotNull(met.get("Upper Left")); + assertEquals(expectedUpperLeft, met.get("Upper Left")); + assertNotNull(met.get("Upper Right")); + assertEquals(expectedLowerRight, met.get("Lower Right")); + assertNotNull(met.get("Upper Right")); + assertEquals(expectedLowerLeft, met.get("Lower Left")); + + } + + @Test + public void testParseMetadata() { + assumeTrue(canRun()); + final String expectedNcInst = "NCAR (National Center for Atmospheric Research, Boulder, CO, USA)"; + final String expectedModelNameEnglish = "NCAR CCSM"; + final String expectedProgramId = "Source file unknown Version unknown Date unknown"; + final String expectedProjectId = "IPCC Fourth 
Assessment"; + final String expectedRealization = "1"; + final String expectedTitle = "model output prepared for IPCC AR4"; + final String expectedSub8Name = "\":ua"; + final String expectedSub8Desc = "[1x17x128x256] eastward_wind (32-bit floating-point)"; + + GDALParser parser = new GDALParser(); + InputStream stream = TestGDALParser.class + .getResourceAsStream("/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc"); + Metadata met = new Metadata(); + BodyContentHandler handler = new BodyContentHandler(); + try { + parser.parse(stream, handler, met, new ParseContext()); + assertNotNull(met); + assertNotNull(met.get("NC_GLOBAL#institution")); + assertEquals(expectedNcInst, met.get("NC_GLOBAL#institution")); + assertNotNull(met.get("NC_GLOBAL#model_name_english")); + assertEquals(expectedModelNameEnglish, + met.get("NC_GLOBAL#model_name_english")); + assertNotNull(met.get("NC_GLOBAL#prg_ID")); + assertEquals(expectedProgramId, met.get("NC_GLOBAL#prg_ID")); + assertNotNull(met.get("NC_GLOBAL#prg_ID")); + assertEquals(expectedProgramId, met.get("NC_GLOBAL#prg_ID")); + assertNotNull(met.get("NC_GLOBAL#project_id")); + assertEquals(expectedProjectId, met.get("NC_GLOBAL#project_id")); + assertNotNull(met.get("NC_GLOBAL#realization")); + assertEquals(expectedRealization, met.get("NC_GLOBAL#realization")); + assertNotNull(met.get("NC_GLOBAL#title")); + assertEquals(expectedTitle, met.get("NC_GLOBAL#title")); + assertNotNull(met.get("SUBDATASET_8_NAME")); + assertTrue(met.get("SUBDATASET_8_NAME").endsWith(expectedSub8Name)); + assertNotNull(met.get("SUBDATASET_8_DESC")); + assertEquals(expectedSub8Desc, met.get("SUBDATASET_8_DESC")); + } catch (Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + } + + @Test + public void testParseFITS() { + String fitsFilename = "/test-documents/WFPC2u5780205r_c0fx.fits"; + + assumeTrue(canRun()); + // If the exit code is 1 (meaning FITS isn't supported by the installed version of gdalinfo, don't run this test. 
+ String[] fitsCommand = {"gdalinfo", TestGDALParser.class.getResource(fitsFilename).getPath()}; + assumeTrue(ExternalParser.check(fitsCommand, 1)); + + String expectedAllgMin = "-7.319537E1"; + String expectedAtodcorr = "COMPLETE"; + String expectedAtodfile = "uref$dbu1405iu.r1h"; + String expectedCalVersion = " "; + String expectedCalibDef = "1466"; + + GDALParser parser = new GDALParser(); + InputStream stream = TestGDALParser.class + .getResourceAsStream(fitsFilename); + Metadata met = new Metadata(); + BodyContentHandler handler = new BodyContentHandler(); + try { + parser.parse(stream, handler, met, new ParseContext()); + assertNotNull(met); + assertNotNull(met.get("ALLG-MIN")); + assertEquals(expectedAllgMin, met.get("ALLG-MIN")); + assertNotNull(met.get("ATODCORR")); + assertEquals(expectedAtodcorr, met.get("ATODCORR")); + assertNotNull(met.get("ATODFILE")); + assertEquals(expectedAtodfile, met.get("ATODFILE")); + assertNotNull(met.get("CAL_VER")); + assertEquals(expectedCalVersion, met.get("CAL_VER")); + assertNotNull(met.get("CALIBDEF")); + assertEquals(expectedCalibDef, met.get("CALIBDEF")); + + } catch (Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + } +}
