http://git-wip-us.apache.org/repos/asf/marmotta/blob/00c22e7c/commons/marmotta-sesame-tools/marmotta-rio-rss/src/main/java/org/apache/marmotta/commons/sesame/rio/rss/FeedParserBase.java ---------------------------------------------------------------------- diff --git a/commons/marmotta-sesame-tools/marmotta-rio-rss/src/main/java/org/apache/marmotta/commons/sesame/rio/rss/FeedParserBase.java b/commons/marmotta-sesame-tools/marmotta-rio-rss/src/main/java/org/apache/marmotta/commons/sesame/rio/rss/FeedParserBase.java new file mode 100644 index 0000000..10364ef --- /dev/null +++ b/commons/marmotta-sesame-tools/marmotta-rio-rss/src/main/java/org/apache/marmotta/commons/sesame/rio/rss/FeedParserBase.java @@ -0,0 +1,317 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/
package org.apache.marmotta.commons.sesame.rio.rss;

import com.sun.syndication.feed.module.DCModule;
import com.sun.syndication.feed.module.DCSubject;
import org.openrdf.model.Literal;
import org.openrdf.model.Resource;
import org.openrdf.model.URI;
import org.openrdf.model.ValueFactory;
import org.openrdf.model.impl.ValueFactoryImpl;
import org.openrdf.rio.RDFHandlerException;
import org.openrdf.rio.RDFParseException;
import org.openrdf.rio.helpers.RDFParserBase;
import org.rometools.feed.module.content.ContentModule;
import org.rometools.feed.module.georss.GeoRSSModule;
import org.rometools.feed.module.mediarss.MediaEntryModule;
import org.rometools.feed.module.mediarss.types.MediaContent;
import org.rometools.feed.module.mediarss.types.Metadata;
import org.rometools.feed.module.mediarss.types.UrlReference;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.xml.datatype.DatatypeConfigurationException;
import javax.xml.datatype.DatatypeFactory;
import javax.xml.datatype.XMLGregorianCalendar;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.TimeZone;

/**
 * Common functionality for RSS and Atom feed parsing.
 * <p/>
 * Provides the namespace constants used for the generated RDF, translators for the
 * ROME feed modules (Dublin Core, content, GeoRSS, MediaRSS) and small helpers that
 * emit a single statement to the configured {@code rdfHandler}.
 * <p/>
 * Author: Sebastian Schaffert
 */
public abstract class FeedParserBase extends RDFParserBase {
    private static final Logger log = LoggerFactory.getLogger(FeedParserBase.class);


    protected static final String NS_RDF                 = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
    protected static final String NS_DC                  = "http://purl.org/dc/elements/1.1/";
    protected static final String NS_DC_TERMS            = "http://purl.org/dc/terms/";
    protected static final String NS_SIOC                = "http://rdfs.org/sioc/ns#";
    protected static final String NS_SKOS                = "http://www.w3.org/2004/02/skos/core#";
    protected static final String NS_RSS                 = "http://purl.org/rss/1.0/";
    protected static final String NS_RSS_CONTENT         = "http://purl.org/rss/1.0/modules/content/";
    protected static final String NS_RSS_SY              = "http://purl.org/rss/1.0/modules/syndication/";
    protected static final String NS_ADMIN               = "http://webns.net/mvcb/";
    protected static final String NS_FOAF                = "http://xmlns.com/foaf/0.1/";
    protected static final String NS_GEO                 = "http://www.w3.org/2003/01/geo/wgs84_pos#";
    protected static final String NS_MA                  = "http://www.w3.org/ns/ma-ont#";

    /** XML Schema namespace, used for the datatypes of the numeric literals created below. */
    private static final String NS_XSD                   = "http://www.w3.org/2001/XMLSchema#";


    protected ValueFactory valueFactory;


    /**
     * Creates a new parser base that will use a {@link org.openrdf.model.impl.ValueFactoryImpl} to
     * create RDF model objects.
     */
    protected FeedParserBase() {
        this(new ValueFactoryImpl());
    }

    /**
     * Creates a new parser base that will use the supplied ValueFactory to
     * create RDF model objects.
     *
     * @param valueFactory A ValueFactory.
     */
    protected FeedParserBase(ValueFactory valueFactory) {
        super(valueFactory);
        this.valueFactory = valueFactory;
    }

    /**
     * Replaces the value factory both in the superclass and in the local {@link #valueFactory} field,
     * so the two references never diverge.
     *
     * @param valueFactory the ValueFactory to use from now on
     */
    @Override
    public void setValueFactory(ValueFactory valueFactory) {
        super.setValueFactory(valueFactory);
        this.valueFactory = valueFactory;
    }


    /**
     * Emits one statement per value found in the Dublin Core module. All properties are created in the
     * dcterms namespace; relations and sources are emitted as URIs, dates as calendar literals, subjects
     * via {@link #parseDCSubject(Resource, DCSubject)} and everything else as plain string literals.
     *
     * @param resource the subject the Dublin Core metadata is attached to
     * @param dcModule the ROME Dublin Core module to translate
     * @throws RDFHandlerException if the configured handler rejects a statement
     * @throws RDFParseException   if a URI or literal cannot be created
     */
    protected void parseDCModule(Resource resource, DCModule dcModule) throws RDFHandlerException, RDFParseException {
        for (String contributor : dcModule.getContributors()) {
            createStringProperty(resource, NS_DC_TERMS + "contributor", contributor);
        }
        for (String coverage : dcModule.getCoverages()) {
            createStringProperty(resource, NS_DC_TERMS + "coverage", coverage);
        }
        for (String creator : dcModule.getCreators()) {
            createStringProperty(resource, NS_DC_TERMS + "creator", creator);
        }
        for (Date date : dcModule.getDates()) {
            createDateProperty(resource, NS_DC_TERMS + "date", date);
        }
        for (String description : dcModule.getDescriptions()) {
            createStringProperty(resource, NS_DC_TERMS + "description", description);
        }
        for (String format : dcModule.getFormats()) {
            createStringProperty(resource, NS_DC_TERMS + "format", format);
        }
        for (String identifier : dcModule.getIdentifiers()) {
            createStringProperty(resource, NS_DC_TERMS + "identifier", identifier);
        }
        for (String language : dcModule.getLanguages()) {
            createStringProperty(resource, NS_DC_TERMS + "language", language);
        }
        for (String publisher : dcModule.getPublishers()) {
            createStringProperty(resource, NS_DC_TERMS + "publisher", publisher);
        }
        for (String relation : dcModule.getRelations()) {
            createUrlProperty(resource, NS_DC_TERMS + "relation", relation);
        }
        for (String rights : dcModule.getRightsList()) {
            createStringProperty(resource, NS_DC_TERMS + "rights", rights);
        }
        for (String source : dcModule.getSources()) {
            createUrlProperty(resource, NS_DC_TERMS + "source", source);
        }
        for (DCSubject subject : dcModule.getSubjects()) {
            parseDCSubject(resource, subject);
        }
        for (String title : dcModule.getTitles()) {
            createStringProperty(resource, NS_DC_TERMS + "title", title);
        }
        for (String type : dcModule.getTypes()) {
            createStringProperty(resource, NS_DC_TERMS + "type", type);
        }
    }

    /**
     * Emits a {@code content:encoded} statement for every encoded content value of the module.
     * Content items are not translated; their presence is only reported with a warning.
     *
     * @param resource      the subject the content is attached to
     * @param contentModule the ROME content module to translate
     * @throws RDFHandlerException if the configured handler rejects a statement
     * @throws RDFParseException   if a URI or literal cannot be created
     */
    protected void parseContentModule(Resource resource, ContentModule contentModule) throws RDFHandlerException, RDFParseException {
        for (Object content : contentModule.getEncodeds()) {
            createStringProperty(resource, NS_RSS_CONTENT + "encoded", (String) content);
        }

        // TODO: more sophisticated forms are nowadays rarely used, we do not support them
        if (contentModule.getContentItems() != null && contentModule.getContentItems().size() > 0) {
            log.warn("content items are not supported yet");
        }
    }

    /**
     * Translates the position of a GeoRSS module into a blank node typed as {@code geo:Point} and links
     * it to the resource with {@code dcterms:spatial}. Modules without a position produce no statements.
     *
     * @param resource     the subject the location is attached to
     * @param geoRSSModule the ROME GeoRSS module to translate
     * @throws RDFParseException   if a URI, blank node or literal cannot be created
     * @throws RDFHandlerException if the configured handler rejects a statement
     */
    protected void parseGeoModule(Resource resource, GeoRSSModule geoRSSModule) throws RDFParseException, RDFHandlerException {
        if (geoRSSModule.getPosition() == null) {
            return;
        }

        Resource r_location = createBNode();
        Resource t_point    = createURI(NS_GEO + "Point");
        URI      p_type     = createURI(NS_RDF + "type");
        rdfHandler.handleStatement(createStatement(r_location, p_type, t_point));

        // NOTE(review): the W3C Basic Geo vocabulary names these properties "lat" and "long";
        // the local names below are kept because existing consumers may query for them - confirm.
        createDoubleProperty(r_location, NS_GEO + "latitude", geoRSSModule.getPosition().getLatitude());
        createDoubleProperty(r_location, NS_GEO + "longitude", geoRSSModule.getPosition().getLongitude());

        rdfHandler.handleStatement(createStatement(resource, createURI(NS_DC_TERMS + "spatial"), r_location));
    }

    /**
     * Translates every media content entry that is referenced by URL into a {@code ma:MediaResource}
     * and links it to the resource with {@code sioc:hasPart}. Entries whose reference is not a
     * {@link UrlReference} are ignored.
     *
     * @param resource         the subject the media resources are attached to
     * @param mediaEntryModule the ROME MediaRSS module to translate
     * @throws RDFParseException   if a URI or literal cannot be created
     * @throws RDFHandlerException if the configured handler rejects a statement
     */
    protected void parseMediaModule(Resource resource, MediaEntryModule mediaEntryModule) throws RDFParseException, RDFHandlerException {
        for (MediaContent content : mediaEntryModule.getMediaContents()) {
            // instanceof is false for null, so no separate null check is needed
            if (!(content.getReference() instanceof UrlReference)) {
                continue;
            }

            URI r_media = createURI(((UrlReference) content.getReference()).getUrl().toString());
            rdfHandler.handleStatement(createStatement(r_media, createURI(NS_RDF + "type"), createURI(NS_MA + "MediaResource")));
            // the media resource is identified by its own download location
            rdfHandler.handleStatement(createStatement(r_media, createURI(NS_MA + "locator"), r_media));

            if (content.getBitrate() != null) {
                createDoubleProperty(r_media, NS_MA + "averageBitRate", content.getBitrate());
            }
            if (content.getDuration() != null) {
                createLongProperty(r_media, NS_MA + "duration", content.getDuration());
            }

            createStringProperty(r_media, NS_MA + "hasFormat", content.getType());

            if (content.getFramerate() != null) {
                createDoubleProperty(r_media, NS_MA + "frameRate", content.getFramerate());
            }

            if (content.getHeight() != null) {
                createIntProperty(r_media, NS_MA + "frameHeight", content.getHeight());
            }
            if (content.getWidth() != null) {
                createIntProperty(r_media, NS_MA + "frameWidth", content.getWidth());
            }

            createStringProperty(r_media, NS_MA + "hasLanguage", content.getLanguage());

            Metadata metadata = content.getMetadata();
            if (metadata != null) {
                createStringProperty(r_media, NS_MA + "title", metadata.getTitle());
                createStringProperty(r_media, NS_MA + "copyright", metadata.getCopyright());
                createStringProperty(r_media, NS_MA + "description", metadata.getDescription());

                for (String keyword : metadata.getKeywords()) {
                    createStringProperty(r_media, NS_MA + "hasKeyword", keyword);
                }
            }

            rdfHandler.handleStatement(createStatement(resource, createURI(NS_SIOC + "hasPart"), r_media));
        }
    }


    /**
     * Translates a Dublin Core subject into a {@code skos:Concept} linked with {@code sioc:topic} and,
     * in addition, into a plain {@code dcterms:subject} string. Subjects without a value are ignored.
     * <p/>
     * The concept URI is built from the taxonomy URI (a "/" is inserted unless the taxonomy already
     * ends in "/" or "#") followed by the URL-encoded value; without a taxonomy URI the encoded value
     * is resolved against the base URI of the parser.
     *
     * @param resource the subject the category is attached to
     * @param category the Dublin Core subject to translate
     * @throws RDFHandlerException if the configured handler rejects a statement
     * @throws RDFParseException   if a URI or literal cannot be created, or UTF-8 is unavailable
     */
    protected void parseDCSubject(Resource resource, DCSubject category) throws RDFHandlerException, RDFParseException {
        final String value = category.getValue();
        if (value == null) {
            return;
        }

        try {
            final String localName = URLEncoder.encode(value, "UTF-8");
            final String namespace = category.getTaxonomyUri();

            final Resource skosConcept;
            if (namespace != null) {
                // create a skos:Concept with the taxonomy as namespace and a local name derived from the value
                final String separator = (namespace.endsWith("/") || namespace.endsWith("#")) ? "" : "/";
                skosConcept = createURI(namespace + separator + localName);
            } else {
                // create a skos:Concept with the baseUri as namespace and a local name derived from the value
                skosConcept = resolveURI(localName);
            }
            createUrlProperty(skosConcept, NS_RDF + "type", NS_SKOS + "Concept");
            createStringProperty(skosConcept, NS_SKOS + "prefLabel", value);
            rdfHandler.handleStatement(createStatement(resource, createURI(NS_SIOC + "topic"), skosConcept));
        } catch (UnsupportedEncodingException e) {
            throw new RDFParseException(e);
        }

        // add category value as dc:subject
        createStringProperty(resource, NS_DC_TERMS + "subject", value);
    }


    /**
     * Emits a plain (untyped, no language) literal statement. Nothing is emitted when the value is
     * null or consists of whitespace only.
     *
     * @param resource    subject of the statement
     * @param rdfProperty full URI of the predicate
     * @param value       lexical value of the literal, may be null
     */
    protected void createStringProperty(Resource resource, String rdfProperty, String value) throws RDFParseException, RDFHandlerException {
        if (value == null || "".equals(value.trim())) {
            return;
        }
        URI     predicate = createURI(rdfProperty);
        Literal object    = createLiteral(value, null, null);
        rdfHandler.handleStatement(createStatement(resource, predicate, object));
    }

    /**
     * Emits a literal statement typed as {@code xsd:int}.
     *
     * @param resource    subject of the statement
     * @param rdfProperty full URI of the predicate
     * @param value       the integer value
     */
    protected void createIntProperty(Resource resource, String rdfProperty, int value) throws RDFParseException, RDFHandlerException {
        createTypedProperty(resource, rdfProperty, Integer.toString(value), "int");
    }

    /**
     * Emits a literal statement typed as {@code xsd:long}.
     *
     * @param resource    subject of the statement
     * @param rdfProperty full URI of the predicate
     * @param value       the long value
     */
    protected void createLongProperty(Resource resource, String rdfProperty, long value) throws RDFParseException, RDFHandlerException {
        createTypedProperty(resource, rdfProperty, Long.toString(value), "long");
    }

    /**
     * Emits a literal statement typed as {@code xsd:double}.
     *
     * @param resource    subject of the statement
     * @param rdfProperty full URI of the predicate
     * @param value       the double value
     */
    protected void createDoubleProperty(Resource resource, String rdfProperty, double value) throws RDFParseException, RDFHandlerException {
        createTypedProperty(resource, rdfProperty, Double.toString(value), "double");
    }

    /** Shared implementation of the numeric helpers: emits a literal with the given XML Schema datatype. */
    private void createTypedProperty(Resource resource, String rdfProperty, String lexicalValue, String xsdLocalName) throws RDFParseException, RDFHandlerException {
        URI     predicate = createURI(rdfProperty);
        Literal object    = createLiteral(lexicalValue, null, createURI(NS_XSD + xsdLocalName));
        rdfHandler.handleStatement(createStatement(resource, predicate, object));
    }


    /**
     * Emits a calendar-typed literal statement for the given date. Nothing is emitted when the date is
     * null or when it cannot be converted (see {@link #getXMLCalendar(Date, TimeZone)}); the latter
     * case used to hand a null calendar to the value factory.
     *
     * @param resource    subject of the statement
     * @param rdfProperty full URI of the predicate
     * @param value       the date, may be null
     */
    protected void createDateProperty(Resource resource, String rdfProperty, Date value) throws RDFParseException, RDFHandlerException {
        if (value == null) {
            return;
        }

        XMLGregorianCalendar calendar = getXMLCalendar(value, null);
        if (calendar == null) {
            log.warn("skipping property {}: date {} could not be converted to an XML calendar", rdfProperty, value);
            return;
        }

        URI     p_dateprop = createURI(rdfProperty);
        Literal v_dateprop = valueFactory.createLiteral(calendar);
        rdfHandler.handleStatement(createStatement(resource, p_dateprop, v_dateprop));
    }


    /**
     * Emits a statement whose object is the URI built from the given string. Nothing is emitted when
     * the value is null.
     *
     * @param resource    subject of the statement
     * @param rdfProperty full URI of the predicate
     * @param value       string form of the object URI, may be null
     */
    protected void createUrlProperty(Resource resource, String rdfProperty, String value) throws RDFParseException, RDFHandlerException {
        if (value == null) {
            return;
        }
        URI predicate = createURI(rdfProperty);
        URI object    = createURI(value);
        rdfHandler.handleStatement(createStatement(resource, predicate, object));
    }

    /**
     * Emits a statement whose object is the given resource. Nothing is emitted when the value is null.
     *
     * @param resource    subject of the statement
     * @param rdfProperty full URI of the predicate
     * @param value       the object resource, may be null
     */
    protected void createUrlProperty(Resource resource, String rdfProperty, Resource value) throws RDFParseException, RDFHandlerException {
        if (value == null) {
            return;
        }
        URI predicate = createURI(rdfProperty);
        rdfHandler.handleStatement(createStatement(resource, predicate, value));
    }


    /**
     * Converts a date into an XML calendar.
     *
     * @param date     the instant to convert; null yields null
     * @param timezone the time zone of the resulting calendar, or null to keep the JVM default zone
     * @return the calendar, or null when the date is null or no {@link DatatypeFactory} implementation
     *         is available (the failure is logged)
     */
    protected static XMLGregorianCalendar getXMLCalendar(Date date, TimeZone timezone) {
        if (date == null) {
            return null;
        }

        GregorianCalendar calendar = new GregorianCalendar();
        calendar.setTime(date);
        if (timezone != null) {
            calendar.setTimeZone(timezone);
        }
        try {
            return DatatypeFactory.newInstance().newXMLGregorianCalendar(calendar);
        } catch (DatatypeConfigurationException e) {
            // keep the historic "null on failure" contract, but do not hide the cause any more
            log.error("could not create an XML calendar because no DatatypeFactory is available", e);
            return null;
        }
    }

}
http://git-wip-us.apache.org/repos/asf/marmotta/blob/00c22e7c/commons/marmotta-sesame-tools/marmotta-rio-rss/src/main/java/org/apache/marmotta/commons/sesame/rio/rss/RSSParser.java ---------------------------------------------------------------------- diff --git a/commons/marmotta-sesame-tools/marmotta-rio-rss/src/main/java/org/apache/marmotta/commons/sesame/rio/rss/RSSParser.java b/commons/marmotta-sesame-tools/marmotta-rio-rss/src/main/java/org/apache/marmotta/commons/sesame/rio/rss/RSSParser.java new file mode 100644 index 0000000..34a98ce --- /dev/null +++ b/commons/marmotta-sesame-tools/marmotta-rio-rss/src/main/java/org/apache/marmotta/commons/sesame/rio/rss/RSSParser.java @@ -0,0 +1,387 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/
package org.apache.marmotta.commons.sesame.rio.rss;

import com.google.common.base.Preconditions;
import com.sun.syndication.feed.WireFeed;
import com.sun.syndication.feed.module.DCModule;
import com.sun.syndication.feed.module.Module;
import com.sun.syndication.feed.module.SyModule;
import com.sun.syndication.feed.rss.Category;
import com.sun.syndication.feed.rss.Channel;
import com.sun.syndication.feed.rss.Enclosure;
import com.sun.syndication.feed.rss.Item;
import com.sun.syndication.io.FeedException;
import com.sun.syndication.io.WireFeedInput;

import org.apache.marmotta.commons.sesame.rio.rss.RSSFormat;
import org.openrdf.model.Resource;
import org.openrdf.model.URI;
import org.openrdf.model.ValueFactory;
import org.openrdf.model.impl.ValueFactoryImpl;
import org.openrdf.rio.RDFFormat;
import org.openrdf.rio.RDFHandlerException;
import org.openrdf.rio.RDFParseException;
import org.rometools.feed.module.content.ContentModule;
import org.rometools.feed.module.georss.GeoRSSModule;
import org.rometools.feed.module.mediarss.MediaEntryModule;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.InputSource;

import java.io.*;
import java.net.URLEncoder;

/**
 * Parse RSS feed into RDF. Uses the following vocabularies:
 * <ul>
 *     <li>dcterms for representing most metadata about feeds and entries</li>
 *     <li>sioc for type information and relation between concepts</li>
 *     <li>skos for representing categories associated with items or channels</li>
 *     <li>media ontology for representing information from the mediarss extension</li>
 *     <li>wgs84 geo ontology for representing information from the georss extension</li>
 * </ul>
 * RSS properties without a good corresponding vocabulary are copied 1:1 using the rss namespace itself.
 * <p/>
 * Author: Sebastian Schaffert
 */
public final class RSSParser extends FeedParserBase {

    private static final Logger log = LoggerFactory.getLogger(RSSParser.class);


    /**
     * Creates a new RSSParser that will use a {@link org.openrdf.model.impl.ValueFactoryImpl} to
     * create RDF model objects.
     */
    public RSSParser() {
        this(new ValueFactoryImpl());
    }

    /**
     * Creates a new RSSParser that will use the supplied ValueFactory to
     * create RDF model objects.
     *
     * @param valueFactory A ValueFactory.
     */
    public RSSParser(ValueFactory valueFactory) {
        // the super constructor already stores the factory in the inherited field
        super(valueFactory);
    }


    /**
     * Gets the RDF format that this parser can parse.
     */
    @Override
    public RDFFormat getRDFFormat() {
        return RSSFormat.FORMAT;
    }


    /**
     * Parses the data from the supplied InputStream, using the supplied baseURI
     * to resolve any relative URI references.
     *
     * @param in      The InputStream from which to read the data.
     * @param baseURI The URI associated with the data in the InputStream; must not be null.
     * @throws java.io.IOException If an I/O error occurred while data was read from the InputStream.
     * @throws org.openrdf.rio.RDFParseException
     *                             If the data is not a well-formed feed or not an RSS feed.
     * @throws org.openrdf.rio.RDFHandlerException
     *                             If the configured statement handler has encountered an
     *                             unrecoverable error.
     */
    @Override
    public void parse(InputStream in, String baseURI) throws IOException, RDFParseException, RDFHandlerException {
        Preconditions.checkNotNull(baseURI);

        setBaseURI(baseURI);

        try {
            parseWireFeed(new WireFeedInput().build(new InputSource(in)));
        } catch (FeedException e) {
            throw new RDFParseException(e);
        }
    }

    /**
     * Parses the data from the supplied Reader, using the supplied baseURI to
     * resolve any relative URI references.
     *
     * @param reader  The Reader from which to read the data.
     * @param baseURI The URI associated with the data in the Reader; must not be null.
     * @throws java.io.IOException If an I/O error occurred while data was read from the Reader.
     * @throws org.openrdf.rio.RDFParseException
     *                             If the data is not a well-formed feed or not an RSS feed.
     * @throws org.openrdf.rio.RDFHandlerException
     *                             If the configured statement handler has encountered an
     *                             unrecoverable error.
     */
    @Override
    public void parse(Reader reader, String baseURI) throws IOException, RDFParseException, RDFHandlerException {
        Preconditions.checkNotNull(baseURI);

        setBaseURI(baseURI);

        try {
            parseWireFeed(new WireFeedInput().build(reader));
        } catch (FeedException e) {
            throw new RDFParseException(e);
        }
    }

    /** Shared tail of both parse methods: accepts RSS channels only and rejects every other feed type. */
    private void parseWireFeed(final WireFeed feed) throws RDFParseException, RDFHandlerException {
        if (!(feed instanceof Channel)) {
            throw new RDFParseException("data stream is not an RSS feed");
        }
        parseFeed((Channel) feed);
    }


    /**
     * Translates a single RSS item into a {@code sioc:Post} contained in the given feed.
     * <p/>
     * The post is identified by the item uri or, failing that, its link. An item that has neither
     * is represented by a blank node instead of aborting the whole parse, and a missing guid is
     * simply skipped: both elements are optional in RSS.
     *
     * @param entry  the ROME item to translate
     * @param r_feed the resource of the feed the item belongs to
     * @throws RDFParseException   if a URI or literal cannot be created
     * @throws RDFHandlerException if the configured handler rejects a statement
     */
    private void parseFeedEntry(final Item entry, final Resource r_feed) throws RDFParseException, RDFHandlerException {

        final String entryURI = entry.getUri() != null ? entry.getUri() : entry.getLink();

        final Resource r_entry;
        if (entryURI != null) {
            r_entry = createURI(entryURI);
        } else {
            log.warn("RSS item '{}' has neither uri nor link, using a blank node", entry.getTitle());
            r_entry = createBNode();
        }
        URI rdf_type = createURI(NS_RDF + "type");


        // add type sioc:Post
        rdfHandler.handleStatement(createStatement(r_entry, rdf_type, createURI(NS_SIOC + "Post")));

        // add as sioc:container_of from parent feed
        rdfHandler.handleStatement(createStatement(r_feed, createURI(NS_SIOC + "container_of"), r_entry));
        rdfHandler.handleStatement(createStatement(r_entry, createURI(NS_SIOC + "has_container"), r_feed));

        createStringProperty(r_entry, NS_DC_TERMS + "creator", entry.getAuthor());

        for (Object category : entry.getCategories()) {
            parseCategory(r_entry, (Category) category);
        }

        createUrlProperty(r_entry, NS_SIOC + "has_discussion", entry.getComments());

        if (entry.getContent() != null) {
            createStringProperty(r_entry, NS_RSS_CONTENT + "encoded", entry.getContent().getValue());
            createStringProperty(r_entry, NS_RSS_CONTENT + "format", entry.getContent().getType());
        }

        if (entry.getDescription() != null) {
            createStringProperty(r_entry, NS_DC_TERMS + "description", entry.getDescription().getValue());
        }

        // enclosures relate items to media resources used; we use dcterms:hasPart to link to them
        for (Enclosure enclosure : entry.getEnclosures()) {
            createUrlProperty(r_entry, NS_DC_TERMS + "hasPart", enclosure.getUrl());
        }

        // for the expiration date we use dc:valid; it is a bit underspecified :-(
        createDateProperty(r_entry, NS_DC_TERMS + "valid", entry.getExpirationDate());

        // GUID is sometimes a URL but the documentation says this cannot be guaranteed, so we use dc:identifier;
        // the element is optional, so guard against items that do not carry one
        if (entry.getGuid() != null) {
            createStringProperty(r_entry, NS_DC_TERMS + "identifier", entry.getGuid().getValue());
        }

        // for the link we use sioc:link
        createUrlProperty(r_entry, NS_SIOC + "link", entry.getLink());

        for (Module module : entry.getModules()) {
            if (module instanceof DCModule) {
                parseDCModule(r_entry, (DCModule) module);
            } else if (module instanceof GeoRSSModule) {
                parseGeoModule(r_entry, (GeoRSSModule) module);
            } else if (module instanceof MediaEntryModule) {
                parseMediaModule(r_entry, (MediaEntryModule) module);
            } else if (module instanceof ContentModule) {
                parseContentModule(r_entry, (ContentModule) module);
            } else {
                log.warn("module {} not supported yet", module.getUri());
            }

            // TODO: add support for more modules!
        }

        // publication date is dc:issued
        createDateProperty(r_entry, NS_DC_TERMS + "issued", entry.getPubDate());

        // if the source is present, we link just to the URL using dc:source and ignore the text
        if (entry.getSource() != null) {
            createUrlProperty(r_entry, NS_DC_TERMS + "source", entry.getSource().getUrl());
        }

        // title is dc:title
        createStringProperty(r_entry, NS_DC_TERMS + "title", entry.getTitle());

        log.debug("parsed RSS item {}", r_entry.stringValue());
    }

    /**
     * Translates an RSS channel and all of its items into RDF statements that are passed to the
     * configured handler. The channel becomes a {@code sioc:Forum}; a channel that has neither a uri
     * nor a link is logged as an error and produces no statements.
     *
     * @param feed the ROME RSS channel representation
     * @throws RDFParseException   if a URI or literal cannot be created
     * @throws RDFHandlerException if the configured handler rejects a statement
     */
    private void parseFeed(final Channel feed) throws RDFParseException, RDFHandlerException {
        if (log.isInfoEnabled()) {
            log.info("importing entries from {} feed '{}' found at '{}'",new Object[] {feed.getFeedType(),feed.getTitle(),feed.getUri()});
        }

        final String feedUri = feed.getUri() != null ? feed.getUri() : feed.getLink();
        if (feedUri == null) {
            log.error("feed '{}' has neither uri nor link to reference", feed.getTitle());
            return;
        }

        // we set some namespaces first
        setNamespace(NS_DC_TERMS,"dcterms");
        setNamespace(NS_RSS_SY,"sy");
        setNamespace(NS_RSS_CONTENT,"content");
        setNamespace(NS_SIOC,"sioc");

        URI r_feed = createURI(feedUri);
        URI rdf_type = createURI(NS_RDF + "type");

        // add type sioc:Forum
        rdfHandler.handleStatement(createStatement(r_feed, rdf_type, createURI(NS_SIOC + "Forum")));
        createUrlProperty(r_feed, NS_SIOC + "feed", feedUri);

        // add all categories that are present
        for (Category category : feed.getCategories()) {
            parseCategory(r_feed, category);
        }

        // if feed.getCloud() present, we add its specifications using the RSS namespace
        if (feed.getCloud() != null) {
            createStringProperty(r_feed, NS_RSS + "cloudUpdateProtocol", feed.getCloud().getProtocol());
            createStringProperty(r_feed, NS_RSS + "cloudUpdateDomain", feed.getCloud().getDomain());
            createStringProperty(r_feed, NS_RSS + "cloudUpdatePath", feed.getCloud().getPath());
            createStringProperty(r_feed, NS_RSS + "cloudUpdateProcedure", feed.getCloud().getRegisterProcedure());
            createIntProperty(r_feed, NS_RSS + "cloudUpdatePort", feed.getCloud().getPort());
        }

        // add dc:rights for feed.getCopyright()
        createStringProperty(r_feed, NS_DC_TERMS + "rights", feed.getCopyright());

        // add dc:description for feed.getDescription()
        createStringProperty(r_feed, NS_DC_TERMS + "description", feed.getDescription());

        // ignore feed.getDocs()

        // add dc:provenance to point to the software used for generating feed
        createStringProperty(r_feed, NS_DC_TERMS + "provenance", feed.getGenerator());

        // add foaf:depiction in case there is an image
        if (feed.getImage() != null) {
            createUrlProperty(r_feed, NS_FOAF + "depiction", feed.getImage().getUrl());
        }

        // add all feed items
        for (Item item : feed.getItems()) {
            parseFeedEntry(item, r_feed);
        }

        // add dc:language for feed.getLanguage()
        createStringProperty(r_feed, NS_DC_TERMS + "language", feed.getLanguage());

        // add dc:created for getLastBuildDate()
        createDateProperty(r_feed, NS_DC_TERMS + "created", feed.getLastBuildDate());

        // add sioc:link for getLink()
        createUrlProperty(r_feed, NS_SIOC + "link", feed.getLink());

        // add dc:creator for managing editor
        createStringProperty(r_feed, NS_DC_TERMS + "creator", feed.getManagingEditor());

        for (Module module : feed.getModules()) {
            if (module instanceof SyModule) {
                SyModule syModule = (SyModule) module;
                createStringProperty(r_feed, NS_RSS_SY + "updatePeriod", syModule.getUpdatePeriod());
                createIntProperty(r_feed, NS_RSS_SY + "updateFrequency", syModule.getUpdateFrequency());
                createDateProperty(r_feed, NS_RSS_SY + "updateBase", syModule.getUpdateBase());
            } else if (module instanceof DCModule) {
                parseDCModule(r_feed, (DCModule) module);
            }
        }

        // create publication date as dc:issued
        createDateProperty(r_feed, NS_DC_TERMS + "issued", feed.getPubDate());

        // PICS is superseded and there is no proper RDF way to do it, so we use an RSS property
        createStringProperty(r_feed, NS_RSS + "rating", feed.getRating());

        // skip days are also added using RSS vocabulary, they are actually syndication info
        for (String day : feed.getSkipDays()) {
            createStringProperty(r_feed, NS_RSS + "skipDay", day);
        }
        for (Integer hour : feed.getSkipHours()) {
            createIntProperty(r_feed, NS_RSS + "skipHour", hour);
        }

        // textinput: we skip it, the documentation says:
        // "The purpose of the <textInput> element is something of a mystery. You can use it to specify a
        // search engine box. Or to allow a reader to provide feedback. Most aggregators ignore it."

        createStringProperty(r_feed, NS_DC_TERMS + "title", feed.getTitle());

        // ttl is again meta information about the syndication, we use the RSS namespace
        if (feed.getTtl() > 0) {
            createIntProperty(r_feed, NS_RSS + "ttl", feed.getTtl());
        }

        // add dc:publisher for webmaster
        createStringProperty(r_feed, NS_DC_TERMS + "publisher", feed.getWebMaster());

        log.info("importing RSS feed finished successfully.");
    }

    /**
     * Translates an RSS category into a {@code skos:Concept} linked with {@code sioc:topic} and,
     * in addition, into a plain {@code dcterms:subject} string. Categories without a value are ignored.
     * <p/>
     * The concept URI is built from the category domain (a "/" is inserted unless the domain already
     * ends in "/" or "#") followed by the URL-encoded value; without a domain the encoded value is
     * resolved against the base URI of the parser.
     *
     * @param resource the subject the category is attached to
     * @param category the RSS category to translate
     * @throws RDFHandlerException if the configured handler rejects a statement
     * @throws RDFParseException   if a URI or literal cannot be created, or UTF-8 is unavailable
     */
    protected void parseCategory(Resource resource, Category category) throws RDFHandlerException, RDFParseException {
        final String value = category.getValue();
        if (value == null) {
            return;
        }

        try {
            final String localName = URLEncoder.encode(value, "UTF-8");
            final String namespace = category.getDomain();

            final Resource skosConcept;
            if (namespace != null) {
                // create a skos:Concept with the domain as namespace and a local name derived from the value
                final String separator = (namespace.endsWith("/") || namespace.endsWith("#")) ? "" : "/";
                skosConcept = createURI(namespace + separator + localName);
            } else {
                // create a skos:Concept with the baseUri as namespace and a local name derived from the value
                skosConcept = resolveURI(localName);
            }
            createUrlProperty(skosConcept, NS_RDF + "type", NS_SKOS + "Concept");
            createStringProperty(skosConcept, NS_SKOS + "prefLabel", value);
            rdfHandler.handleStatement(createStatement(resource, createURI(NS_SIOC + "topic"), skosConcept));
        } catch (UnsupportedEncodingException e) {
            throw new RDFParseException(e);
        }

        // add category value as dc:subject
        createStringProperty(resource, NS_DC_TERMS + "subject", value);
    }

}
// http://git-wip-us.apache.org/repos/asf/marmotta/blob/00c22e7c/commons/marmotta-sesame-tools/marmotta-rio-rss/src/main/java/org/apache/marmotta/commons/sesame/rio/rss/RSSParserFactory.java
---------------------------------------------------------------------- diff --git a/commons/marmotta-sesame-tools/marmotta-rio-rss/src/main/java/org/apache/marmotta/commons/sesame/rio/rss/RSSParserFactory.java b/commons/marmotta-sesame-tools/marmotta-rio-rss/src/main/java/org/apache/marmotta/commons/sesame/rio/rss/RSSParserFactory.java new file mode 100644 index 0000000..d0226f1 --- /dev/null +++ b/commons/marmotta-sesame-tools/marmotta-rio-rss/src/main/java/org/apache/marmotta/commons/sesame/rio/rss/RSSParserFactory.java @@ -0,0 +1,47 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.marmotta.commons.sesame.rio.rss; + +import org.apache.marmotta.commons.sesame.rio.rss.RSSFormat; +import org.openrdf.rio.RDFFormat; +import org.openrdf.rio.RDFParser; +import org.openrdf.rio.RDFParserFactory;import java.lang.Override; + +/** + * Create parsers that can transform RSS into RDF using FOAF and SIOC vocabularies. + * Every call to {@link #getParser()} hands out a newly constructed {@link RSSParser}; + * the factory itself declares no fields. + * <p/> + * Author: Sebastian Schaffert + */ +public class RSSParserFactory implements RDFParserFactory { + + /** + * Returns the RDF format for this factory. + * + * @return the constant {@code RSSFormat.FORMAT} + */ + @Override + public RDFFormat getRDFFormat() { + return RSSFormat.FORMAT; + } + + /** + * Returns a RDFParser instance. + * + * @return a new {@link RSSParser} created with its no-argument constructor 
+ */ + @Override + public RDFParser getParser() { + return new RSSParser(); + } +} http://git-wip-us.apache.org/repos/asf/marmotta/blob/00c22e7c/commons/marmotta-sesame-tools/marmotta-rio-rss/src/main/resources/META-INF/NOTICE ---------------------------------------------------------------------- diff --git a/commons/marmotta-sesame-tools/marmotta-rio-rss/src/main/resources/META-INF/NOTICE b/commons/marmotta-sesame-tools/marmotta-rio-rss/src/main/resources/META-INF/NOTICE new file mode 100644 index 0000000..9893868 --- /dev/null +++ b/commons/marmotta-sesame-tools/marmotta-rio-rss/src/main/resources/META-INF/NOTICE @@ -0,0 +1,11 @@ +Apache Marmotta RSS +Copyright 2012-2013 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +Portions of this software were originally based on the following: + + Copyright 2008-2012 Salzburg Research Forschungsgesellschaft mbH + +These have been licensed to the Apache Software Foundation under a software grant. 
http://git-wip-us.apache.org/repos/asf/marmotta/blob/00c22e7c/commons/marmotta-sesame-tools/marmotta-rio-rss/src/main/resources/META-INF/services/org.openrdf.rio.RDFParserFactory ---------------------------------------------------------------------- diff --git a/commons/marmotta-sesame-tools/marmotta-rio-rss/src/main/resources/META-INF/services/org.openrdf.rio.RDFParserFactory b/commons/marmotta-sesame-tools/marmotta-rio-rss/src/main/resources/META-INF/services/org.openrdf.rio.RDFParserFactory new file mode 100644 index 0000000..79efc90 --- /dev/null +++ b/commons/marmotta-sesame-tools/marmotta-rio-rss/src/main/resources/META-INF/services/org.openrdf.rio.RDFParserFactory @@ -0,0 +1,2 @@ +org.apache.marmotta.commons.sesame.rio.rss.RSSParserFactory +org.apache.marmotta.commons.sesame.rio.rss.AtomParserFactory \ No newline at end of file http://git-wip-us.apache.org/repos/asf/marmotta/blob/00c22e7c/commons/marmotta-sesame-tools/marmotta-rio-rss/src/test/java/org/apache/marmotta/commons/sesame/rio/rss/TestAtomParser.java ---------------------------------------------------------------------- diff --git a/commons/marmotta-sesame-tools/marmotta-rio-rss/src/test/java/org/apache/marmotta/commons/sesame/rio/rss/TestAtomParser.java b/commons/marmotta-sesame-tools/marmotta-rio-rss/src/test/java/org/apache/marmotta/commons/sesame/rio/rss/TestAtomParser.java new file mode 100644 index 0000000..a643277 --- /dev/null +++ b/commons/marmotta-sesame-tools/marmotta-rio-rss/src/test/java/org/apache/marmotta/commons/sesame/rio/rss/TestAtomParser.java @@ -0,0 +1,99 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.marmotta.commons.sesame.rio.rss; + +import static java.util.Arrays.asList; +import static org.hamcrest.CoreMatchers.everyItem; +import static org.hamcrest.CoreMatchers.notNullValue; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; +import static org.junit.Assume.assumeThat; + +import org.apache.commons.io.IOUtils; +import org.apache.marmotta.commons.sesame.rio.rss.AtomFormat; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.openrdf.query.BooleanQuery; +import org.openrdf.query.QueryLanguage; +import org.openrdf.repository.Repository; +import org.openrdf.repository.RepositoryConnection; +import org.openrdf.repository.sail.SailRepository; +import org.openrdf.sail.memory.MemoryStore; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import info.aduna.iteration.Iterations; + +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Collection; + +/** + * Parameterized test that loads the {@code fileName + ".atom"} resource into an in-memory repository using {@link AtomFormat#FORMAT}, asserts that statements were added, and evaluates the boolean SPARQL query from {@code fileName + ".sparql"} against the result; the test is skipped (assumption) when either resource is missing. NOTE(review): the rdfa.digitalbazaar.com prefix rewrite applied to the query text looks like a leftover from the RDFa parser tests - confirm. 
+ * <p/> + * Author: Sebastian Schaffert + */ +@RunWith(Parameterized.class) +public class TestAtomParser { + + private static Logger log = LoggerFactory.getLogger(TestAtomParser.class); + + private String fileName; + + public TestAtomParser(String fileName) { + this.fileName = fileName; + } + + @Parameterized.Parameters(name = "{0}") + public static Collection<Object[]> data() { + ArrayList<Object[]> list = new ArrayList<Object[]>(); + list.add(new Object[] { "iks-blog" }); + return list; + } + + + + @Test + public void runTest() throws Exception { + log.info("running test {} ...", fileName); + + InputStream atom = this.getClass().getResourceAsStream(fileName + ".atom"); + InputStream sparql = this.getClass().getResourceAsStream(fileName+".sparql"); + assumeThat("Could not load testfiles", asList(atom, sparql), everyItem(notNullValue(InputStream.class))); + + Repository repository = new SailRepository(new MemoryStore()); + repository.initialize(); + + RepositoryConnection connection = repository.getConnection(); + try { + connection.add(atom, "http://localhost/atom/", AtomFormat.FORMAT); + connection.commit(); + } catch(Exception ex) { + ex.printStackTrace(); + fail("parsing "+fileName+" failed!"); + } + assertTrue(connection.size() > 0); + + int count = Iterations.asList(connection.getStatements(null, null, null, false)).size(); + assertTrue(count > 0); + + BooleanQuery sparqlQuery = (BooleanQuery)connection.prepareQuery(QueryLanguage.SPARQL, IOUtils.toString(sparql).replaceAll("http://rdfa.digitalbazaar.com/test-suite/test-cases/xhtml1/rdfa1.1/","http://localhost/rdfa/")); + assertTrue("SPARQL query evaluation for "+fileName+" failed",sparqlQuery.evaluate()); + + connection.close(); + repository.shutDown(); + } + +} http://git-wip-us.apache.org/repos/asf/marmotta/blob/00c22e7c/commons/marmotta-sesame-tools/marmotta-rio-rss/src/test/java/org/apache/marmotta/commons/sesame/rio/rss/TestRSSParser.java 
---------------------------------------------------------------------- diff --git a/commons/marmotta-sesame-tools/marmotta-rio-rss/src/test/java/org/apache/marmotta/commons/sesame/rio/rss/TestRSSParser.java b/commons/marmotta-sesame-tools/marmotta-rio-rss/src/test/java/org/apache/marmotta/commons/sesame/rio/rss/TestRSSParser.java new file mode 100644 index 0000000..7454c50 --- /dev/null +++ b/commons/marmotta-sesame-tools/marmotta-rio-rss/src/test/java/org/apache/marmotta/commons/sesame/rio/rss/TestRSSParser.java @@ -0,0 +1,99 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.marmotta.commons.sesame.rio.rss; + +import static java.util.Arrays.asList; +import static org.hamcrest.CoreMatchers.everyItem; +import static org.hamcrest.CoreMatchers.notNullValue; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; +import static org.junit.Assume.assumeThat; + +import org.apache.commons.io.IOUtils; +import org.apache.marmotta.commons.sesame.rio.rss.RSSFormat; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.openrdf.query.BooleanQuery; +import org.openrdf.query.QueryLanguage; +import org.openrdf.repository.Repository; +import org.openrdf.repository.RepositoryConnection; +import org.openrdf.repository.sail.SailRepository; +import org.openrdf.sail.memory.MemoryStore; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import info.aduna.iteration.Iterations; + +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Collection; + +/** + * Parameterized test that loads the {@code fileName + ".rss"} resource into an in-memory repository using {@link RSSFormat#FORMAT}, asserts that statements were added, and evaluates the boolean SPARQL query from {@code fileName + ".sparql"} against the result; the test is skipped (assumption) when either resource is missing. NOTE(review): the rdfa.digitalbazaar.com prefix rewrite applied to the query text looks like a leftover from the RDFa parser tests - confirm. 
+ * <p/> + * Author: Sebastian Schaffert + */ +@RunWith(Parameterized.class) +public class TestRSSParser { + + private static Logger log = LoggerFactory.getLogger(TestRSSParser.class); + + private String fileName; + + public TestRSSParser(String fileName) { + this.fileName = fileName; + } + + @Parameterized.Parameters(name = "{0}") + public static Collection<Object[]> data() { + ArrayList<Object[]> list = new ArrayList<Object[]>(); + list.add(new Object[] { "iks-blog" }); + return list; + } + + + + @Test + public void runTest() throws Exception { + log.info("running test {} ...", fileName); + + InputStream rss = this.getClass().getResourceAsStream(fileName + ".rss"); + InputStream sparql = this.getClass().getResourceAsStream(fileName+".sparql"); + assumeThat("Could not load testfiles", asList(rss, sparql), everyItem(notNullValue(InputStream.class))); + + Repository repository = new SailRepository(new MemoryStore()); + repository.initialize(); + + RepositoryConnection connection = repository.getConnection(); + try { + connection.add(rss, "http://localhost/rss/", RSSFormat.FORMAT); + connection.commit(); + } catch(Exception ex) { + ex.printStackTrace(); + fail("parsing "+fileName+" failed!"); + } + assertTrue(connection.size() > 0); + + int count = Iterations.asList(connection.getStatements(null, null, null, false)).size(); + assertTrue(count > 0); + + BooleanQuery sparqlQuery = (BooleanQuery)connection.prepareQuery(QueryLanguage.SPARQL, IOUtils.toString(sparql).replaceAll("http://rdfa.digitalbazaar.com/test-suite/test-cases/xhtml1/rdfa1.1/","http://localhost/rdfa/")); + assertTrue("SPARQL query evaluation for "+fileName+" failed",sparqlQuery.evaluate()); + + connection.close(); + repository.shutDown(); + } + +} http://git-wip-us.apache.org/repos/asf/marmotta/blob/00c22e7c/commons/marmotta-sesame-tools/marmotta-rio-rss/src/test/resources/logback.xml ---------------------------------------------------------------------- diff --git 
a/commons/marmotta-sesame-tools/marmotta-rio-rss/src/test/resources/logback.xml b/commons/marmotta-sesame-tools/marmotta-rio-rss/src/test/resources/logback.xml new file mode 100644 index 0000000..1bfecff --- /dev/null +++ b/commons/marmotta-sesame-tools/marmotta-rio-rss/src/test/resources/logback.xml @@ -0,0 +1,27 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<configuration> + <appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender"> + <encoder> + <pattern>%d{HH:mm:ss.SSS} %highlight(%level) %cyan(%logger{15}) - %m%n</pattern> + </encoder> + </appender> + <root level="${root-level:-INFO}"> + <appender-ref ref="CONSOLE"/> + </root> +</configuration> \ No newline at end of file
