add CSV2RDF tool git-svn-id: http://svn.apache.org/repos/asf/jena/Experimental/jena-csv@1613797 13f79535-47bb-0310-9956-ffa450edef68
Project: http://git-wip-us.apache.org/repos/asf/jena/repo Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/5b0eaa4a Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/5b0eaa4a Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/5b0eaa4a Branch: refs/heads/master Commit: 5b0eaa4a5edfc50a225d5500f508a390d32b3dcb Parents: 534d0cf Author: Ying Jiang <[email protected]> Authored: Sun Jul 27 14:24:26 2014 +0000 Committer: Ying Jiang <[email protected]> Committed: Sun Jul 27 14:24:26 2014 +0000 ---------------------------------------------------------------------- src/main/java/riotcmd/LocatorOupputFile.java | 148 ++++++++++++++++ src/main/java/riotcmd/ModDest.java | 51 ++++++ src/main/java/riotcmd/csv2rdf.java | 205 ++++++++++++++++++++++ 3 files changed, 404 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/jena/blob/5b0eaa4a/src/main/java/riotcmd/LocatorOupputFile.java ---------------------------------------------------------------------- diff --git a/src/main/java/riotcmd/LocatorOupputFile.java b/src/main/java/riotcmd/LocatorOupputFile.java new file mode 100644 index 0000000..3d9cc52 --- /dev/null +++ b/src/main/java/riotcmd/LocatorOupputFile.java @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package riotcmd; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.security.AccessControlException; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.hp.hpl.jena.util.FileManager; +import com.hp.hpl.jena.util.FileUtils; +import com.hp.hpl.jena.util.LocatorFile; + +public class LocatorOupputFile { + static Logger log = LoggerFactory.getLogger(LocatorOupputFile.class) ; + private String thisDir = null ; + private String thisDirLogStr = "" ; + + public LocatorOupputFile(String dir) + { + if ( dir != null ) + { + if ( dir.endsWith("/") || dir.endsWith(java.io.File.separator) ) + dir = dir.substring(0,dir.length()-1) ; + thisDirLogStr = " ["+dir+"]" ; + } + thisDir = dir ; + } + + LocatorOupputFile() + { + this(null) ; + } + + @Override + public boolean equals( Object other ) + { + return + other instanceof LocatorFile + && equals( thisDir, ((LocatorOupputFile) other).thisDir ); + } + + private boolean equals( String a, String b ) + { + return a == null ? b == null : a.equals( b ); + } + + @Override + public int hashCode() + { + if ( thisDir == null ) + return 157 ; + return thisDir.hashCode(); + } + + private File toFile(String filenameOrURI) + { + String fn = FileUtils.toFilename(filenameOrURI) ; + if ( fn == null ) + return null ; + + if ( thisDir != null && ! fn.startsWith("/") && ! fn.startsWith(FileManager.filePathSeparator) ) + fn = thisDir+java.io.File.separator+fn ; + + return new File(fn) ; + } + + + public boolean exists(String filenameOrURI) + { + File f = toFile(filenameOrURI) ; + + if ( f == null ) + return false ; + + return f.exists() ; + } + + + public OutputStream open(String filenameOrURI) + { + // Worry about %20. + // toFile calls FileUtils.toFilename(filenameOrURI) ; + File f = toFile(filenameOrURI) ; + + try { + if ( f == null ) + { + if ( log.isTraceEnabled()) + log.trace("Not found: "+filenameOrURI+thisDirLogStr) ; + return null ; + } + } catch (AccessControlException e) { + log.warn("Security problem testing for file", e); + return null; + } + + try { + OutputStream out = new FileOutputStream(f) ; + + if ( log.isTraceEnabled() ) + log.trace("Found: "+filenameOrURI+thisDirLogStr) ; + + + // Create base -- Java 1.4-isms + //base = f.toURI().toURL().toExternalForm() ; + //base = base.replaceFirst("^file:/([^/])", "file:///$1") ; + return out ; + } catch (IOException ioEx) + { + // Includes FileNotFoundException + // We already tested whether the file exists or not. + // log.warn("File unreadable (but exists): "+f.getPath()+" Exception: "+ioEx.getMessage()) ; + return null ; + } + } + + public String getDir() { return thisDir ; } + + + public String getName() + { + String tmp = "LocatorFile" ; + if ( thisDir != null ) + tmp = tmp+"("+thisDir+")" ; + return tmp ; + } +} http://git-wip-us.apache.org/repos/asf/jena/blob/5b0eaa4a/src/main/java/riotcmd/ModDest.java ---------------------------------------------------------------------- diff --git a/src/main/java/riotcmd/ModDest.java b/src/main/java/riotcmd/ModDest.java new file mode 100644 index 0000000..739adcd --- /dev/null +++ b/src/main/java/riotcmd/ModDest.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package riotcmd; + +import arq.cmd.CmdException; +import arq.cmdline.ArgDecl; +import arq.cmdline.ArgModuleGeneral; +import arq.cmdline.CmdArgModule; +import arq.cmdline.CmdGeneral; + +public class ModDest implements ArgModuleGeneral{ + + private ArgDecl argDest = new ArgDecl(ArgDecl.HasValue, "dest") ; + private String dest = null ; + + @Override + public void processArgs(CmdArgModule cmdLine) { + if ( cmdLine.contains(argDest) ) { + dest = cmdLine.getValue(argDest) ; + } else { + throw new CmdException("No destination output file! Please add '--dest=file' in the program arguements") ; + } + } + + @Override + public void registerWith(CmdGeneral cmdLine) { + cmdLine.getUsage().startCategory("Destination Output") ; + cmdLine.add(argDest, "--dest=file", "The destination output file") ; + } + + public String getDest() { + return dest ; + } + +} http://git-wip-us.apache.org/repos/asf/jena/blob/5b0eaa4a/src/main/java/riotcmd/csv2rdf.java ---------------------------------------------------------------------- diff --git a/src/main/java/riotcmd/csv2rdf.java b/src/main/java/riotcmd/csv2rdf.java new file mode 100644 index 0000000..882a29a --- /dev/null +++ b/src/main/java/riotcmd/csv2rdf.java @@ -0,0 +1,205 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package riotcmd; + +import java.io.OutputStream; + +import org.apache.jena.atlas.io.IO; +import org.apache.jena.atlas.web.ContentType; +import org.apache.jena.atlas.web.TypedInputStream; +import org.apache.jena.riot.Lang; +import org.apache.jena.riot.RDFDataMgr; +import org.apache.jena.riot.RDFLanguages; +import org.apache.jena.riot.ReaderRIOT; +import org.apache.jena.riot.RiotException; +import org.apache.jena.riot.SysRIOT; +import org.apache.jena.riot.lang.LabelToNode; +import org.apache.jena.riot.lang.StreamRDFCounting; +import org.apache.jena.riot.out.NodeToLabel; +import org.apache.jena.riot.process.inf.InfFactory; +import org.apache.jena.riot.system.ErrorHandler; +import org.apache.jena.riot.system.ErrorHandlerFactory; +import org.apache.jena.riot.system.RiotLib; +import org.apache.jena.riot.system.StreamRDF; +import org.apache.jena.riot.system.StreamRDF2; +import org.apache.jena.riot.system.StreamRDFLib; +import org.apache.jena.riot.system.SyntaxLabels; + +import arq.cmd.CmdException; + +import com.hp.hpl.jena.sparql.util.Utils; + +/** + * It's a command line tool for direct and scalable transforming from CSV to the formatted RDF syntax (i.e. N-Triples), + * with no intermediary Graph or PropertyTable. + * + * It reuses the parsing functions from CmdLangParse and sinks the triples into the destination output file. + * + */ +public class csv2rdf extends CmdLangParse{ + + protected ModDest modDest = new ModDest() ; + protected OutputStream destOut; + + public static void main(String... argv) + { + new csv2rdf(argv).mainRun() ; + } + + protected csv2rdf(String[] argv) + { + super(argv) ; + super.addModule(modDest) ; + + } + + @Override + protected Lang selectLang(String filename, ContentType contentType, + Lang dftLang) { + return RDFLanguages.CSV; + } + + @Override + protected String getCommandName() { + return Utils.classShortName(csv2rdf.class) ; + } + + @Override + protected String getSummary() + { + return getCommandName()+" --dest=outputFile inputFile ..." ; + } + + // override the original CmdLangParse.parseRIOT() + protected void parseRIOT(String baseURI, String filename, TypedInputStream in) + { + + String dest = modDest.getDest(); + LocatorOupputFile l = new LocatorOupputFile(); + destOut = l.open(dest); + + if (destOut == null){ + System.err.println("Can't write to destination output file: '"+dest+"' ") ; + return ; + } + + // I ti s shame we effectively duplicate deciding thelnaguage but we want to control the + // pasrer at a deep level (in validation, we want line numbers get into error message) + // This code predates RDFDataMgr. + + ContentType ct = in.getMediaType() ; + + baseURI = SysRIOT.chooseBaseIRI(baseURI, filename) ; + + boolean checking = true ; + if ( modLangParse.explicitChecking() ) checking = true ; + if ( modLangParse.explicitNoChecking() ) checking = false ; + + ErrorHandler errHandler = null ; + if ( checking ) + { + if ( modLangParse.stopOnBadTerm() ) + errHandler = ErrorHandlerFactory.errorHandlerStd ; + else + // Try to go on if possible. This is the default behaviour. + errHandler = ErrorHandlerFactory.errorHandlerWarn ; + } + + if ( modLangParse.skipOnBadTerm() ) + { + // TODO skipOnBadterm + } + + Lang lang = selectLang(filename, ct, RDFLanguages.NQUADS) ; + LangHandler handler = dispatch.get(lang) ; + if ( handler == null ) + throw new CmdException("Undefined language: "+lang) ; + + // If multiple files, choose the overall labels. + if ( langHandlerOverall == null ) + langHandlerOverall = handler ; + else + { + if ( langHandlerOverall != langHandlerAny ) + { + if ( langHandlerOverall != handler ) + langHandlerOverall = langHandlerAny ; + } + } + + // Make a flag. + // Input and output subflags. + // If input is "label, then output using NodeToLabel.createBNodeByLabelRaw() ; + // else use NodeToLabel.createBNodeByLabel() ; + // Also, as URI. + final boolean labelsAsGiven = false ; + + NodeToLabel labels = SyntaxLabels.createNodeToLabel() ; + if ( labelsAsGiven ) + labels = NodeToLabel.createBNodeByLabelEncoded() ; + + StreamRDF s = StreamRDFLib.sinkNull() ; + if ( ! modLangParse.toBitBucket() ) + s = StreamRDFLib.writer(output) ; + + // add dest output + if ( destOut != null) + s = new StreamRDF2(s, StreamRDFLib.writer(destOut)); + + if ( setup != null ) + s = InfFactory.inf(s, setup) ; + + StreamRDFCounting sink = StreamRDFLib.count(s) ; + s = null ; + + ReaderRIOT reader = RDFDataMgr.createReader(lang) ; + try { + if ( checking ) { + if ( lang == RDFLanguages.NTRIPLES || lang == RDFLanguages.NQUADS ) + reader.setParserProfile(RiotLib.profile(baseURI, false, true, errHandler)) ; + else + reader.setParserProfile(RiotLib.profile(baseURI, true, true, errHandler)) ; + } else + reader.setParserProfile(RiotLib.profile(baseURI, false, false, errHandler)) ; + + if ( labelsAsGiven ) + reader.getParserProfile().setLabelToNode(LabelToNode.createUseLabelAsGiven()) ; + modTime.startTimer() ; + reader.read(in, baseURI, ct, sink, null) ; + } catch (RiotException ex) { + // Should have handled the exception and logged a message by now. + // System.err.println("++++"+ex.getMessage()); + + if ( modLangParse.stopOnBadTerm() ) + return ; + } finally { + // Not close - we may write again to the underlying output stream in another call to parse a file. + sink.finish() ; + IO.close(in) ; + } + long x = modTime.endTimer() ; + long n = sink.countTriples()+sink.countQuads() ; + + if ( modTime.timingEnabled() ) + output(filename, n, x, handler) ; + + totalMillis += x ; + totalTuples += n ; + } +}
