In a previous thread there was a discussion about URIs / IRIs.
I created a patch so that the encoding can be selected in the configuration
file.
I tested it and I believe it works fine, but I wouldn't mind a second
opinion.
I'd like some help with another issue:
I am trying to create a new option where the resources will be created with
their original name (not the English translation).
Can someone point me to the source files that deal with this?
Regards,
Kontokostas Dimitris
### Eclipse Workspace Patch 1.0
#P DBPedia
Index: dump/src/main/scala/org/dbpedia/extraction/dump/ConfigLoader.scala
===================================================================
--- dump/src/main/scala/org/dbpedia/extraction/dump/ConfigLoader.scala (revision 3807)
+++ dump/src/main/scala/org/dbpedia/extraction/dump/ConfigLoader.scala (working copy)
@@ -1,6 +1,6 @@
package org.dbpedia.extraction.dump
-import _root_.org.dbpedia.extraction.destinations.formatters.{NTriplesFormatter, NQuadsFormatter}
+import _root_.org.dbpedia.extraction.destinations.formatters.{NTriplesFormatter, NQuadsFormatter, NTriplesFormatterAsIRI, NQuadsFormatterAsIRI }
import _root_.org.dbpedia.extraction.destinations.{FileDestination, CompositeDestination}
import _root_.org.dbpedia.extraction.mappings._
import java.net.URL
@@ -49,7 +49,7 @@
//Load languages
if(config.getProperty("languages") == null) throw new IllegalArgumentException("Property 'languages' not defined.")
val languages = config.getProperty("languages").split("\\s+").map(_.trim).toList
-
+
//Load property updateDumps
val update = Option(config.getProperty("updateDumps")).getOrElse(return false).trim.toLowerCase match
{
@@ -84,8 +84,17 @@
val extractor = Extractor.load(config.ontologySource, mappingsSource, config.commonsSource, articlesSource, config.extractors(language), language)
//Destination
- val tripleDestination = new FileDestination(new NTriplesFormatter(), config.outputDir, dataset => language.filePrefix + "/" + dataset.name + "_" + language.filePrefix + ".nt")
- val quadDestination = new FileDestination(new NQuadsFormatter(), config.outputDir, dataset => language.filePrefix + "/" + dataset.name + "_" + language.filePrefix + ".nq")
+ val tripleDestination = // N-Triples output (.nt): NTriplesFormatter when usePercentEncoding is set, otherwise NTriplesFormatterAsIRI
+ if (config.usePercentEncoding )
+ new FileDestination(new NTriplesFormatter(), config.outputDir, dataset => language.filePrefix + "/" + dataset.name + "_" + language.filePrefix + ".nt")
+ else
+ new FileDestination(new NTriplesFormatterAsIRI(), config.outputDir, dataset => language.filePrefix + "/" + dataset.name + "_" + language.filePrefix + ".nt")
+ val quadDestination = // N-Quads output (.nq): NQuadsFormatter when usePercentEncoding is set, otherwise NQuadsFormatterAsIRI
+ if (config.usePercentEncoding )
+ new FileDestination(new NQuadsFormatter(), config.outputDir, dataset => language.filePrefix + "/" + dataset.name + "_" + language.filePrefix + ".nq")
+ else
+ new FileDestination(new NQuadsFormatterAsIRI(), config.outputDir, dataset => language.filePrefix + "/" + dataset.name + "_" + language.filePrefix + ".nq")
+
val destination = new CompositeDestination(tripleDestination, quadDestination)
new ExtractionJob(extractor, articlesSource, destination, "Extraction Job for " + language.wikiCode + " Wikipedia")
@@ -100,7 +109,15 @@
/** Output directory */
if(config.getProperty("outputDir") == null) throw new IllegalArgumentException("Property 'outputDir' not defined.")
val outputDir = new File(config.getProperty("outputDir"))
-
+
+ /** Percent encoding: mandatory boolean property; true selects the plain formatters, false the AsIRI formatters */
+ if(config.getProperty("usePercentEncoding") == null) throw new IllegalArgumentException("Property 'usePercentEncoding' not defined.")
+ val usePercentEncoding = config.getProperty("usePercentEncoding").trim.toLowerCase match
+ {
+ case BooleanLiteral(b) => b
+ case _ => throw new IllegalArgumentException("Invalid value for property 'usePercentEncoding'")
+ }
+
/** Languages */
if(config.getProperty("languages") == null) throw new IllegalArgumentException("Property 'languages' not defined.")
private val languages = config.getProperty("languages").split("\\s+").map(_.trim).toList
Index: dump/config.properties.default
===================================================================
--- dump/config.properties.default (revision 3807)
+++ dump/config.properties.default (working copy)
@@ -1,6 +1,7 @@
dumpDir=d:/wikipediaDump
outputDir=d:/output
updateDumps=true
+usePercentEncoding=true
extractors=org.dbpedia.extraction.mappings.LabelExtractor \
org.dbpedia.extraction.mappings.WikiPageExtractor \
Index: core/src/main/scala/org/dbpedia/extraction/destinations/formatters/NQuadsFormatterAsIRI.scala
===================================================================
--- core/src/main/scala/org/dbpedia/extraction/destinations/formatters/NQuadsFormatterAsIRI.scala (revision 0)
+++ core/src/main/scala/org/dbpedia/extraction/destinations/formatters/NQuadsFormatterAsIRI.scala (revision 0)
@@ -0,0 +1,16 @@
+package org.dbpedia.extraction.destinations.formatters
+
+import java.io.Writer
+import org.dbpedia.extraction.destinations.{Quad, Formatter}
+
+/**
+ * Formats statements according to the N-Quads format, rendering each quad with Quad.renderNQuadAsIRI.
+ * See: http://sw.deri.org/2008/07/n-quads/
+ */
+class NQuadsFormatterAsIRI extends Formatter
+{
+ override def write(quad : Quad, writer : Writer) : Unit =
+ {
+ writer.write(quad.renderNQuadAsIRI + "\n") // one statement per line
+ }
+}
Property changes on: core/src/main/scala/org/dbpedia/extraction/destinations/formatters/NQuadsFormatterAsIRI.scala
___________________________________________________________________
Added: svn:executable
+ *
Index: core/src/main/scala/org/dbpedia/extraction/destinations/Quad.scala
===================================================================
--- core/src/main/scala/org/dbpedia/extraction/destinations/Quad.scala (revision 3807)
+++ core/src/main/scala/org/dbpedia/extraction/destinations/Quad.scala (working copy)
@@ -2,6 +2,7 @@
import org.dbpedia.extraction.ontology.datatypes.Datatype
import java.net.URI
+import java.net.URLDecoder
import org.dbpedia.extraction.mappings.ExtractionContext
import org.dbpedia.extraction.ontology.{OntologyProperty}
import java.io.CharConversionException
@@ -40,26 +41,41 @@
context : String,
datatype : Datatype = null ) = this(extractionContext, dataset, subject, Quad.validatePredicate(predicate, datatype), value, context, Quad.getType(predicate, datatype))
- def renderNTriple = render(false)
+ def renderNTriple = render(false, false)
+ def renderNTripleAsIRI = render(false, true)
- def renderNQuad = render(true)
+ def renderNQuad = render(true, false)
+ def renderNQuadAsIRI = render(true, true)
override def toString = renderNQuad
- private def render(includeContext : Boolean) : String =
+ private def render(includeContext : Boolean, encodeAsIRI : Boolean) : String =
{
val sb = new StringBuilder
- sb append "<" append subject append "> "
+ sb append "<"
+ if (encodeAsIRI)
+ toIRIstring(sb, subject)
+ else
+ sb append subject
+ sb append "> "
- sb append "<" append predicate append "> "
+ sb append "<" ;
+ if (encodeAsIRI)
+ toIRIstring(sb, predicate)
+ else
+ sb append predicate
+ sb append "> "
if (datatype != null)
{
if (datatype.uri == "http://www.w3.org/2001/XMLSchema#string")
{
- sb append '"'
- escapeString(sb, value)
+ sb append '"';
+ if (encodeAsIRI)
+ toIRIstring(sb, value)
+ else
+ escapeString(sb, value)
sb append "\""
sb append "@" + extractionContext.language.locale.getLanguage + " "
@@ -67,18 +83,35 @@
else
{
sb append '"'
- escapeString(sb, value)
+ if (encodeAsIRI)
+ toIRIstring(sb, value)
+ else
+ escapeString(sb, value)
sb append "\"^^<" append datatype.uri append "> "
}
}
else
{
- sb append '<' append value append "> "
+ sb append '<'
+
+ if (encodeAsIRI)
+ toIRIstring(sb, value)
+ else
+ sb append value
+
+ sb append "> "
}
if (includeContext)
{
- sb append '<' append context append "> "
+ sb append '<'
+
+ if (encodeAsIRI)
+ toIRIstring(sb, context)
+ else
+ sb append context
+
+ sb append "> "
}
sb append '.'
@@ -148,6 +181,23 @@
}
return sb
}
+
+ /**
+ * Appends input to sb with its percent-escapes decoded as UTF-8, a first step towards the IRI format (RFC 3987). N-Triples format can
+ * accept unicode according to http://www.w3.org/2001/sw/RDFCore/ntriples/ Section 5.3
+ */
+ private def toIRIstring(sb : StringBuilder, input : String) : StringBuilder =
+ {
+ //the java.net.URI will include a toIRIString() method in the next release
+ //http://download.oracle.com/javase/tutorial/i18n/network/iri.html
+ //http://download.oracle.com/docs/cd/E17802_01/j2se/javase/6/jcp/beta/apidiffs/java/net/URI.html
+
+ //this is a temporary solution and equivalent to the old php framework
+ //just unescaping, not checking for "unaccepted chars"; '+' is protected first because URLDecoder (a form decoder) would turn it into a space
+ sb append URLDecoder.decode(input.replace("+", "%2B"), "UTF-8")
+
+ }
+
}
object Quad
Index: core/src/main/scala/org/dbpedia/extraction/destinations/formatters/NTriplesFormatterAsIRI.scala
===================================================================
--- core/src/main/scala/org/dbpedia/extraction/destinations/formatters/NTriplesFormatterAsIRI.scala (revision 0)
+++ core/src/main/scala/org/dbpedia/extraction/destinations/formatters/NTriplesFormatterAsIRI.scala (revision 0)
@@ -0,0 +1,16 @@
+package org.dbpedia.extraction.destinations.formatters
+
+import java.io.Writer
+import org.dbpedia.extraction.destinations.{Formatter, Quad}
+
+/**
+ * Formats statements according to the N-Triples format, rendering each quad with Quad.renderNTripleAsIRI.
+ * See: http://www.w3.org/2001/sw/RDFCore/ntriples/
+ */
+class NTriplesFormatterAsIRI extends Formatter
+{
+ override def write(quad : Quad, writer : Writer) : Unit =
+ {
+ writer.write(quad.renderNTripleAsIRI + "\n") // one statement per line
+ }
+}
Property changes on: core/src/main/scala/org/dbpedia/extraction/destinations/formatters/NTriplesFormatterAsIRI.scala
___________________________________________________________________
Added: svn:executable
+ *
------------------------------------------------------------------------------
Download new Adobe(R) Flash(R) Builder(TM) 4
The new Adobe(R) Flex(R) 4 and Flash(R) Builder(TM) 4 (formerly
Flex(R) Builder(TM)) enable the development of rich applications that run
across multiple browsers and platforms. Download your free trials today!
http://p.sf.net/sfu/adobe-dev2dev
_______________________________________________
Dbpedia-discussion mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/dbpedia-discussion