Hi,

I have attached a patch for the image extractor that adds support for the Greek language.

thanks,

Jim
### Eclipse Workspace Patch 1.0
#P DBPedia
Index: core/src/main/scala/org/dbpedia/extraction/mappings/ImageExtractor.scala
===================================================================
--- core/src/main/scala/org/dbpedia/extraction/mappings/ImageExtractor.scala	(revision 3650)
+++ core/src/main/scala/org/dbpedia/extraction/mappings/ImageExtractor.scala	(working copy)
@@ -16,9 +16,9 @@
 {
     private val language = extractionContext.language.wikiCode
 
-    require(Set("en").contains(language), "ImageExtractor does only support english extraction")
+    require(Set("en", "el").contains(language), "ImageExtractor does only support english extraction")
 
-    private val wikipediaUrlPrefix = "http://upload.wikimedia.org/wikipedia/en/";
+    private val wikipediaUrlPrefix = "http://upload.wikimedia.org/wikipedia/"+ language +"/"
     private val commonsUrlPrefix = "http://upload.wikimedia.org/wikipedia/commons/";
 
     private val nonFreeImages = new HashSet[String]();
@@ -29,8 +29,8 @@
     private val encodedLinkRegex = """%[0-9a-fA-F][0-9a-fA-F]""".r
 
     logger.info("Loadings images")
-    ImageExtractor.loadImages(extractionContext.articlesSource, freeWikipediaImages, nonFreeImages)
-    ImageExtractor.loadImages(extractionContext.commonsSource, null, nonFreeImages)
+    ImageExtractor.loadImages(extractionContext.articlesSource, freeWikipediaImages, nonFreeImages, language)
+    ImageExtractor.loadImages(extractionContext.commonsSource, null, nonFreeImages, language)
     logger.info("Images loaded from dump")
 
     private val dbpediaThumbnailProperty = extractionContext.ontology.getProperty("thumbnail").get
@@ -52,7 +52,9 @@
             quads ::= new Quad(extractionContext, DBpediaDatasets.Images, subjectUri, dbpediaThumbnailProperty, thumbnailUrl, sourceNode.sourceUri)
             quads ::= new Quad(extractionContext, DBpediaDatasets.Images, url, foafThumbnailProperty, thumbnailUrl, sourceNode.sourceUri)
 
-            val wikipediaImageUrl = "http://"; + language + ".wikipedia.org/wiki/Image:" + imageFileName 
+            val wikiLangUrl = Map( "en" -> "Image",
+							        "el" -> "Αρχείο")
+            val wikipediaImageUrl = "http://"; + language + ".wikipedia.org/wiki/"+ wikiLangUrl(language) +":" + imageFileName 
 
             quads ::= new Quad(extractionContext, DBpediaDatasets.Images, url, dcRightsProperty, wikipediaImageUrl, sourceNode.sourceUri)
             quads ::= new Quad(extractionContext, DBpediaDatasets.Images, thumbnailUrl, dcRightsProperty, wikipediaImageUrl, sourceNode.sourceUri)
@@ -174,13 +176,14 @@
 
 private object ImageExtractor
 {
-    val NonFreeRegex = """(?i)\{\{\s?non-free""".r
+    val NonFreeRegex = Map("en" -> """(?i)\{\{\s?non-free""".r,
+							"el" -> """(?i)\{\{\s?(εύλογη χρήση|σήμα|σήμα αθλητικού σωματείου|αφίσα ταινίας|σκηνή από ταινία|γραφικά υπολογιστή|εξώφυλλο άλμπουμ|εξώφυλλο βιβλίου|μη ελεύθερο έργο τέχνης|σελίδα κόμικς|σελίδα εφημερίδας|εικόνα-βιντεοπαιχνίδι|ιδιοκτησία Wikimedia)\s?\}\}""".r )
 
     private val ImageRegex = """(?i)[^\"/\*?<>|:]+\.(?:jpe?g|png|gif|svg)""".r
 
     private val ImageLinkRegex = """(?i).*\.(?:jpe?g|png|gif|svg)""".r
 
-    private def loadImages(source : Source, freeImages : MutableSet[String], nonFreeImages : MutableSet[String]) : Unit =
+    private def loadImages(source : Source, freeImages : MutableSet[String], nonFreeImages : MutableSet[String], lang : String) : Unit =
     {
         val logger = Logger.getLogger(classOf[ImageExtractor].getName)
         val startTime = System.nanoTime
@@ -188,7 +191,7 @@
         for(page <- source if page.title.namespace == WikiTitle.Namespace.File;
             ImageLinkRegex <- List(page.title.encoded) )
         {
-            NonFreeRegex.findFirstIn(page.source) match
+            NonFreeRegex(lang).findFirstIn(page.source) match
             {
                 case Some(_) => nonFreeImages += page.title.encoded
                 case None => if (freeImages != null) freeImages += page.title.encoded
------------------------------------------------------------------------------
Beautiful is writing same markup. Internet Explorer 9 supports
standards for HTML5, CSS3, SVG 1.1,  ECMAScript5, and DOM L2 & L3.
Spend less time writing and  rewriting code and more time creating great
experiences on the web. Be a part of the beta today.
http://p.sf.net/sfu/beautyoftheweb
_______________________________________________
Dbpedia-discussion mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/dbpedia-discussion

Reply via email to