Hi,

I attach 2 patches:
1) a fix for the infobox extractor, which creates invalid URIs for
wikiPageUsesTemplate (for non-English articles)
2) support for more internationalization options in DateTimeParser (keeping
in mind the previous bug I created ;)

cheers,
Jim
Index: core/src/main/scala/org/dbpedia/extraction/mappings/InfoboxExtractor.scala
===================================================================
--- core/src/main/scala/org/dbpedia/extraction/mappings/InfoboxExtractor.scala	(revision 3437)
+++ core/src/main/scala/org/dbpedia/extraction/mappings/InfoboxExtractor.scala	(revision )
@@ -9,6 +9,7 @@
 import org.dbpedia.extraction.util.StringUtils._
 import java.net.URLEncoder
 import org.dbpedia.extraction.destinations.{DBpediaDatasets, Graph, Quad}
+import org.dbpedia.extraction.wikiparser.impl.wikipedia.Namespaces
 
 /**
  * This extractor extracts all properties from all infoboxes.
@@ -27,6 +28,8 @@
 
     private val usesTemplateProperty = OntologyNamespaces.DBPEDIA_GENERAL_NAMESPACE + "wikiPageUsesTemplate"
 
+    private val templateNamespace = Namespaces.getNameForNamespace(extractionContext.language, WikiTitle.Namespace.Template)
+
     private val MinPropertyCount = 2
 
     private val MinPercentageOfExplicitPropertyKeys = 0.75
@@ -137,7 +140,7 @@
                 // TODO write only wikiPageUsesTemplate if properties extracted
                 if (propertiesFound && (!seenTemplates.contains(template.title.encoded)))
                 {
-                    quads ::= new Quad(extractionContext, DBpediaDatasets.Infoboxes, subjectUri, usesTemplateProperty, "http://dbpedia.org/resource/Template:"+template.title.encoded, template.sourceUri, null)
+                    quads ::= new Quad(extractionContext, DBpediaDatasets.Infoboxes, subjectUri, usesTemplateProperty, "http://dbpedia.org/resource/" + templateNamespace + ":" + template.title.encoded, template.sourceUri, null)
                     seenTemplates.add(template.title.encoded)
                 }
             }
Index: core/src/main/scala/org/dbpedia/extraction/dataparser/DateTimeParser.scala
===================================================================
--- core/src/main/scala/org/dbpedia/extraction/dataparser/DateTimeParser.scala	(revision 3884)
+++ core/src/main/scala/org/dbpedia/extraction/dataparser/DateTimeParser.scala	(revision )
@@ -32,6 +32,17 @@
         "en" -> "st|nd|rd|th",
         "el" -> "η|ης"
     )
+    // -1 is for BC
+    //TODO matches anything e.g. 20 bd
+    private val eraStr =  Map(
+        "en" -> Map("BCE" -> 1, "BC" -> (-1), "CE"-> 1, "AD"-> 1, "AC"-> (-1), "CE"-> 1),
+        "el" -> Map("ΠΧ"-> (-1), "Π\\.Χ\\."-> (-1), "Π\\.Χ"-> (-1) , "ΜΧ"-> 1 , "Μ\\.Χ\\."-> 1, "Μ\\.Χ"-> 1)
+    )
+
+    private val monthRegex = months(language).keySet.mkString("|")
+    private val cardinalityRegex = if (cardinality.contains(language)) cardinality(language) else cardinality("en")
+    private val eraRegex = if (eraStr.contains(language)) eraStr(language).keySet.mkString("|") else eraStr("en").keySet.mkString("|")
+
     private val logger = Logger.getLogger(classOf[UnitValueParser].getName)
 
     private val prefix = if(strict) """\s*""" else """.*?"""
@@ -39,30 +50,31 @@
     private val postfix = if(strict) """\s*""" else ".*"
 
     // catch dates like: "8 June 07" or "07 June 45"
-    private val DateRegex1 = ("""(?iu)""" + prefix + """([0-9]{1,2})\s*("""+months(language).keySet.mkString("|")+""")\s*([0-9]{2})(?!\d).*""" + postfix).r
+    private val DateRegex1 = ("""(?iu)""" + prefix + """([0-9]{1,2})\s*("""+monthRegex+""")\s*([0-9]{2})(?!\d).*""" + postfix).r
 
     // catch dates like: "[[29 January]] [[300 AD]]", "[[23 June]] [[2008]] (UTC)", "09:32, 6 March 2000 (UTC)" or "3 June 1981"
-    private val DateRegex2 =("""(?iu)""" + prefix + """(?<!\d)\[?\[?([0-9]{1,2})(\.|""" + cardinality.get(language).getOrElse("") + """)?\s*("""+months(language).keySet.mkString("|")+""")\]?\]?,? \[?\[?([0-9]{1,4})\s*(BCE|BC|CE|AD|AC|CE)?\]?\]?(?!\d)""" + postfix).r
+    private val DateRegex2 = ("""(?iu)""" + prefix + """(?<!\d)\[?\[?([0-9]{1,2})(\.|""" + cardinalityRegex + """)?\s*("""+monthRegex+""")\]?\]?,? \[?\[?([0-9]{1,4})\s*(""" + eraRegex + """)?\]?\]?(?!\d)""" + postfix).r
 
     // catch dates like: "[[January 20]] [[1995 AD]]", "[[June 17]] [[2008]] (UTC)" or "January 20 1995"
-    private val DateRegex3 = ("""(?iu)""" + prefix + """\[?\[?("""+months(language).keySet.mkString("|")+""")\s*,?\s+([0-9]{1,2})\]?\]?\s*[.,]?\s+\[?\[?([0-9]{1,4})\s*(BCE|BC|CE|AD|AC|CE)?\]?\]?""" + postfix).r
+    private val DateRegex3 = ("""(?iu)""" + prefix + """\[?\[?("""+monthRegex+""")\s*,?\s+([0-9]{1,2})\]?\]?\s*[.,]?\s+\[?\[?([0-9]{1,4})\s*(""" + eraRegex + """)?\]?\]?""" + postfix).r
 
     // catch dates like: "24-06-1867", "24/06/1867" or "bla24-06-1867bla"
     private val DateRegex4 = ("""(?iu)""" + prefix + """(?<!\d)([0-9]{1,2}+)[-/]([0-9]{1,2}+)[-/]([0-9]{3,4}+)(?!\d)""" + postfix).r
 
     // catch dates like: "24-june-1867", "24/avril/1867" or "bla24|juillet|1867bla"
-    private val DateRegex5 = ("""(?iu)""" + prefix + """(?<!\d)([0-9]{1,2}+)[-/\|]("""+months(language).keySet.mkString("|")+""")[-/\|]([0-9]{3,4}+)(?!\d)""" + postfix).r
+    private val DateRegex5 = ("""(?iu)""" + prefix + """(?<!\d)([0-9]{1,2}+)[-/\|](""" + monthRegex + """)[-/\|]([0-9]{3,4}+)(?!\d)""" + postfix).r
 
     // catch dates like: "1990 06 24", "1990-06-24", "1990/06/24" or "1977-01-01 00:00:00.000000"
     private val DateRegex6 = ("""(?iu)""" + prefix + """(?<!\d)([0-9]{3,4})[-/\s]([0-9]{1,2})[-/\s]([0-9]{1,2})(?!\d).*""").r
 
-    private val DayMonthRegex1 = ("""(?iu)""" + prefix + """("""+months(language).keySet.mkString("|")+""")\]?\]?\s*\[?\[?([1-9]|0[1-9]|[12][0-9]|3[01])(?!\d)""" + postfix).r
+    private val DayMonthRegex1 = ("""(?iu)""" + prefix + """("""+monthRegex+""")\]?\]?\s*\[?\[?([1-9]|0[1-9]|[12][0-9]|3[01])(?!\d)""" + postfix).r
 
-    private val DayMonthRegex2 = ("""(?iu)""" + prefix + """(?<!\d)([1-9]|0[1-9]|[12][0-9]|3[01])\s*(""" + cardinality.get(language).getOrElse("") + """)?\]?\]?\s*(of)?\s*\[?\[?("""+months(language).keySet.mkString("|")+""")\]?\]?""" + postfix).r
+    private val DayMonthRegex2 = ("""(?iu)""" + prefix + """(?<!\d)([1-9]|0[1-9]|[12][0-9]|3[01])\s*(""" + cardinalityRegex + """)?\]?\]?\s*(of)?\s*\[?\[?("""+monthRegex+""")\]?\]?""" + postfix).r
 
-    private val MonthYearRegex = ("""(?iu)""" + prefix + """("""+months(language).keySet.mkString("|")+""")\]?\]?,?\s*\[?\[?([0-9]{1,4})\s*(BCE|BC|CE|AD|AC|CE)?""" + postfix).r
+    private val MonthYearRegex = ("""(?iu)""" + prefix + """("""+monthRegex+""")\]?\]?,?\s*\[?\[?([0-9]{1,4})\s*(""" + eraRegex + """)?""" + postfix).r
 
-    private val YearRegexes = for(i <- (1 to 4).reverse) yield (prefix + """(?<![\d\pL\w])(\d{""" + i + """})(?!\d)\s*(BCE|BC|CE|AD|AC|CE)?""" + postfix).r
+    //added case insensitive match
+    private val YearRegexes = for(i <- (1 to 4).reverse) yield ("""(?iu)""" + prefix + """(?<![\d\pL\w])(\d{""" + i + """})(?!\d)\s*(""" + eraRegex + """)?""" + postfix).r
 
    	override def parse(node : Node) : Option[Date] =
     {
@@ -101,82 +113,110 @@
 		val childrenChilds = for(child <- node.children) yield
             { for(childrenChild @ TextNode(_, _)<- child.children) yield childrenChild }
 
+        if (language == "en")
+        {
-        if (templateName == "Birth-date")
-        {
-            for (property <- node.property("1");
-                 TextNode(text, _) <- property.children)
-            {
-                return findDate(text)
-            }
-        }
-        // http://en.wikipedia.org/wiki/Template:Birth_date_and_age
-        // {{Birth date|year_of_birth|month_of_birth|day_of_birth|...}}
-        // Sometimes the templates are used wrong like this:
-        // {{birth date|df=yes|1833|10|21}}
-        // TODO: fix problem with gYear gDate e.q. Alfred Nobel
-        else if (templateName == "Birth date and age" || templateName == "Birth date and age2" ||
-            templateName == "Death date and age" || templateName == "Birth date" ||
-            templateName == "Death date" || templateName == "Bda" || templateName == "Dob")
-        {
-            for (yearProperty <- node.property("1"); monthProperty <- node.property("2"); dayProperty <- node.property("3");
-                year <- yearProperty.children.collect{case TextNode(text, _) => text}.headOption;
-                month <- monthProperty.children.collect{case TextNode(text, _) => text}.headOption;
-                day <- dayProperty.children.collect{case TextNode(text, _) => text}.headOption)
-            {
-                try
-                {
-                    return Some(new Date(Some(year.toInt), Some(month.toInt), Some(day.toInt), datatype))
-                }
-                catch
-                {
-                    case e : IllegalArgumentException =>    
-                }
-            }
-        }
-        // http://en.wikipedia.org/wiki/Template:BirthDeathAge
-        // {{BirthDeathAge|birth_or_death_flag|year_of_birth|month_of_birth|day_of_birth|year_of_death|month_of_death|day_of_death|...}}
-        else if (templateName == "Birth Death Age")
-        {
-            // gets the text from the single textNode of the first PropertyNode
-            // {{BirthDeathAge|BIRTH_OR_DEATH_FLAG|year_of_birth|month_of_birth|day_of_birth|year_of_death|month_of_death|day_of_death|...}}
-            for (property <- node.property("1")) property.retrieveText match
-		    {
-                case Some("B") =>
-                {
-                    for (yearProperty <- node.property("2"); monthProperty <- node.property("3"); dayProperty <- node.property("4");
-                        year <- yearProperty.children.collect{case TextNode(text, _) => text}.headOption;
-                        month <- monthProperty.children.collect{case TextNode(text, _) => text}.headOption;
-                        day <- dayProperty.children.collect{case TextNode(text, _) => text}.headOption)
-                    {
-                        try
-                        {
-                            return Some(new Date(Some(year.toInt), Some(month.toInt), Some(day.toInt), datatype))
-                        }
-                        catch
-                        {
-                            case e : IllegalArgumentException =>
-                        }
-                    }
-                }
-                case _ =>
-                {
-                    for (yearProperty <- node.property("5"); monthProperty <- node.property("6"); dayProperty <- node.property("7");
-                        year <- yearProperty.children.collect{case TextNode(text, _) => text}.headOption;
-                        month <- monthProperty.children.collect{case TextNode(text, _) => text}.headOption;
-                        day <- dayProperty.children.collect{case TextNode(text, _) => text}.headOption)
-                    {
-                        try
-                        {
-                            return Some(new Date(Some(year.toInt), Some(month.toInt), Some(day.toInt), datatype))
-                        }
-                        catch
-                        {
-                            case e : IllegalArgumentException =>
-                        }
-                    }
-                }
-		    }
-        }
+            if (templateName == "Birth-date")
+            {
+                for (property <- node.property("1");
+                     TextNode(text, _) <- property.children)
+                {
+                    return findDate(text)
+                }
+            }
+            // http://en.wikipedia.org/wiki/Template:Birth_date_and_age
+            // {{Birth date|year_of_birth|month_of_birth|day_of_birth|...}}
+            // Sometimes the templates are used wrong like this:
+            // {{birth date|df=yes|1833|10|21}}
+            // TODO: fix problem with gYear gDate e.q. Alfred Nobel
+            else if (templateName == "Birth date and age" || templateName == "Birth date and age2" ||
+                templateName == "Death date and age" || templateName == "Birth date" ||
+                templateName == "Death date" || templateName == "Bda" || templateName == "Dob")
+            {
+                for (yearProperty <- node.property("1"); monthProperty <- node.property("2"); dayProperty <- node.property("3");
+                    year <- yearProperty.children.collect{case TextNode(text, _) => text}.headOption;
+                    month <- monthProperty.children.collect{case TextNode(text, _) => text}.headOption;
+                    day <- dayProperty.children.collect{case TextNode(text, _) => text}.headOption)
+                {
+                    try
+                    {
+                        return Some(new Date(Some(year.toInt), Some(month.toInt), Some(day.toInt), datatype))
+                    }
+                    catch
+                    {
+                        case e : IllegalArgumentException =>
+                    }
+                }
+            }
+            // http://en.wikipedia.org/wiki/Template:BirthDeathAge
+            // {{BirthDeathAge|birth_or_death_flag|year_of_birth|month_of_birth|day_of_birth|year_of_death|month_of_death|day_of_death|...}}
+            else if (templateName == "Birth Death Age")
+            {
+                // gets the text from the single textNode of the first PropertyNode
+                // {{BirthDeathAge|BIRTH_OR_DEATH_FLAG|year_of_birth|month_of_birth|day_of_birth|year_of_death|month_of_death|day_of_death|...}}
+                for (property <- node.property("1")) property.retrieveText match
+                {
+                    case Some("B") =>
+                    {
+                        for (yearProperty <- node.property("2"); monthProperty <- node.property("3"); dayProperty <- node.property("4");
+                            year <- yearProperty.children.collect{case TextNode(text, _) => text}.headOption;
+                            month <- monthProperty.children.collect{case TextNode(text, _) => text}.headOption;
+                            day <- dayProperty.children.collect{case TextNode(text, _) => text}.headOption)
+                        {
+                            try
+                            {
+                                return Some(new Date(Some(year.toInt), Some(month.toInt), Some(day.toInt), datatype))
+                            }
+                            catch
+                            {
+                                case e : IllegalArgumentException =>
+                            }
+                        }
+                    }
+                    case _ =>
+                    {
+                        for (yearProperty <- node.property("5"); monthProperty <- node.property("6"); dayProperty <- node.property("7");
+                            year <- yearProperty.children.collect{case TextNode(text, _) => text}.headOption;
+                            month <- monthProperty.children.collect{case TextNode(text, _) => text}.headOption;
+                            day <- dayProperty.children.collect{case TextNode(text, _) => text}.headOption)
+                        {
+                            try
+                            {
+                                return Some(new Date(Some(year.toInt), Some(month.toInt), Some(day.toInt), datatype))
+                            }
+                            catch
+                            {
+                                case e : IllegalArgumentException =>
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else if (language == "el")
+        {
+            //birth_year|birth_month|birth_day}} //same, parse the first 3 each time
+            //death_year|death_month|death_dat|birth_year|birth_month|birth_day}}
+            if (templateName.toLowerCase == "ηγη"  || templateName.toLowerCase == "ημερομηνία γέννησης και ηλικία" ||
+                templateName.toLowerCase == "ηθηλ" || templateName.toLowerCase == "ημερομηνία θανάτου και ηλικία" ||
+                templateName.toLowerCase == "ημερομηνία γέννησης")
+            {
+                for (yearProperty <- node.property("1"); monthProperty <- node.property("2"); dayProperty <- node.property("3");
+                    year <- yearProperty.children.collect{case TextNode(text, _) => text}.headOption;
+                    month <- monthProperty.children.collect{case TextNode(text, _) => text}.headOption;
+                    day <- dayProperty.children.collect{case TextNode(text, _) => text}.headOption)
+                {
+                    try
+                    {
+                        return Some(new Date(Some(year.toInt), Some(month.toInt), Some(day.toInt), datatype))
+                    }
+                    catch
+                    {
+                        case e : IllegalArgumentException =>
+                    }
+                }
+
+            }
+        }
        	logger.log(Level.FINE, "Template unknown: " + node.title);
         return None
     }
@@ -254,11 +294,7 @@
 
         for(DateRegex2(day, dunno, month, year, era) <- List(input))
         {
-            var eraIdentifier = ""
-            if ((era != null) && ((era.substring(0,2).toUpperCase == "BC") || (era.substring(0,2).toUpperCase == "AC")))
-            {
-                eraIdentifier = "-"
-            }
+            var eraIdentifier = getEraSign(era)
             try
             {
                 val monthNumber = months(language)(month.toLowerCase())
@@ -272,11 +308,7 @@
 
         for(DateRegex3(month, day, year, era) <- List(input))
         {
-            var eraIdentifier = ""
-            if ((era != null) && ((era.substring(0,2).toUpperCase == "BC") || (era.substring(0,2).toUpperCase == "AC")))
-            {
-                eraIdentifier = "-"
-            }
+            var eraIdentifier = getEraSign(era)
             try
             {
                 val monthNumber = months(language)(month.toLowerCase())
@@ -355,11 +387,7 @@
     		val month = result.group(1)
     		val year = result.group(2)
     		val era = result.group(3)
-        	var eraIdentifier = ""
-        	if ((era != null) && ((era.substring(0,2).toUpperCase == "BC" || era.substring(0,2).toUpperCase == "AC")))
-        	{
-       			eraIdentifier = "-"
-        	}
+        var eraIdentifier = getEraSign(era)
 	    	try
 	    	{
 	    		val monthNumber = months(language)(month.toLowerCase())
@@ -381,13 +409,8 @@
             {
                 case yearRegex(year, era) =>                 
                 {
-                    var eraIdentifier = ""
+                    var eraIdentifier = getEraSign(era)
-                
+                    
-                    if ((era != null) && ((era.substring(0,2).toUpperCase == "BC") || (era.substring(0,2).toUpperCase == "AC")))
-                    {
-                        eraIdentifier = "-"
-                    }
-                    
                     return new Some(new Date(year = Some((eraIdentifier+year).toInt), datatype = datatype))
                 }
                 case _ =>
@@ -402,4 +425,20 @@
         case TextNode(text, _) => text
         case _ => node.children.map(nodeToString).mkString
     }
+
+    private def getEraSign(input : String) : String =
+    {
+        if (input == null) return ""
+
+        // '.' is used in regex as '\\.'
+        val tmpInp = input.replace(".", "\\.").toLowerCase
+        val tmpMap = if (eraStr.contains(language)) eraStr(language) else eraStr("en")
+        for ( (key, value) <- tmpMap
+              if value==(-1))
+        {
+            if (key.toLowerCase == tmpInp.substring(0,Math.min(key.size,tmpInp.size)) )
+                return "-"
-}
\ No newline at end of file
+        }
+        return ""
+    }
+}
\ No newline at end of file
------------------------------------------------------------------------------
Protect Your Site and Customers from Malware Attacks
Learn about various malware tactics and how to avoid them. Understand 
malware threats, the impact they can have on your business, and how you 
can protect your company and customers by using code signing.
http://p.sf.net/sfu/oracle-sfdevnl
_______________________________________________
Dbpedia-discussion mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/dbpedia-discussion

Reply via email to