Another patch for the date-time parser.

It adds multilingual support for era (BC) parsing
and for Greek template dates.
All tests are working, except for those that weren't :)

Merry Christmas to all!!!

cheers,
Dimitris
Index: core/src/main/scala/org/dbpedia/extraction/dataparser/DateTimeParser.scala
===================================================================
--- core/src/main/scala/org/dbpedia/extraction/dataparser/DateTimeParser.scala	(revision 3872)
+++ core/src/main/scala/org/dbpedia/extraction/dataparser/DateTimeParser.scala	(revision )
@@ -32,6 +32,12 @@
         "en" -> "st|nd|rd|th",
         "el" -> "η|ης"
     )
+    // Era markers per language; keys are regex alternatives. -1 = BC-type era (getEraSign yields "-"), 1 = AD/CE-type era
+    //TODO matches anything e.g. 20 bd
+    private val eraStr =  Map(
+        "en" -> Map("BCE" -> (-1), "BC" -> (-1), "CE"-> 1, "AD"-> 1, "AC"-> (-1)),
+        "el" -> Map("ΠΧ"-> (-1), "Π\\.Χ\\."-> (-1), "Π\\.Χ"-> (-1) , "ΜΧ"-> 1 , "Μ\\.Χ\\."-> 1, "Μ\\.Χ"-> 1)
+    )
     private val logger = Logger.getLogger(classOf[UnitValueParser].getName)
 
     private val prefix = if(strict) """\s*""" else """.*?"""
@@ -42,10 +48,10 @@
     private val DateRegex1 = ("""(?iu)""" + prefix + """([0-9]{1,2})\s*("""+months(language).keySet.mkString("|")+""")\s*([0-9]{2})(?!\d).*""" + postfix).r
 
     // catch dates like: "[[29 January]] [[300 AD]]", "[[23 June]] [[2008]] (UTC)", "09:32, 6 March 2000 (UTC)" or "3 June 1981"
-    private val DateRegex2 =("""(?iu)""" + prefix + """(?<!\d)\[?\[?([0-9]{1,2})(\.|""" + cardinality(language) + """)?\s*("""+months(language).keySet.mkString("|")+""")\]?\]?,? \[?\[?([0-9]{1,4})\s*(BCE|BC|CE|AD|AC|CE)?\]?\]?(?!\d)""" + postfix).r
+    private val DateRegex2 =("""(?iu)""" + prefix + """(?<!\d)\[?\[?([0-9]{1,2})(\.|""" + cardinality(language) + """)?\s*("""+months(language).keySet.mkString("|")+""")\]?\]?,? \[?\[?([0-9]{1,4})\s*(""" + eraStr(language).keySet.mkString("|") + """)?\]?\]?(?!\d)""" + postfix).r
 
     // catch dates like: "[[January 20]] [[1995 AD]]", "[[June 17]] [[2008]] (UTC)" or "January 20 1995"
-    private val DateRegex3 = ("""(?iu)""" + prefix + """\[?\[?("""+months(language).keySet.mkString("|")+""")\s*,?\s+([0-9]{1,2})\]?\]?\s*[.,]?\s+\[?\[?([0-9]{1,4})\s*(BCE|BC|CE|AD|AC|CE)?\]?\]?""" + postfix).r
+    private val DateRegex3 = ("""(?iu)""" + prefix + """\[?\[?("""+months(language).keySet.mkString("|")+""")\s*,?\s+([0-9]{1,2})\]?\]?\s*[.,]?\s+\[?\[?([0-9]{1,4})\s*(""" + eraStr(language).keySet.mkString("|") + """)?\]?\]?""" + postfix).r
 
     // catch dates like: "24-06-1867", "24/06/1867" or "bla24-06-1867bla"
     private val DateRegex4 = ("""(?iu)""" + prefix + """(?<!\d)([0-9]{1,2}+)[-/]([0-9]{1,2}+)[-/]([0-9]{3,4}+)(?!\d)""" + postfix).r
@@ -60,9 +66,10 @@
 
     private val DayMonthRegex2 = ("""(?iu)""" + prefix + """(?<!\d)([1-9]|0[1-9]|[12][0-9]|3[01])\s*(""" + cardinality(language) + """)?\]?\]?\s*(of)?\s*\[?\[?("""+months(language).keySet.mkString("|")+""")\]?\]?""" + postfix).r
 
-    private val MonthYearRegex = ("""(?iu)""" + prefix + """("""+months(language).keySet.mkString("|")+""")\]?\]?,?\s*\[?\[?([0-9]{1,4})\s*(BCE|BC|CE|AD|AC|CE)?""" + postfix).r
+    private val MonthYearRegex = ("""(?iu)""" + prefix + """("""+months(language).keySet.mkString("|")+""")\]?\]?,?\s*\[?\[?([0-9]{1,4})\s*(""" + eraStr(language).keySet.mkString("|") + """)?""" + postfix).r
 
-    private val YearRegexes = for(i <- (1 to 4).reverse) yield (prefix + """(?<![\d\pL\w])(\d{""" + i + """})(?!\d)\s*(BCE|BC|CE|AD|AC|CE)?""" + postfix).r
+    //added case insensitive match
+    private val YearRegexes = for(i <- (1 to 4).reverse) yield ("""(?iu)""" + prefix + """(?<![\d\pL\w])(\d{""" + i + """})(?!\d)\s*(""" + eraStr(language).keySet.mkString("|") + """)?""" + postfix).r
 
    	override def parse(node : Node) : Option[Date] =
     {
@@ -96,87 +103,115 @@
 
     private def catchTemplate(node: TemplateNode) : Option[Date] =
     {
-    	val templateName = extractionContext.redirects.resolve(node.title).decoded
+    	  val templateName = extractionContext.redirects.resolve(node.title).decoded
 
-		val childrenChilds = for(child <- node.children) yield
+		    val childrenChilds = for(child <- node.children) yield
             { for(childrenChild @ TextNode(_, _)<- child.children) yield childrenChild }
 
+        if (language == "en")
+        {
-        if (templateName == "Birth-date")
-        {
-            for (property <- node.property("1");
-                 TextNode(text, _) <- property.children)
-            {
-                return findDate(text)
-            }
-        }
-        // http://en.wikipedia.org/wiki/Template:Birth_date_and_age
-        // {{Birth date|year_of_birth|month_of_birth|day_of_birth|...}}
-        // Sometimes the templates are used wrong like this:
-        // {{birth date|df=yes|1833|10|21}}
-        // TODO: fix problem with gYear gDate e.q. Alfred Nobel
-        else if (templateName == "Birth date and age" || templateName == "Birth date and age2" ||
-            templateName == "Death date and age" || templateName == "Birth date" ||
-            templateName == "Death date" || templateName == "Bda" || templateName == "Dob")
-        {
-            for (yearProperty <- node.property("1"); monthProperty <- node.property("2"); dayProperty <- node.property("3");
-                year <- yearProperty.children.collect{case TextNode(text, _) => text}.headOption;
-                month <- monthProperty.children.collect{case TextNode(text, _) => text}.headOption;
-                day <- dayProperty.children.collect{case TextNode(text, _) => text}.headOption)
-            {
-                try
-                {
-                    return Some(new Date(Some(year.toInt), Some(month.toInt), Some(day.toInt), datatype))
-                }
-                catch
-                {
-                    case e : IllegalArgumentException =>    
-                }
-            }
-        }
-        // http://en.wikipedia.org/wiki/Template:BirthDeathAge
-        // {{BirthDeathAge|birth_or_death_flag|year_of_birth|month_of_birth|day_of_birth|year_of_death|month_of_death|day_of_death|...}}
-        else if (templateName == "Birth Death Age")
-        {
-            // gets the text from the single textNode of the first PropertyNode
-            // {{BirthDeathAge|BIRTH_OR_DEATH_FLAG|year_of_birth|month_of_birth|day_of_birth|year_of_death|month_of_death|day_of_death|...}}
-            for (property <- node.property("1")) property.retrieveText match
-		    {
-                case Some("B") =>
-                {
-                    for (yearProperty <- node.property("2"); monthProperty <- node.property("3"); dayProperty <- node.property("4");
-                        year <- yearProperty.children.collect{case TextNode(text, _) => text}.headOption;
-                        month <- monthProperty.children.collect{case TextNode(text, _) => text}.headOption;
-                        day <- dayProperty.children.collect{case TextNode(text, _) => text}.headOption)
-                    {
-                        try
-                        {
-                            return Some(new Date(Some(year.toInt), Some(month.toInt), Some(day.toInt), datatype))
-                        }
-                        catch
-                        {
-                            case e : IllegalArgumentException =>
-                        }
-                    }
-                }
-                case _ =>
-                {
-                    for (yearProperty <- node.property("5"); monthProperty <- node.property("6"); dayProperty <- node.property("7");
-                        year <- yearProperty.children.collect{case TextNode(text, _) => text}.headOption;
-                        month <- monthProperty.children.collect{case TextNode(text, _) => text}.headOption;
-                        day <- dayProperty.children.collect{case TextNode(text, _) => text}.headOption)
-                    {
-                        try
-                        {
-                            return Some(new Date(Some(year.toInt), Some(month.toInt), Some(day.toInt), datatype))
-                        }
-                        catch
-                        {
-                            case e : IllegalArgumentException =>
-                        }
-                    }
-                }
-		    }
-        }
+            if (templateName == "Birth-date")
+            {
+                for (property <- node.property("1");
+                     TextNode(text, _) <- property.children)
+                {
+                    return findDate(text)
+                }
+            }
+            // http://en.wikipedia.org/wiki/Template:Birth_date_and_age
+            // {{Birth date|year_of_birth|month_of_birth|day_of_birth|...}}
+            // Sometimes the templates are used wrong like this:
+            // {{birth date|df=yes|1833|10|21}}
+            // TODO: fix problem with gYear gDate e.q. Alfred Nobel
+            else if (templateName == "Birth date and age" || templateName == "Birth date and age2" ||
+                templateName == "Death date and age" || templateName == "Birth date" ||
+                templateName == "Death date" || templateName == "Bda" || templateName == "Dob")
+            {
+                for (yearProperty <- node.property("1"); monthProperty <- node.property("2"); dayProperty <- node.property("3");
+                    year <- yearProperty.children.collect{case TextNode(text, _) => text}.headOption;
+                    month <- monthProperty.children.collect{case TextNode(text, _) => text}.headOption;
+                    day <- dayProperty.children.collect{case TextNode(text, _) => text}.headOption)
+                {
+                    try
+                    {
+                        return Some(new Date(Some(year.toInt), Some(month.toInt), Some(day.toInt), datatype))
+                    }
+                    catch
+                    {
+                        case e : IllegalArgumentException =>
+                    }
+                }
+            }
+            // http://en.wikipedia.org/wiki/Template:BirthDeathAge
+            // {{BirthDeathAge|birth_or_death_flag|year_of_birth|month_of_birth|day_of_birth|year_of_death|month_of_death|day_of_death|...}}
+            else if (templateName == "Birth Death Age")
+            {
+                // gets the text from the single textNode of the first PropertyNode
+                // {{BirthDeathAge|BIRTH_OR_DEATH_FLAG|year_of_birth|month_of_birth|day_of_birth|year_of_death|month_of_death|day_of_death|...}}
+                for (property <- node.property("1")) property.retrieveText match
+                {
+                    case Some("B") =>
+                    {
+                        for (yearProperty <- node.property("2"); monthProperty <- node.property("3"); dayProperty <- node.property("4");
+                            year <- yearProperty.children.collect{case TextNode(text, _) => text}.headOption;
+                            month <- monthProperty.children.collect{case TextNode(text, _) => text}.headOption;
+                            day <- dayProperty.children.collect{case TextNode(text, _) => text}.headOption)
+                        {
+                            try
+                            {
+                                return Some(new Date(Some(year.toInt), Some(month.toInt), Some(day.toInt), datatype))
+                            }
+                            catch
+                            {
+                                case e : IllegalArgumentException =>
+                            }
+                        }
+                    }
+                    case _ =>
+                    {
+                        for (yearProperty <- node.property("5"); monthProperty <- node.property("6"); dayProperty <- node.property("7");
+                            year <- yearProperty.children.collect{case TextNode(text, _) => text}.headOption;
+                            month <- monthProperty.children.collect{case TextNode(text, _) => text}.headOption;
+                            day <- dayProperty.children.collect{case TextNode(text, _) => text}.headOption)
+                        {
+                            try
+                            {
+                                return Some(new Date(Some(year.toInt), Some(month.toInt), Some(day.toInt), datatype))
+                            }
+                            catch
+                            {
+                                case e : IllegalArgumentException =>
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else if (language == "el")
+        {
+            //birth_year|birth_month|birth_day}} //same, parse the first 3 each time
+            //death_year|death_month|death_day|birth_year|birth_month|birth_day}}
+            if (templateName.toLowerCase == "ηγη"  || templateName.toLowerCase == "ημερομηνία γέννησης και ηλικία" ||
+                templateName.toLowerCase == "ηθηλ" || templateName.toLowerCase == "ημερομηνία θανάτου και ηλικία" ||
+                templateName.toLowerCase == "ημερομηνία γέννησης")
+            {
+                for (yearProperty <- node.property("1"); monthProperty <- node.property("2"); dayProperty <- node.property("3");
+                    year <- yearProperty.children.collect{case TextNode(text, _) => text}.headOption;
+                    month <- monthProperty.children.collect{case TextNode(text, _) => text}.headOption;
+                    day <- dayProperty.children.collect{case TextNode(text, _) => text}.headOption)
+                {
+                    try
+                    {
+                        return Some(new Date(Some(year.toInt), Some(month.toInt), Some(day.toInt), datatype))
+                    }
+                    catch
+                    {
+                        case e : IllegalArgumentException =>
+                    }
+                }
+
+            }
+        }
        	logger.log(Level.FINE, "Template unknown: " + node.title);
         return None
     }
@@ -254,11 +289,7 @@
 
         for(DateRegex2(day, dunno, month, year, era) <- List(input))
         {
-            var eraIdentifier = ""
-            if ((era != null) && ((era.substring(0,2).toUpperCase == "BC") || (era.substring(0,2).toUpperCase == "AC")))
-            {
-                eraIdentifier = "-"
-            }
+            var eraIdentifier = getEraSign(era)
             try
             {
                 val monthNumber = months(language)(month.toLowerCase())
@@ -272,11 +303,7 @@
 
         for(DateRegex3(month, day, year, era) <- List(input))
         {
-            var eraIdentifier = ""
-            if ((era != null) && ((era.substring(0,2).toUpperCase == "BC") || (era.substring(0,2).toUpperCase == "AC")))
-            {
-                eraIdentifier = "-"
-            }
+            var eraIdentifier = getEraSign(era)
             try
             {
                 val monthNumber = months(language)(month.toLowerCase())
@@ -355,11 +382,7 @@
     		val month = result.group(1)
     		val year = result.group(2)
     		val era = result.group(3)
-        	var eraIdentifier = ""
-        	if ((era != null) && ((era.substring(0,2).toUpperCase == "BC" || era.substring(0,2).toUpperCase == "AC")))
-        	{
-       			eraIdentifier = "-"
-        	}
+        var eraIdentifier = getEraSign(era)
 	    	try
 	    	{
 	    		val monthNumber = months(language)(month.toLowerCase())
@@ -381,13 +404,8 @@
             {
                 case yearRegex(year, era) =>                 
                 {
-                    var eraIdentifier = ""
+                    var eraIdentifier = getEraSign(era)
-                
+                    
-                    if ((era != null) && ((era.substring(0,2).toUpperCase == "BC") || (era.substring(0,2).toUpperCase == "AC")))
-                    {
-                        eraIdentifier = "-"
-                    }
-                    
                     return new Some(new Date(year = Some((eraIdentifier+year).toInt), datatype = datatype))
                 }
                 case _ =>
@@ -402,4 +420,19 @@
         case TextNode(text, _) => text
         case _ => node.children.map(nodeToString).mkString
     }
+
+    private def getEraSign(input : String) : String =
+    {
+        if (input == null) return ""
+
+        // eraStr keys are regex fragments in which '.' is written as '\\.', so escape the input the same way before comparing
+        val tmpInp = input.replace(".", "\\.").toLowerCase
+        for ( (key, value) <- eraStr(language)
+              if value==(-1))
+        {
+            if (key.toLowerCase == tmpInp.substring(0,Math.min(key.size,tmpInp.size)) )
+                return "-"
-}
\ No newline at end of file
+        }
+        return ""
+    }
+}
\ No newline at end of file
Index: core/src/test/scala/org/dbpedia/extraction/dataparser/DateTimeParserTest.scala
===================================================================
--- core/src/test/scala/org/dbpedia/extraction/dataparser/DateTimeParserTest.scala	(revision 3676)
+++ core/src/test/scala/org/dbpedia/extraction/dataparser/DateTimeParserTest.scala	(revision )
@@ -435,6 +435,85 @@
         parse("en", "xsd:date", "grr10/10/2007bla") should equal (Some("2007-10-10"))
     }
 
+    //greek date tests
+
+    "DataParser" should "return date (02 Μαρτίου 151)" in
+    {
+        parse("el", "xsd:date", "02 Μαρτίου 151") should equal (Some("0151-03-02"))
+    }
+    "DataParser" should "return gYear (20 π.Χ.)" in
+    {
+        parse("el", "xsd:gYear", "20 π.Χ.") should equal (Some("-0020"))
+    }
+    "DataParser" should "return gYear (20 πΧ)" in
+    {
+        parse("el", "xsd:gYear", "20 πΧ") should equal (Some("-0020"))
+    }
+    "DataParser" should "return gYear (20 Π.Χ.)" in
+    {
+        parse("el", "xsd:gYear", "20 Π.Χ.") should equal (Some("-0020"))
+    }
+    "DataParser" should "return gYear (20 ΠΧ)" in
+    {
+        parse("el", "xsd:gYear", "20 ΠΧ") should equal (Some("-0020"))
+    }
+    "DataParser" should "return gYear (20 μ.Χ.)" in
+    {
+        parse("el", "xsd:gYear", "20 μ.Χ.") should equal (Some ("0020"))
+    }
+    "DataParser" should "return gYear (20 μΧ)" in
+    {
+        parse("el", "xsd:gYear", "20 μΧ") should equal (Some ("0020"))
+    }
+    /*"DataParser" should "return gYear (14ος αιώνας)" in
+    {
+        parse("el", "xsd:gYear", "14ος αιώνας") should equal (Some("1300"))
+    }*/
+    "DataParser" should "return gMonthDay (4η ιουλίου)" in
+    {
+        parse("el", "xsd:gMonthDay", "4η ιουλίου") should equal (Some("--07-04"))
+    }
+    "DataParser" should "return gYearMonth (σεπτέμβριος 2007)" in
+    {
+        parse("el", "xsd:gYearMonth", "σεπτέμβριος 2007") should equal (Some("2007-09"))
+    }
+    "DataParser" should "return gYearMonth (1[[429 ιανουαρίου]] [[300 μ.Χ.]])" in
+    {
+        parse("el", "xsd:gYearMonth", "1[[429 ιανουαρίου]] [[300 μ.Χ.]]") should equal (Some("0300-01"))
+    }
+    "DataParser" should "return date (ιούνιος, 21 2007 π.Χ.)" in
+    {
+        parse("el", "xsd:date", "ιούνιος, 21 2007 π.Χ.") should equal (Some("-2007-06-21"))
+    }
+    "DataParser" should "return date (1η δεκεμβρίου 2006)" in
+    {
+        parse("el", "xsd:date", "1η δεκεμβρίου 2006") should equal (Some("2006-12-01"))
+    }
+    "DataParser" should "return date ([[1η μαΐου]] [[2006]])" in
+    {
+        parse("el", "xsd:date", "[[1η μαΐου]] [[2006]]") should equal (Some("2006-05-01"))
+    }
+    "DataParser" should "return date (12 ιουνίου 2008)" in
+    {
+        parse("el", "xsd:date", "12 ιουνίου 2008") should equal (Some("2008-06-12"))
+    }
+    "DataParser" should "return date (12 ιούνιος 2008)" in
+    {
+        parse("el", "xsd:date", "12 ιούνιος 2008") should equal (Some("2008-06-12"))
+    }
+    "DataParser" should "return date ([[2 Νοεμβρίου]] [[1911]])" in
+    {
+        parse("el", "xsd:date", "[[2 Νοεμβρίου]] [[1911]]") should equal (Some("1911-11-02"))
+    }
+    "DataParser" should "return date ({{ηθηλ|1996|03|18|1911|11|2}})" in
+    {
+        parse("el", "xsd:date", "{{ηθηλ|1996|03|18|1911|11|2}}") should equal (Some("1996-03-18"))
+    }
+    "DataParser" should "return date ({{ηγη|1996|03|18}})" in
+    {
+        parse("el", "xsd:date", "{{ηγη|1996|03|18}}") should equal (Some("1996-03-18"))
+    }
+
     private val wikiParser = WikiParser()
 
     private def parse(language : String, datatype : String, input : String) : Option[String] =
@@ -447,4 +526,4 @@
 
         dateParser.parse(wikiParser(page)).map(_.toString)
     }
-}
\ No newline at end of file
+}
------------------------------------------------------------------------------
Forrester recently released a report on the Return on Investment (ROI) of
Google Apps. They found a 300% ROI, 38%-56% cost savings, and break-even
within 7 months.  Over 3 million businesses have gone Google with Google Apps:
an online email calendar, and document program that's accessible from your 
browser. Read the Forrester report: http://p.sf.net/sfu/googleapps-sfnew
_______________________________________________
Dbpedia-discussion mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/dbpedia-discussion

Reply via email to