Another patch for the date-time parser.
It adds multilingual support for era (BC/AD) parsing
and for Greek date templates.
All tests are passing, except for those that didn't pass before :)
Merry Christmas to all!!!
cheers,
Dimitris
Index: core/src/main/scala/org/dbpedia/extraction/dataparser/DateTimeParser.scala
===================================================================
--- core/src/main/scala/org/dbpedia/extraction/dataparser/DateTimeParser.scala (revision 3872)
+++ core/src/main/scala/org/dbpedia/extraction/dataparser/DateTimeParser.scala (revision )
@@ -32,6 +32,12 @@
"en" -> "st|nd|rd|th",
"el" -> "η|ης"
)
+ // Era markers per language, written as regex fragments: -1 is for BC-type eras, 1 for all others ("BCE" is a BC-type era).
+ //TODO matches anything e.g. 20 bd
+ private val eraStr = Map(
+ "en" -> Map("BCE" -> (-1), "BC" -> (-1), "CE"-> 1, "AD"-> 1, "AC"-> (-1)),
+ "el" -> Map("ΠΧ"-> (-1), "Π\\.Χ\\."-> (-1), "Π\\.Χ"-> (-1) , "ΜΧ"-> 1 , "Μ\\.Χ\\."-> 1, "Μ\\.Χ"-> 1)
+ )
private val logger = Logger.getLogger(classOf[UnitValueParser].getName)
private val prefix = if(strict) """\s*""" else """.*?"""
@@ -42,10 +48,10 @@
private val DateRegex1 = ("""(?iu)""" + prefix + """([0-9]{1,2})\s*("""+months(language).keySet.mkString("|")+""")\s*([0-9]{2})(?!\d).*""" + postfix).r
// catch dates like: "[[29 January]] [[300 AD]]", "[[23 June]] [[2008]] (UTC)", "09:32, 6 March 2000 (UTC)" or "3 June 1981"
- private val DateRegex2 =("""(?iu)""" + prefix + """(?<!\d)\[?\[?([0-9]{1,2})(\.|""" + cardinality(language) + """)?\s*("""+months(language).keySet.mkString("|")+""")\]?\]?,? \[?\[?([0-9]{1,4})\s*(BCE|BC|CE|AD|AC|CE)?\]?\]?(?!\d)""" + postfix).r
+ private val DateRegex2 =("""(?iu)""" + prefix + """(?<!\d)\[?\[?([0-9]{1,2})(\.|""" + cardinality(language) + """)?\s*("""+months(language).keySet.mkString("|")+""")\]?\]?,? \[?\[?([0-9]{1,4})\s*(""" + eraStr(language).keySet.mkString("|") + """)?\]?\]?(?!\d)""" + postfix).r
// catch dates like: "[[January 20]] [[1995 AD]]", "[[June 17]] [[2008]] (UTC)" or "January 20 1995"
- private val DateRegex3 = ("""(?iu)""" + prefix + """\[?\[?("""+months(language).keySet.mkString("|")+""")\s*,?\s+([0-9]{1,2})\]?\]?\s*[.,]?\s+\[?\[?([0-9]{1,4})\s*(BCE|BC|CE|AD|AC|CE)?\]?\]?""" + postfix).r
+ private val DateRegex3 = ("""(?iu)""" + prefix + """\[?\[?("""+months(language).keySet.mkString("|")+""")\s*,?\s+([0-9]{1,2})\]?\]?\s*[.,]?\s+\[?\[?([0-9]{1,4})\s*(""" + eraStr(language).keySet.mkString("|") + """)?\]?\]?""" + postfix).r
// catch dates like: "24-06-1867", "24/06/1867" or "bla24-06-1867bla"
private val DateRegex4 = ("""(?iu)""" + prefix + """(?<!\d)([0-9]{1,2}+)[-/]([0-9]{1,2}+)[-/]([0-9]{3,4}+)(?!\d)""" + postfix).r
@@ -60,9 +66,10 @@
private val DayMonthRegex2 = ("""(?iu)""" + prefix + """(?<!\d)([1-9]|0[1-9]|[12][0-9]|3[01])\s*(""" + cardinality(language) + """)?\]?\]?\s*(of)?\s*\[?\[?("""+months(language).keySet.mkString("|")+""")\]?\]?""" + postfix).r
- private val MonthYearRegex = ("""(?iu)""" + prefix + """("""+months(language).keySet.mkString("|")+""")\]?\]?,?\s*\[?\[?([0-9]{1,4})\s*(BCE|BC|CE|AD|AC|CE)?""" + postfix).r
+ private val MonthYearRegex = ("""(?iu)""" + prefix + """("""+months(language).keySet.mkString("|")+""")\]?\]?,?\s*\[?\[?([0-9]{1,4})\s*(""" + eraStr(language).keySet.mkString("|") + """)?""" + postfix).r
- private val YearRegexes = for(i <- (1 to 4).reverse) yield (prefix + """(?<![\d\pL\w])(\d{""" + i + """})(?!\d)\s*(BCE|BC|CE|AD|AC|CE)?""" + postfix).r
+ // added the (?iu) flags so that the era markers are matched case-insensitively
+ private val YearRegexes = for(i <- (1 to 4).reverse) yield ("""(?iu)""" + prefix + """(?<![\d\pL\w])(\d{""" + i + """})(?!\d)\s*(""" + eraStr(language).keySet.mkString("|") + """)?""" + postfix).r
override def parse(node : Node) : Option[Date] =
{
@@ -96,87 +103,115 @@
private def catchTemplate(node: TemplateNode) : Option[Date] =
{
- val templateName = extractionContext.redirects.resolve(node.title).decoded
+ val templateName = extractionContext.redirects.resolve(node.title).decoded
- val childrenChilds = for(child <- node.children) yield
+ val childrenChilds = for(child <- node.children) yield
{ for(childrenChild @ TextNode(_, _)<- child.children) yield childrenChild }
+ if (language == "en")
+ {
- if (templateName == "Birth-date")
- {
- for (property <- node.property("1");
- TextNode(text, _) <- property.children)
- {
- return findDate(text)
- }
- }
- // http://en.wikipedia.org/wiki/Template:Birth_date_and_age
- // {{Birth date|year_of_birth|month_of_birth|day_of_birth|...}}
- // Sometimes the templates are used wrong like this:
- // {{birth date|df=yes|1833|10|21}}
- // TODO: fix problem with gYear gDate e.q. Alfred Nobel
- else if (templateName == "Birth date and age" || templateName == "Birth date and age2" ||
- templateName == "Death date and age" || templateName == "Birth date" ||
- templateName == "Death date" || templateName == "Bda" || templateName == "Dob")
- {
- for (yearProperty <- node.property("1"); monthProperty <- node.property("2"); dayProperty <- node.property("3");
- year <- yearProperty.children.collect{case TextNode(text, _) => text}.headOption;
- month <- monthProperty.children.collect{case TextNode(text, _) => text}.headOption;
- day <- dayProperty.children.collect{case TextNode(text, _) => text}.headOption)
- {
- try
- {
- return Some(new Date(Some(year.toInt), Some(month.toInt), Some(day.toInt), datatype))
- }
- catch
- {
- case e : IllegalArgumentException =>
- }
- }
- }
- // http://en.wikipedia.org/wiki/Template:BirthDeathAge
- // {{BirthDeathAge|birth_or_death_flag|year_of_birth|month_of_birth|day_of_birth|year_of_death|month_of_death|day_of_death|...}}
- else if (templateName == "Birth Death Age")
- {
- // gets the text from the single textNode of the first PropertyNode
- // {{BirthDeathAge|BIRTH_OR_DEATH_FLAG|year_of_birth|month_of_birth|day_of_birth|year_of_death|month_of_death|day_of_death|...}}
- for (property <- node.property("1")) property.retrieveText match
- {
- case Some("B") =>
- {
- for (yearProperty <- node.property("2"); monthProperty <- node.property("3"); dayProperty <- node.property("4");
- year <- yearProperty.children.collect{case TextNode(text, _) => text}.headOption;
- month <- monthProperty.children.collect{case TextNode(text, _) => text}.headOption;
- day <- dayProperty.children.collect{case TextNode(text, _) => text}.headOption)
- {
- try
- {
- return Some(new Date(Some(year.toInt), Some(month.toInt), Some(day.toInt), datatype))
- }
- catch
- {
- case e : IllegalArgumentException =>
- }
- }
- }
- case _ =>
- {
- for (yearProperty <- node.property("5"); monthProperty <- node.property("6"); dayProperty <- node.property("7");
- year <- yearProperty.children.collect{case TextNode(text, _) => text}.headOption;
- month <- monthProperty.children.collect{case TextNode(text, _) => text}.headOption;
- day <- dayProperty.children.collect{case TextNode(text, _) => text}.headOption)
- {
- try
- {
- return Some(new Date(Some(year.toInt), Some(month.toInt), Some(day.toInt), datatype))
- }
- catch
- {
- case e : IllegalArgumentException =>
- }
- }
- }
- }
- }
+ if (templateName == "Birth-date")
+ {
+ for (property <- node.property("1");
+ TextNode(text, _) <- property.children)
+ {
+ return findDate(text)
+ }
+ }
+ // http://en.wikipedia.org/wiki/Template:Birth_date_and_age
+ // {{Birth date|year_of_birth|month_of_birth|day_of_birth|...}}
+ // Sometimes the templates are used wrong like this:
+ // {{birth date|df=yes|1833|10|21}}
+ // TODO: fix problem with gYear gDate e.q. Alfred Nobel
+ else if (templateName == "Birth date and age" || templateName == "Birth date and age2" ||
+ templateName == "Death date and age" || templateName == "Birth date" ||
+ templateName == "Death date" || templateName == "Bda" || templateName == "Dob")
+ {
+ for (yearProperty <- node.property("1"); monthProperty <- node.property("2"); dayProperty <- node.property("3");
+ year <- yearProperty.children.collect{case TextNode(text, _) => text}.headOption;
+ month <- monthProperty.children.collect{case TextNode(text, _) => text}.headOption;
+ day <- dayProperty.children.collect{case TextNode(text, _) => text}.headOption)
+ {
+ try
+ {
+ return Some(new Date(Some(year.toInt), Some(month.toInt), Some(day.toInt), datatype))
+ }
+ catch
+ {
+ case e : IllegalArgumentException =>
+ }
+ }
+ }
+ // http://en.wikipedia.org/wiki/Template:BirthDeathAge
+ // {{BirthDeathAge|birth_or_death_flag|year_of_birth|month_of_birth|day_of_birth|year_of_death|month_of_death|day_of_death|...}}
+ else if (templateName == "Birth Death Age")
+ {
+ // gets the text from the single textNode of the first PropertyNode
+ // {{BirthDeathAge|BIRTH_OR_DEATH_FLAG|year_of_birth|month_of_birth|day_of_birth|year_of_death|month_of_death|day_of_death|...}}
+ for (property <- node.property("1")) property.retrieveText match
+ {
+ case Some("B") =>
+ {
+ for (yearProperty <- node.property("2"); monthProperty <- node.property("3"); dayProperty <- node.property("4");
+ year <- yearProperty.children.collect{case TextNode(text, _) => text}.headOption;
+ month <- monthProperty.children.collect{case TextNode(text, _) => text}.headOption;
+ day <- dayProperty.children.collect{case TextNode(text, _) => text}.headOption)
+ {
+ try
+ {
+ return Some(new Date(Some(year.toInt), Some(month.toInt), Some(day.toInt), datatype))
+ }
+ catch
+ {
+ case e : IllegalArgumentException =>
+ }
+ }
+ }
+ case _ =>
+ {
+ for (yearProperty <- node.property("5"); monthProperty <- node.property("6"); dayProperty <- node.property("7");
+ year <- yearProperty.children.collect{case TextNode(text, _) => text}.headOption;
+ month <- monthProperty.children.collect{case TextNode(text, _) => text}.headOption;
+ day <- dayProperty.children.collect{case TextNode(text, _) => text}.headOption)
+ {
+ try
+ {
+ return Some(new Date(Some(year.toInt), Some(month.toInt), Some(day.toInt), datatype))
+ }
+ catch
+ {
+ case e : IllegalArgumentException =>
+ }
+ }
+ }
+ }
+ }
+ }
+ else if (language == "el")
+ {
+ // {{...|birth_year|birth_month|birth_day}} or
+ // {{...|death_year|death_month|death_day|birth_year|birth_month|birth_day}}: in both cases only the first 3 parameters are parsed
+ if (templateName.toLowerCase == "ηγη" || templateName.toLowerCase == "ημερομηνία γέννησης και ηλικία" ||
+ templateName.toLowerCase == "ηθηλ" || templateName.toLowerCase == "ημερομηνία θανάτου και ηλικία" ||
+ templateName.toLowerCase == "ημερομηνία γέννησης")
+ {
+ for (yearProperty <- node.property("1"); monthProperty <- node.property("2"); dayProperty <- node.property("3");
+ year <- yearProperty.children.collect{case TextNode(text, _) => text}.headOption;
+ month <- monthProperty.children.collect{case TextNode(text, _) => text}.headOption;
+ day <- dayProperty.children.collect{case TextNode(text, _) => text}.headOption)
+ {
+ try
+ {
+ return Some(new Date(Some(year.toInt), Some(month.toInt), Some(day.toInt), datatype))
+ }
+ catch
+ {
+ case e : IllegalArgumentException =>
+ }
+ }
+
+ }
+ }
logger.log(Level.FINE, "Template unknown: " + node.title);
return None
}
@@ -254,11 +289,7 @@
for(DateRegex2(day, dunno, month, year, era) <- List(input))
{
- var eraIdentifier = ""
- if ((era != null) && ((era.substring(0,2).toUpperCase == "BC") || (era.substring(0,2).toUpperCase == "AC")))
- {
- eraIdentifier = "-"
- }
+ var eraIdentifier = getEraSign(era)
try
{
val monthNumber = months(language)(month.toLowerCase())
@@ -272,11 +303,7 @@
for(DateRegex3(month, day, year, era) <- List(input))
{
- var eraIdentifier = ""
- if ((era != null) && ((era.substring(0,2).toUpperCase == "BC") || (era.substring(0,2).toUpperCase == "AC")))
- {
- eraIdentifier = "-"
- }
+ var eraIdentifier = getEraSign(era)
try
{
val monthNumber = months(language)(month.toLowerCase())
@@ -355,11 +382,7 @@
val month = result.group(1)
val year = result.group(2)
val era = result.group(3)
- var eraIdentifier = ""
- if ((era != null) && ((era.substring(0,2).toUpperCase == "BC" || era.substring(0,2).toUpperCase == "AC")))
- {
- eraIdentifier = "-"
- }
+ var eraIdentifier = getEraSign(era)
try
{
val monthNumber = months(language)(month.toLowerCase())
@@ -381,13 +404,8 @@
{
case yearRegex(year, era) =>
{
- var eraIdentifier = ""
+ var eraIdentifier = getEraSign(era)
-
+
- if ((era != null) && ((era.substring(0,2).toUpperCase == "BC") || (era.substring(0,2).toUpperCase == "AC")))
- {
- eraIdentifier = "-"
- }
-
return new Some(new Date(year = Some((eraIdentifier+year).toInt), datatype = datatype))
}
case _ =>
@@ -402,4 +420,19 @@
case TextNode(text, _) => text
case _ => node.children.map(nodeToString).mkString
}
+
+ /**
+ * Returns "-" if `input` starts with an era marker that eraStr(language) maps to -1
+ * (compared case-insensitively), and "" otherwise; a null `input` also yields "".
+ */
+ private def getEraSign(input : String) : String =
+ {
+ if (input == null) ""
+ else
+ {
+ // eraStr keys are regex fragments, so a literal '.' is stored as '\\.'
+ val tmpInp = input.replace(".", "\\.").toLowerCase
-}
\ No newline at end of file
+ if (eraStr(language).exists{ case (key, value) => value == (-1) && tmpInp.startsWith(key.toLowerCase) }) "-" else ""
+ }
+ }
+}
\ No newline at end of file
Index: core/src/test/scala/org/dbpedia/extraction/dataparser/DateTimeParserTest.scala
===================================================================
--- core/src/test/scala/org/dbpedia/extraction/dataparser/DateTimeParserTest.scala (revision 3676)
+++ core/src/test/scala/org/dbpedia/extraction/dataparser/DateTimeParserTest.scala (revision )
@@ -435,6 +435,85 @@
parse("en", "xsd:date", "grr10/10/2007bla") should equal (Some("2007-10-10"))
}
+ //greek date tests
+
+ "DataParser" should "return date (02 Μαρτίου 151)" in
+ {
+ parse("el", "xsd:date", "02 Μαρτίου 151") should equal (Some("0151-03-02"))
+ }
+ "DataParser" should "return gYear (20 π.Χ.)" in
+ {
+ parse("el", "xsd:gYear", "20 π.Χ.") should equal (Some("-0020"))
+ }
+ "DataParser" should "return gYear (20 πΧ)" in
+ {
+ parse("el", "xsd:gYear", "20 πΧ") should equal (Some("-0020"))
+ }
+ "DataParser" should "return gYear (20 Π.Χ.)" in
+ {
+ parse("el", "xsd:gYear", "20 Π.Χ.") should equal (Some("-0020"))
+ }
+ "DataParser" should "return gYear (20 ΠΧ)" in
+ {
+ parse("el", "xsd:gYear", "20 ΠΧ") should equal (Some("-0020"))
+ }
+ "DataParser" should "return gYear (20 μ.Χ.)" in
+ {
+ parse("el", "xsd:gYear", "20 μ.Χ.") should equal (Some ("0020"))
+ }
+ "DataParser" should "return gYear (20 μΧ)" in
+ {
+ parse("el", "xsd:gYear", "20 μΧ") should equal (Some ("0020"))
+ }
+ /*"DataParser" should "return gYear (14ος αιώνας)" in
+ {
+ parse("el", "xsd:gYear", "14ος αιώνας") should equal (Some("1300"))
+ }*/
+ "DataParser" should "return gMonthDay (4η ιουλίου)" in
+ {
+ parse("el", "xsd:gMonthDay", "4η ιουλίου") should equal (Some("--07-04"))
+ }
+ "DataParser" should "return gYearMonth (σεπτέμβριος 2007)" in
+ {
+ parse("el", "xsd:gYearMonth", "σεπτέμβριος 2007") should equal (Some("2007-09"))
+ }
+ "DataParser" should "return gYearMonth (1[[429 ιανουαρίου]] [[300 μ.Χ.]])" in
+ {
+ parse("el", "xsd:gYearMonth", "1[[429 ιανουαρίου]] [[300 μ.Χ.]]") should equal (Some("0300-01"))
+ }
+ "DataParser" should "return date (ιούνιος, 21 2007 π.Χ.)" in
+ {
+ parse("el", "xsd:date", "ιούνιος, 21 2007 π.Χ.") should equal (Some("-2007-06-21"))
+ }
+ "DataParser" should "return date (1η δεκεμβρίου 2006)" in
+ {
+ parse("el", "xsd:date", "1η δεκεμβρίου 2006") should equal (Some("2006-12-01"))
+ }
+ "DataParser" should "return date ([[1η μαΐου]] [[2006]])" in
+ {
+ parse("el", "xsd:date", "[[1η μαΐου]] [[2006]]") should equal (Some("2006-05-01"))
+ }
+ "DataParser" should "return date (12 ιουνίου 2008)" in
+ {
+ parse("el", "xsd:date", "12 ιουνίου 2008") should equal (Some("2008-06-12"))
+ }
+ "DataParser" should "return date (12 ιούνιος 2008)" in
+ {
+ parse("el", "xsd:date", "12 ιούνιος 2008") should equal (Some("2008-06-12"))
+ }
+ "DataParser" should "return date ([[2 Νοεμβρίου]] [[1911]])" in
+ {
+ parse("el", "xsd:date", "[[2 Νοεμβρίου]] [[1911]]") should equal (Some("1911-11-02"))
+ }
+ "DataParser" should "return date ({{ηθηλ|1996|03|18|1911|11|2}})" in
+ {
+ parse("el", "xsd:date", "{{ηθηλ|1996|03|18|1911|11|2}}") should equal (Some("1996-03-18"))
+ }
+ "DataParser" should "return date ({{ηγη|1996|03|18}})" in
+ {
+ parse("el", "xsd:date", "{{ηγη|1996|03|18}}") should equal (Some("1996-03-18")) // exercises the short ηγη alias named in the description
+ }
+
private val wikiParser = WikiParser()
private def parse(language : String, datatype : String, input : String) : Option[String] =
@@ -447,4 +526,4 @@
dateParser.parse(wikiParser(page)).map(_.toString)
}
-}
\ No newline at end of file
+}
------------------------------------------------------------------------------
Forrester recently released a report on the Return on Investment (ROI) of
Google Apps. They found a 300% ROI, 38%-56% cost savings, and break-even
within 7 months. Over 3 million businesses have gone Google with Google Apps:
an online email calendar, and document program that's accessible from your
browser. Read the Forrester report: http://p.sf.net/sfu/googleapps-sfnew
_______________________________________________
Dbpedia-discussion mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/dbpedia-discussion