ozancicek commented on a change in pull request #24939: [SPARK-18569][ML][R] 
Support RFormula arithmetic, I() and spark functions
URL: https://github.com/apache/spark/pull/24939#discussion_r320793640
 
 

 ##########
 File path: 
mllib/src/main/scala/org/apache/spark/ml/feature/RFormulaParser.scala
 ##########
 @@ -269,40 +310,78 @@ private[ml] object RFormulaParser extends RegexParsers {
   }
 
   private val intercept: Parser[Term] =
-    "([01])".r ^^ { case a => Intercept(a == "1") }
+    skipSpace("([01])".r) ^^ { case a => Intercept(a == "1") }
 
   private val columnRef: Parser[ColumnRef] =
-    "([a-zA-Z]|\\.[a-zA-Z_])[a-zA-Z0-9._]*".r ^^ { case a => ColumnRef(a) }
+    skipSpace("([a-zA-Z]|\\.[a-zA-Z_])[a-zA-Z0-9._]*".r) ^^ { case a => 
ColumnRef(a) }
 
-  private val empty: Parser[ColumnRef] = "" ^^ { case a => ColumnRef("") }
+  private val empty: Parser[ColumnRef] = skipSpace("".r) ^^ { case a => 
ColumnRef("") }
 
-  private val label: Parser[ColumnRef] = columnRef | empty
+  private val label: Parser[Label] = evalExpr | columnRef | empty
 
-  private val dot: Parser[Term] = "\\.".r ^^ { case _ => Dot }
+  private val dot: Parser[Term] = skipSpace("\\.".r) ^^ { case _ => Dot }
 
-  private val parens: Parser[Term] = "(" ~> expr <~ ")"
+  private val parens: Parser[Term] = skipSpace("\\(".r) ~> expr <~ 
skipSpace("\\)".r)
 
-  private val term: Parser[Term] = parens | intercept | columnRef | dot
+  private val term: Parser[Term] = evalExpr | parens | intercept | columnRef | 
dot
 
-  private val pow: Parser[Term] = term ~ "^" ~ "^[1-9]\\d*".r ^^ {
+  private val pow: Parser[Term] = term ~ "^" ~ skipSpace("^[1-9]\\d*".r) ^^ {
     case base ~ "^" ~ degree => power(base, degree.toInt)
   } | term
 
-  private val interaction: Parser[Term] = pow * (":" ^^^ { interact _ })
+  private val interaction: Parser[Term] = pow * (skipSpace("\\:".r) ^^^ { 
interact _ })
 
-  private val factor = interaction * ("*" ^^^ { cross _ })
+  private val factor = interaction * (skipSpace("\\*".r) ^^^ { cross _ })
 
-  private val sum = factor * ("+" ^^^ { add _ } |
-    "-" ^^^ { subtract _ })
+  private val sum = factor * (skipSpace("\\+".r) ^^^ { add _ } |
+    skipSpace("\\-".r) ^^^ { subtract _ })
 
   private val expr = (sum | term)
 
-  private val formula: Parser[ParsedRFormula] =
-    (label ~ "~" ~ expr) ^^ { case r ~ "~" ~ t => ParsedRFormula(r, 
t.asTerms.terms) }
+  private val formula: Parser[ParsedRFormula] = (label ~ skipSpace("\\~".r) ~ 
expr) ^^ {
+    case r ~ "~" ~ t => ParsedRFormula(r, t.asTerms.terms) }
 
   def parse(value: String): ParsedRFormula = parseAll(formula, value) match {
     case Success(result, _) => result
     case failure: NoSuccess => throw new IllegalArgumentException(
       "Could not parse formula: " + value)
   }
 }
+
+/**
+ * Parser for evaluated expressions in a formula. An evaluated expression is
+ * any alphanumeric identifiers followed by (), e.g. `func123(a+b, func())`,
+ * or anything inside `I()`. A valid expression is any string-parentheses 
product,
+ * such that if there are any parentheses ('(' or ')') they're all balanced.
+ */
+private[ml] trait EvalExprParser extends RegexParsers {
 
 Review comment:
   This one is responsible for parsing the function identifier, the parentheses 
and everything inside them. Whatever is inside the parentheses is parsed 
together with its whitespace, so there is no need to skip there. Function 
identifiers don't contain whitespace either. One thing to additionally skip 
might be the whitespace between the function identifier and the opening 
parenthesis that follows it, such as:
   
   `y ~ I ( log(b) )`
   or
   `y ~ exp (b)`
   
   Currently, the above formulas would fail to parse due to the space between 
the function identifier and the opening parenthesis, e.g. `I (` and `exp (`. 
Should these be allowed? It wouldn't be hard to enable them, but I haven't 
really given much thought to what the unwanted side effects might be.
   

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to