ozancicek commented on a change in pull request #24939: [SPARK-18569][ML][R] Support RFormula arithmetic, I() and spark functions URL: https://github.com/apache/spark/pull/24939#discussion_r320793640
########## File path: mllib/src/main/scala/org/apache/spark/ml/feature/RFormulaParser.scala ########## @@ -269,40 +310,78 @@ private[ml] object RFormulaParser extends RegexParsers { } private val intercept: Parser[Term] = - "([01])".r ^^ { case a => Intercept(a == "1") } + skipSpace("([01])".r) ^^ { case a => Intercept(a == "1") } private val columnRef: Parser[ColumnRef] = - "([a-zA-Z]|\\.[a-zA-Z_])[a-zA-Z0-9._]*".r ^^ { case a => ColumnRef(a) } + skipSpace("([a-zA-Z]|\\.[a-zA-Z_])[a-zA-Z0-9._]*".r) ^^ { case a => ColumnRef(a) } - private val empty: Parser[ColumnRef] = "" ^^ { case a => ColumnRef("") } + private val empty: Parser[ColumnRef] = skipSpace("".r) ^^ { case a => ColumnRef("") } - private val label: Parser[ColumnRef] = columnRef | empty + private val label: Parser[Label] = evalExpr | columnRef | empty - private val dot: Parser[Term] = "\\.".r ^^ { case _ => Dot } + private val dot: Parser[Term] = skipSpace("\\.".r) ^^ { case _ => Dot } - private val parens: Parser[Term] = "(" ~> expr <~ ")" + private val parens: Parser[Term] = skipSpace("\\(".r) ~> expr <~ skipSpace("\\)".r) - private val term: Parser[Term] = parens | intercept | columnRef | dot + private val term: Parser[Term] = evalExpr | parens | intercept | columnRef | dot - private val pow: Parser[Term] = term ~ "^" ~ "^[1-9]\\d*".r ^^ { + private val pow: Parser[Term] = term ~ "^" ~ skipSpace("^[1-9]\\d*".r) ^^ { case base ~ "^" ~ degree => power(base, degree.toInt) } | term - private val interaction: Parser[Term] = pow * (":" ^^^ { interact _ }) + private val interaction: Parser[Term] = pow * (skipSpace("\\:".r) ^^^ { interact _ }) - private val factor = interaction * ("*" ^^^ { cross _ }) + private val factor = interaction * (skipSpace("\\*".r) ^^^ { cross _ }) - private val sum = factor * ("+" ^^^ { add _ } | - "-" ^^^ { subtract _ }) + private val sum = factor * (skipSpace("\\+".r) ^^^ { add _ } | + skipSpace("\\-".r) ^^^ { subtract _ }) private val expr = (sum | term) - private val formula: 
Parser[ParsedRFormula] = - (label ~ "~" ~ expr) ^^ { case r ~ "~" ~ t => ParsedRFormula(r, t.asTerms.terms) } + private val formula: Parser[ParsedRFormula] = (label ~ skipSpace("\\~".r) ~ expr) ^^ { + case r ~ "~" ~ t => ParsedRFormula(r, t.asTerms.terms) } def parse(value: String): ParsedRFormula = parseAll(formula, value) match { case Success(result, _) => result case failure: NoSuccess => throw new IllegalArgumentException( "Could not parse formula: " + value) } } + +/** + * Parser for evaluated expressions in a formula. An evaluated expression is + * any alphanumeric identifiers followed by (), e.g. `func123(a+b, func())`, + * or anything inside `I()`. A valid expression is any string-parentheses product, + * such that if there are any parentheses ('(' or ')') they're all balanced. + */ +private[ml] trait EvalExprParser extends RegexParsers { Review comment: This one is responsible for parsing the function identifier, the parentheses, and everything inside the parentheses. Whatever is inside the parentheses is parsed together with its whitespace, so there is no need to skip there. Function identifiers don't contain whitespace either. One thing to additionally skip might be the whitespace between the function identifier and the opening parenthesis that follows it, as in `y ~ I ( log(b) )` or `y ~ exp (b)`. Currently, the above formulas would fail to parse due to the space between the function identifier and the parenthesis, e.g. `I (` and `exp (`. Should these be allowed? It's not hard to enable them, but I haven't really given much thought to what the unwanted side effects might be. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org