srowen commented on a change in pull request #21632: [SPARK-19591][ML][MLlib]
Add sample weights to decision trees
URL: https://github.com/apache/spark/pull/21632#discussion_r243593058
##########
File path:
mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala
##########
@@ -77,17 +77,37 @@ abstract class Classifier[
* @note Throws `SparkException` if any label is a non-integer or is negative
*/
protected def extractLabeledPoints(dataset: Dataset[_], numClasses: Int):
RDD[LabeledPoint] = {
- require(numClasses > 0, s"Classifier (in extractLabeledPoints) found
numClasses =" +
- s" $numClasses, but requires numClasses > 0.")
+ validateNumClasses(numClasses)
dataset.select(col($(labelCol)), col($(featuresCol))).rdd.map {
case Row(label: Double, features: Vector) =>
- require(label % 1 == 0 && label >= 0 && label < numClasses,
s"Classifier was given" +
- s" dataset with invalid label $label. Labels must be integers in
range" +
- s" [0, $numClasses).")
+ validateLabel(label, numClasses)
LabeledPoint(label, features)
}
}
+ /**
+ * Validates that number of classes is greater than zero.
+ *
+ * @param numClasses Number of classes label can take.
+ */
+ protected def validateNumClasses(numClasses: Int): Unit = {
+ require(numClasses > 0, s"Classifier (in extractLabeledPoints) found
numClasses =" +
+ s" $numClasses, but requires numClasses > 0.")
+ }
+
+ /**
+ * Validates the label on the classifier is a valid integer in the range [0,
numClasses).
+ *
+ * @param label The label to validate.
+ * @param numClasses Number of classes label can take. Labels must be
integers in the range
+ * [0, numClasses).
+ */
+ protected def validateLabel(label: Double, numClasses: Int): Unit = {
+ require(label % 1 == 0 && label >= 0 && label < numClasses, s"Classifier
was given" +
Review comment:
I still think `% 1` is a cryptic way to check that the label is a whole number.
Just compare the value to its `.toLong` instead, e.g. `label == label.toLong`.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]