Github user dbtsai commented on a diff in the pull request:
https://github.com/apache/spark/pull/14834#discussion_r78128581
--- Diff:
mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
---
@@ -460,33 +564,74 @@ class LogisticRegression @Since("1.2.0") (
as a result, no scaling is needed.
*/
val rawCoefficients = state.x.toArray.clone()
- var i = 0
- while (i < numFeatures) {
- rawCoefficients(i) *= { if (featuresStd(i) != 0.0) 1.0 /
featuresStd(i) else 0.0 }
- i += 1
+ val coefficientArray = Array.tabulate(numCoefficientSets *
numFeatures) { i =>
+ // flatIndex will loop through rawCoefficients, and skip the
intercept terms.
+ val flatIndex = if ($(fitIntercept)) i + i / numFeatures else i
+ val featureIndex = i % numFeatures
+ if (featuresStd(featureIndex) != 0.0) {
+ rawCoefficients(flatIndex) / featuresStd(featureIndex)
+ } else {
+ 0.0
+ }
+ }
+ val coefficientMatrix =
+ new DenseMatrix(numCoefficientSets, numFeatures,
coefficientArray, isTransposed = true)
+
+ if ($(regParam) == 0.0 && isMultinomial) {
+ /*
+ When no regularization is applied, the coefficients lack
identifiability because
+ we do not use a pivot class. We can add any constant value to
the coefficients and
+ get the same likelihood. So here, we choose the mean centered
coefficients for
+ reproducibility. This method follows the approach in glmnet,
described here:
+
+ Friedman, et al. "Regularization Paths for Generalized Linear
Models via
+ Coordinate Descent,"
https://core.ac.uk/download/files/153/6287975.pdf
+ */
+ val coefficientMean = coefficientMatrix.values.sum /
coefficientMatrix.values.length
+ coefficientMatrix.update(_ - coefficientMean)
}
- bcFeaturesStd.destroy(blocking = false)
- if ($(fitIntercept)) {
- (Vectors.dense(rawCoefficients.dropRight(1)).compressed,
rawCoefficients.last,
- arrayBuilder.result())
+ val interceptsArray: Array[Double] = if ($(fitIntercept)) {
+ Array.tabulate(numCoefficientSets) { i =>
+ val coefIndex = (i + 1) * numFeaturesPlusIntercept - 1
+ rawCoefficients(coefIndex)
+ }
+ } else {
+ Array[Double]()
+ }
+ /*
+ The intercepts are never regularized, so we always center the
mean.
+ */
+ val interceptVector = if (interceptsArray.nonEmpty &&
isMultinomial) {
+ val interceptMean = interceptsArray.sum / numClasses
+ interceptsArray.indices.foreach { i => interceptsArray(i) -=
interceptMean }
+ Vectors.dense(interceptsArray)
+ } else if (interceptsArray.length == 1) {
+ Vectors.dense(interceptsArray)
} else {
- (Vectors.dense(rawCoefficients).compressed, 0.0,
arrayBuilder.result())
+ Vectors.sparse(numCoefficientSets, Seq())
}
+ (coefficientMatrix, interceptVector, arrayBuilder.result())
}
}
if (handlePersistence) instances.unpersist()
- val model = copyValues(new LogisticRegressionModel(uid, coefficients,
intercept))
- val (summaryModel, probabilityColName) =
model.findSummaryModelAndProbabilityCol()
- val logRegSummary = new BinaryLogisticRegressionTrainingSummary(
- summaryModel.transform(dataset),
- probabilityColName,
- $(labelCol),
- $(featuresCol),
- objectiveHistory)
--- End diff --
Change the outer
```scala
val (coefficients, intercept, objectiveHistory) = {
.....
}
```
to
```scala
val (coefficientMatrix, interceptVector, objectiveHistory) = {
.....
}
```
for clarity.
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]