[GitHub] spark pull request: [SPARK-13784][ML] Persistence for RandomForest...

hhbyyh Sun, 03 Apr 2016 09:21:41 -0700

Github user hhbyyh commented on a diff in the pull request:

    https://github.com/apache/spark/pull/12118#discussion_r58312723
  
    --- Diff: 
mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
 ---
    @@ -236,12 +255,66 @@ final class RandomForestClassificationModel 
private[ml] (
       private[ml] def toOld: OldRandomForestModel = {
         new OldRandomForestModel(OldAlgo.Classification, _trees.map(_.toOld))
       }
    +
    +  @Since("2.0.0")
    +  override def write: MLWriter =
    +    new 
RandomForestClassificationModel.RandomForestClassificationModelWriter(this)
     }
     
    -private[ml] object RandomForestClassificationModel {
    +@Since("2.0.0")
    +object RandomForestClassificationModel extends 
MLReadable[RandomForestClassificationModel] {
    +
    +  @Since("2.0.0")
    +  override def read: MLReader[RandomForestClassificationModel] =
    +    new RandomForestClassificationModelReader
    +
    +  @Since("2.0.0")
    +  override def load(path: String): RandomForestClassificationModel = 
super.load(path)
    +
    +  private[RandomForestClassificationModel]
    +  class RandomForestClassificationModelWriter(instance: 
RandomForestClassificationModel)
    +    extends MLWriter {
    +
    +    override protected def saveImpl(path: String): Unit = {
    +      // Note: numTrees is not currently used, but could be nice to store 
for fast querying.
    +      val extraMetadata: JObject = Map(
    +        "numFeatures" -> instance.numFeatures,
    +        "numClasses" -> instance.numClasses,
    +        "numTrees" -> instance.getNumTrees)
    +      EnsembleModelReadWrite.saveImpl(instance, path, sqlContext, 
extraMetadata)
    +    }
    +  }
    +
    +  private class RandomForestClassificationModelReader
    +    extends MLReader[RandomForestClassificationModel] {
    +
    +    /** Checked against metadata when loading model */
    +    private val className = 
classOf[RandomForestClassificationModel].getName
    +    private val treeClassName = 
classOf[DecisionTreeClassificationModel].getName
    +
    +    override def load(path: String): RandomForestClassificationModel = {
    +      implicit val format = DefaultFormats
    +      val (metadata: Metadata, treesData: Array[(Metadata, Node)]) =
    +        EnsembleModelReadWrite.loadImpl(path, sqlContext, className, 
treeClassName)
    +      val numFeatures = (metadata.metadata \ "numFeatures").extract[Int]
    +      val numClasses = (metadata.metadata \ "numClasses").extract[Int]
    +
    --- End diff --
    
    Just IMO, maybe check that numTrees == trees.length when loading, since the 
stored numTrees is redundant information.



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes to enable it, or if the feature is enabled but not working,
please contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] spark pull request: [SPARK-13784][ML] Persistence for RandomForest...

Reply via email to