zhengruifeng commented on a change in pull request #25929: [SPARK-29116][PYTHON][ML] Refactor py classes related to DecisionTree URL: https://github.com/apache/spark/pull/25929#discussion_r332318508
########## File path: python/pyspark/ml/tree.py ########## @@ -0,0 +1,348 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from pyspark import since, keyword_only +from pyspark.ml.param.shared import * +from pyspark.ml.util import * +from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, \ + JavaPredictor, JavaPredictionModel +from pyspark.ml.common import inherit_doc, _java2py, _py2java + + +@inherit_doc +class DecisionTreeModel(JavaPredictionModel): + """ + Abstraction for Decision Tree models. + .. versionadded:: 1.5.0 + """ + + @property + @since("1.5.0") + def numNodes(self): + """Return number of nodes of the decision tree.""" + return self._call_java("numNodes") + + @property + @since("1.5.0") + def depth(self): + """Return depth of the decision tree.""" + return self._call_java("depth") + + @property + @since("2.0.0") + def toDebugString(self): + """Full description of model.""" + return self._call_java("toDebugString") + + @since("3.0.0") + def predictLeaf(self, value): + """ + Predict the indices of the leaves corresponding to the feature vector. + """ + return self._call_java("predictLeaf", value) + + def __repr__(self): + return self._call_java("toString") + + +class DecisionTreeParams(HasCheckpointInterval, HasSeed, HasWeightCol): + """ + Mixin for Decision Tree parameters. + """ + + leafCol = Param(Params._dummy(), "leafCol", "Leaf indices column name. Predicted leaf " + + "index of each instance in each tree by preorder.", + typeConverter=TypeConverters.toString) + + maxDepth = Param(Params._dummy(), "maxDepth", "Maximum depth of the tree. (>= 0) E.g., " + + "depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.", + typeConverter=TypeConverters.toInt) + + maxBins = Param(Params._dummy(), "maxBins", "Max number of bins for discretizing continuous " + + "features. Must be >=2 and >= number of categories for any categorical " + + "feature.", typeConverter=TypeConverters.toInt) + + minInstancesPerNode = Param(Params._dummy(), "minInstancesPerNode", "Minimum number of " + + "instances each child must have after split. If a split causes " + + "the left or right child to have fewer than " + + "minInstancesPerNode, the split will be discarded as invalid. " + + "Should be >= 1.", typeConverter=TypeConverters.toInt) + + minWeightFractionPerNode = Param(Params._dummy(), "minWeightFractionPerNode", "Minimum " + "fraction of the weighted sample count that each child " + "must have after split. If a split causes the fraction " + "of the total weight in the left or right child to be " + "less than minWeightFractionPerNode, the split will be " + "discarded as invalid. Should be in interval [0.0, 0.5).", + typeConverter=TypeConverters.toFloat) + + minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information gain for a split " + + "to be considered at a tree node.", typeConverter=TypeConverters.toFloat) + + maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in MB allocated to " + + "histogram aggregation. If too small, then 1 node will be split per " + + "iteration, and its aggregates may exceed this size.", + typeConverter=TypeConverters.toInt) + + cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the algorithm will pass " + + "trees to executors to match instances with nodes. If true, the " + + "algorithm will cache node IDs for each instance. Caching can speed " + + "up training of deeper trees. Users can set how often should the cache " + + "be checkpointed or disable it by setting checkpointInterval.", + typeConverter=TypeConverters.toBoolean) + + def __init__(self): + super(DecisionTreeParams, self).__init__() + + def setLeafCol(self, value): + """ + Sets the value of :py:attr:`leafCol`. + """ + return self._set(leafCol=value) + + def getLeafCol(self): + """ + Gets the value of leafCol or its default value. + """ + return self.getOrDefault(self.leafCol) + + def getMaxDepth(self): + """ + Gets the value of maxDepth or its default value. + """ + return self.getOrDefault(self.maxDepth) + + def getMaxBins(self): + """ + Gets the value of maxBins or its default value. + """ + return self.getOrDefault(self.maxBins) + + def getMinInstancesPerNode(self): + """ + Gets the value of minInstancesPerNode or its default value. + """ + return self.getOrDefault(self.minInstancesPerNode) + + def getMinWeightFractionPerNode(self): + """ + Gets the value of minWeightFractionPerNode or its default value. + """ + return self.getOrDefault(self.minWeightFractionPerNode) + + def getMinInfoGain(self): + """ + Gets the value of minInfoGain or its default value. + """ + return self.getOrDefault(self.minInfoGain) + + def getMaxMemoryInMB(self): + """ + Gets the value of maxMemoryInMB or its default value. + """ + return self.getOrDefault(self.maxMemoryInMB) + + def getCacheNodeIds(self): + """ + Gets the value of cacheNodeIds or its default value. + """ + return self.getOrDefault(self.cacheNodeIds) + + +@inherit_doc +class TreeEnsembleModel(JavaPredictionModel): Review comment: ditto ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected] With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
