liangz1 commented on a change in pull request #30471:
URL: https://github.com/apache/spark/pull/30471#discussion_r530029579



##########
File path: python/pyspark/ml/tuning.py
##########
@@ -207,6 +210,205 @@ def _to_java_impl(self):
         return java_estimator, java_epms, java_evaluator
 
 
+class _ValidatorSharedReadWrite:
+
+    @staticmethod
+    def saveImpl(path, instance, sc, extraMetadata=None):
+        from pyspark.ml.classification import OneVsRest
+        numParamsNotJson = 0
+        jsonEstimatorParamMaps = []
+        for paramMap in instance.getEstimatorParamMaps():
+            jsonParamMap = []
+            for p, v in paramMap.items():
+                jsonParam = {'parent': p.parent, 'name': p.name}
+                if (isinstance(v, Estimator) and not (
+                        isinstance(v, _ValidatorParams) or
+                        isinstance(v, OneVsRest))
+                    ) or isinstance(v, Transformer) or \

Review comment:
       The validator classes directly take an `Estimator` and an `Evaluator`, and any
`Transformer` will be part of the pipeline `Estimator`. Should the
`Transformer` params then be part of the pipeline params?

##########
File path: python/pyspark/ml/util.py
##########
@@ -554,19 +564,72 @@ def getAndSetParams(instance, metadata):
                 paramValue = metadata['defaultParamMap'][paramName]
                 instance._setDefault(**{paramName: paramValue})
 
+    @staticmethod
+    def isPythonParamsInstance(metadata):
+        return 'language' in metadata['paramMap'] and \
+               metadata['paramMap']['language'].lower() == 'python'
+
     @staticmethod
     def loadParamsInstance(path, sc):
         """
         Load a :py:class:`Params` instance from the given path, and return it.
         This assumes the instance inherits from :py:class:`MLReadable`.
         """
         metadata = DefaultParamsReader.loadMetadata(path, sc)
-        pythonClassName = metadata['class'].replace("org.apache.spark", "pyspark")
+        if DefaultParamsReader.isPythonParamsInstance(metadata):
+            pythonClassName = metadata['class']
+        else:
+            pythonClassName = metadata['class'].replace("org.apache.spark", "pyspark")
         py_type = DefaultParamsReader.__get_class(pythonClassName)
         instance = py_type.load(path)
         return instance
 
 
+class MetaAlgorithmReadWrite:
+
+    @staticmethod
+    def getUidMap(instance):
+        uidList = MetaAlgorithmReadWrite.getUidMapImpl(instance)
+        uidMap = dict(uidList)
+        if len(uidList) != len(uidMap):
+            raise RuntimeError(f'{instance.__class__.__module__}.{instance.__class__.__name__}'
+                               f'.load found a compound estimator with stages with duplicate '
+                               f'UIDs. List of UIDs: {list(uidMap.keys())}.')
+        return uidMap
+
+    @staticmethod
+    def getUidMapImpl(instance):

Review comment:
       Naming it `getUidList` sounds more natural to me, since it returns a list
rather than a map.
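
   For what it's worth, a list-style name would also match the semantics: the
list-then-dict pattern is exactly what makes duplicate detection work, because
`dict()` silently collapses duplicate keys. A self-contained sketch
(`FakeStage` and the function names are hypothetical stand-ins, not the real
PySpark API):

```python
# Sketch (hypothetical names, not the real PySpark API) of why the
# implementation collects a list of (uid, stage) pairs before building the
# dict: dict() silently collapses duplicate UIDs, so the length comparison
# is what detects a compound estimator with duplicate stage UIDs.
class FakeStage:
    def __init__(self, uid):
        self.uid = uid

def get_uid_list(stages):
    # One (uid, stage) pair per stage; duplicates survive in the list.
    return [(stage.uid, stage) for stage in stages]

def get_uid_map(stages):
    uid_list = get_uid_list(stages)
    uid_map = dict(uid_list)  # duplicate UIDs collapse here
    if len(uid_list) != len(uid_map):
        raise RuntimeError('Found a compound estimator with stages with '
                           f'duplicate UIDs: {sorted(uid_map.keys())}')
    return uid_map
```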

##########
File path: python/pyspark/ml/tuning.py
##########
@@ -207,6 +210,205 @@ def _to_java_impl(self):
         return java_estimator, java_epms, java_evaluator
 
 
+class _ValidatorSharedReadWrite:
+
+    @staticmethod
+    def saveImpl(path, instance, sc, extraMetadata=None):
+        from pyspark.ml.classification import OneVsRest
+        numParamsNotJson = 0
+        jsonEstimatorParamMaps = []
+        for paramMap in instance.getEstimatorParamMaps():
+            jsonParamMap = []
+            for p, v in paramMap.items():
+                jsonParam = {'parent': p.parent, 'name': p.name}
+                if (isinstance(v, Estimator) and not (
+                        isinstance(v, _ValidatorParams) or
+                        isinstance(v, OneVsRest))
+                    ) or isinstance(v, Transformer) or \
+                        isinstance(Evaluator):
+                    relative_path = f'epm_{p.name}{numParamsNotJson}'
+                    param_path = os.path.join(path, relative_path)
+                    numParamsNotJson += 1
+                    v.save(param_path)
+                    jsonParam['value'] = relative_path
+                    jsonParam['isJson'] = False
+                elif isinstance(v, MLWritable):
+                    raise RuntimeError(
+                        "ValidatorSharedReadWrite.saveImpl does not handle 
parameters of type: "
+                        "MLWritable that are not 
Estimaor/Evaluator/Transformer, and if parameter is estimator,"
+                        "it cannot be Validator or OneVsRest")

Review comment:
       It would be clearer to create an interface similar to
`DefaultParamsWritable` instead of using
   ~~~
                  if (isinstance(v, Estimator) and not (
                           isinstance(v, _ValidatorParams) or
                           isinstance(v, OneVsRest))
                       ) or isinstance(v, Transformer) or \
                           isinstance(Evaluator):
   ~~~
   since it looks very confusing.

##########
File path: python/pyspark/ml/tuning.py
##########
@@ -207,6 +210,205 @@ def _to_java_impl(self):
         return java_estimator, java_epms, java_evaluator
 
 
+class _ValidatorSharedReadWrite:
+
+    @staticmethod
+    def saveImpl(path, instance, sc, extraMetadata=None):
+        from pyspark.ml.classification import OneVsRest
+        numParamsNotJson = 0
+        jsonEstimatorParamMaps = []
+        for paramMap in instance.getEstimatorParamMaps():
+            jsonParamMap = []

Review comment:
       The variables at L219 and L221 are named like maps but are actually
lists. Can you fix the names?
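
   To illustrate: what gets built is a list of lists of per-param JSON dicts,
so list-style names would match the data. A minimal sketch with hypothetical
names (plain dicts stand in for `Param` objects):

```python
# Minimal sketch (hypothetical names): each "param map" serializes to a
# *list* of per-param JSON dicts, and the outer collection is a list of
# those lists, so list-style names describe the structure accurately.
param_maps = [{'regParam': 0.1}, {'regParam': 0.01}]

json_estimator_param_lists = []          # was: jsonEstimatorParamMaps
for param_map in param_maps:
    json_param_list = [                  # was: jsonParamMap
        {'name': name, 'value': value}
        for name, value in param_map.items()
    ]
    json_estimator_param_lists.append(json_param_list)
```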




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org
