GitHub user davies commented on a diff in the pull request:
https://github.com/apache/spark/pull/2819#discussion_r19435144
--- Diff: python/pyspark/mllib/feature.py ---
@@ -18,59 +18,324 @@
"""
Python package for feature in MLlib.
"""
+import warnings
+
+from py4j.protocol import Py4JJavaError
+from py4j.java_gateway import JavaObject
+
+from pyspark import RDD, SparkContext
from pyspark.serializers import PickleSerializer, AutoBatchedSerializer
-from pyspark.mllib.linalg import _convert_to_vector, _to_java_object_rdd
+from pyspark.mllib.linalg import Vectors, _to_java_object_rdd
+
+__all__ = ['Normalizer', 'StandardScalerModel', 'StandardScaler',
+           'HashingTF', 'IDFModel', 'IDF',
+           'Word2Vec', 'Word2VecModel']
+
+
+# TODO: move these helper functions into utils
+_picklable_classes = [
+    'LinkedList',
+    'SparseVector',
+    'DenseVector',
+    'DenseMatrix',
+    'Rating',
+    'LabeledPoint',
+]
+
+
+def _py2java(sc, a):
+    """ Convert Python object into Java """
+    if isinstance(a, RDD):
+        a = _to_java_object_rdd(a)
+    elif not isinstance(a, (int, long, float, bool, basestring)):
+        bytes = bytearray(PickleSerializer().dumps(a))
+        a = sc._jvm.SerDe.loads(bytes)
+    return a
+
-__all__ = ['Word2Vec', 'Word2VecModel']
+def _java2py(sc, r):
+    if isinstance(r, JavaObject):
+        clsName = r.getClass().getSimpleName()
+        if clsName in ("RDD", "JavaRDD"):
+            if clsName == "RDD":
+                r = r.toJavaRDD()
+            jrdd = sc._jvm.SerDe.javaToPython(r)
+            return RDD(jrdd, sc, AutoBatchedSerializer(PickleSerializer()))
+        elif clsName in _picklable_classes:
+            r = sc._jvm.SerDe.dumps(r)
-class Word2VecModel(object):
+    if isinstance(r, bytearray):
+        r = PickleSerializer().loads(str(r))
+    return r
+
+
+def _callJavaFunc(sc, func, *args):
+    """ Call Java Function
    """
-    class for Word2Vec model
+    args = [_py2java(sc, a) for a in args]
+    return _java2py(sc, func(*args))
+
+
+def _callAPI(sc, name, *args):
+    """ Call API in PythonMLLibAPI
    """
-    def __init__(self, sc, java_model):
+    api = getattr(sc._jvm.PythonMLLibAPI(), name)
+    return _callJavaFunc(sc, api, *args)
+
+
+class VectorTransformer(object):
+    """
+    :: DeveloperApi ::
+    Base class for transformation of a vector or an RDD of vectors.
+    """
+    def transform(self, vector):
        """
-        :param sc: Spark context
-        :param java_model: Handle to Java model object
+        Applies transformation on a vector.
+
+        :param vector: vector to be transformed.
        """
+        raise NotImplementedError
+
+
+class Normalizer(VectorTransformer):
+    """
+    :: Experimental ::
+    Normalizes samples individually to unit L^p^ norm.
+
+    For any 1 <= p < Double.PositiveInfinity, normalizes samples using
+    sum(abs(vector).^p^)^(1/p)^ as the norm.
+
+    For p = Double.PositiveInfinity, max(abs(vector)) will be used as
+    the norm for normalization.
+
+    >>> v = Vectors.dense(range(3))
+    >>> nor = Normalizer(1)
+    >>> nor.transform(v)
+    DenseVector([0.0, 0.3333, 0.6667])
+
+    >>> rdd = sc.parallelize([v])
+    >>> nor.transform(rdd).collect()
+    [DenseVector([0.0, 0.3333, 0.6667])]
+    """
+    def __init__(self, p=2):
+        """
+        :param p: Normalization in L^p^ space, p = 2 by default.
+        """
+        assert p >= 1.0, "p should be greater than or equal to 1.0"
+        self.p = float(p)
+
+    def transform(self, vector):
+        """
+        Applies unit length normalization on a vector.
+
+        :param vector: vector to be normalized.
+        :return: normalized vector. If the norm of the input is zero, it
+                 will return the input vector.
+        """
+        sc = SparkContext._active_spark_context
+        assert sc is not None, "SparkContext should be initialized first"
+        return _callAPI(sc, "normalizeVector", self.p, vector)
+
+
+class JavaModelWrapper(VectorTransformer):
--- End diff ---
I would like to do this refactor after this PR is merged, since other
modules also need updates.
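
For reference, a rough sketch of the direction such a refactor might take, reusing the `_callJavaFunc` helper and the `VectorTransformer` base class from the diff above. This is only an illustration of the idea, not the final API; the `__del__`/`detach` cleanup is an assumption here:

```python
# Hypothetical sketch, not code from this PR: one possible shape for
# JavaModelWrapper, assuming _callJavaFunc and VectorTransformer above.
class JavaModelWrapper(VectorTransformer):
    """
    Wraps a handle to a model object living in the JVM and delegates
    transform() to it through the Py4J gateway.
    """
    def __init__(self, sc, java_model):
        self._sc = sc
        self._java_model = java_model

    def __del__(self):
        # Release the Java-side object when the Python wrapper is
        # garbage collected, so the JVM does not leak model handles.
        self._sc._gateway.detach(self._java_model)

    def transform(self, vector):
        # Convert the argument to Java, invoke the JVM model's
        # transform(), and convert the result back to Python.
        return _callJavaFunc(self._sc, self._java_model.transform, vector)
```

Pulling this pattern into a shared utils module would let StandardScalerModel, IDFModel, Word2VecModel, etc. delegate to the JVM the same way instead of each reimplementing the Py4J plumbing.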