[MINOR] Refactored the Python code to match PEP 8 guidelines and fixed a Keras2DML bug for Python 3.5
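
For reference, the stdout-capture helpers touched by this refactoring (see classloader.py below) are intended to be used as described in their docstrings; a minimal usage sketch, assuming an MLContext instance `ml` and a Script object `script` already exist:

    from systemml import jvm_stdout, set_default_jvm_stdout

    # Flush the driver JVM's stdout by default whenever ml.execute() is invoked
    set_default_jvm_stdout(True, parallel_flush=True)

    # Or capture the JVM stdout only around a specific execute() call
    with jvm_stdout():
        ml.execute(script)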
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/9e7ee19a Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/9e7ee19a Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/9e7ee19a Branch: refs/heads/master Commit: 9e7ee19a45102f7cbb37507da25b1ba0641868fd Parents: ddc3672 Author: Niketan Pansare <[email protected]> Authored: Mon Aug 27 17:26:55 2018 -0700 Committer: Niketan Pansare <[email protected]> Committed: Mon Aug 27 17:34:17 2018 -0700 ---------------------------------------------------------------------- src/main/python/systemml/__init__.py | 4 +- src/main/python/systemml/classloader.py | 90 ++- src/main/python/systemml/converters.py | 170 ++++-- src/main/python/systemml/defmatrix.py | 565 ++++++++++++------- src/main/python/systemml/mlcontext.py | 229 +++++--- src/main/python/systemml/mllearn/__init__.py | 4 +- src/main/python/systemml/mllearn/estimators.py | 357 ++++++++---- src/main/python/systemml/mllearn/keras2caffe.py | 553 +++++++++++------- src/main/python/systemml/random/__init__.py | 6 +- src/main/python/systemml/random/sampling.py | 56 +- 10 files changed, 1332 insertions(+), 702 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/9e7ee19a/src/main/python/systemml/__init__.py ---------------------------------------------------------------------- diff --git a/src/main/python/systemml/__init__.py b/src/main/python/systemml/__init__.py index 04d521bb..74c8e50 100644 --- a/src/main/python/systemml/__init__.py +++ b/src/main/python/systemml/__init__.py @@ -1,4 +1,4 @@ -#------------------------------------------------------------- +# ------------------------------------------------------------- # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. # -#------------------------------------------------------------- +# ------------------------------------------------------------- from .mlcontext import * from .defmatrix import * http://git-wip-us.apache.org/repos/asf/systemml/blob/9e7ee19a/src/main/python/systemml/classloader.py ---------------------------------------------------------------------- diff --git a/src/main/python/systemml/classloader.py b/src/main/python/systemml/classloader.py index 229664d..1efbdd1 100644 --- a/src/main/python/systemml/classloader.py +++ b/src/main/python/systemml/classloader.py @@ -1,4 +1,4 @@ -#------------------------------------------------------------- +# ------------------------------------------------------------- # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file @@ -17,14 +17,21 @@ # specific language governing permissions and limitations # under the License. 
# -#------------------------------------------------------------- +# ------------------------------------------------------------- -__all__ = ['createJavaObject', 'jvm_stdout', 'default_jvm_stdout', 'default_jvm_stdout_parallel_flush', 'set_default_jvm_stdout', 'get_spark_context' ] +__all__ = [ + 'createJavaObject', + 'jvm_stdout', + 'default_jvm_stdout', + 'default_jvm_stdout_parallel_flush', + 'set_default_jvm_stdout', + 'get_spark_context'] import os import numpy as np import pandas as pd -import threading, time +import threading +import time try: import py4j.java_gateway @@ -32,9 +39,12 @@ try: from pyspark import SparkContext from pyspark.sql import SparkSession except ImportError: - raise ImportError('Unable to import `pyspark`. Hint: Make sure you are running with PySpark.') + raise ImportError( + 'Unable to import `pyspark`. Hint: Make sure you are running with PySpark.') _loadedSystemML = False + + def get_spark_context(): """ Internal method to get already initialized SparkContext. Developers should always use @@ -54,10 +64,13 @@ def get_spark_context(): return sc else: raise Exception('Expected spark context to be created.') - + + _in_jvm_stdout = False default_jvm_stdout = True default_jvm_stdout_parallel_flush = True + + def set_default_jvm_stdout(enable, parallel_flush=True): """ This is useful utility method to get the output of the driver JVM from within a Jupyter notebook @@ -66,18 +79,20 @@ def set_default_jvm_stdout(enable, parallel_flush=True): ---------- enable: boolean Should flush the stdout by default when mlcontext.execute is invoked - + parallel_flush: boolean Should flush the stdout in parallel """ global default_jvm_stdout, default_jvm_stdout_parallel_flush default_jvm_stdout = enable default_jvm_stdout_parallel_flush = parallel_flush - + # This is useful utility class to get the output of the driver JVM from within a Jupyter notebook # Example usage: # with jvm_stdout(): # ml.execute(script) + + class jvm_stdout(object): """ This is useful utility class to get the output of the driver JVM from within a Jupyter notebook @@ -87,24 +102,25 @@ class jvm_stdout(object): parallel_flush: boolean Should flush the stdout in parallel """ + def __init__(self, parallel_flush=False): self.util = get_spark_context()._jvm.org.apache.sysml.api.ml.Utils() self.parallel_flush = parallel_flush self.t = threading.Thread(target=self.flush_stdout) self.stop = False - + def flush_stdout(self): - while not self.stop: - time.sleep(1) # flush stdout every 1 second + while not self.stop: + time.sleep(1) # flush stdout every 1 second str = self.util.flushStdOut() if str != '': str = str[:-1] if str.endswith('\n') else str print(str) - + def __enter__(self): global _in_jvm_stdout if _in_jvm_stdout: - # Allow for nested jvm_stdout + # Allow for nested jvm_stdout self.donotRedirect = True else: self.donotRedirect = False @@ -124,32 +140,41 @@ class jvm_stdout(object): _initializedSparkSession = False + + def _createJavaObject(sc, obj_type): # ----------------------------------------------------------------------------------- # Avoids race condition between locking of metastore_db of Scala SparkSession and PySpark SparkSession. - # This is done at toDF() rather than import level to avoid creation of SparkSession in worker processes. + # This is done at toDF() rather than import level to avoid creation of + # SparkSession in worker processes. 
global _initializedSparkSession if not _initializedSparkSession: _initializedSparkSession = True - SparkSession.builder.getOrCreate().createDataFrame(pd.DataFrame(np.array([[1,2],[3,4]]))) + SparkSession.builder.getOrCreate().createDataFrame( + pd.DataFrame(np.array([[1, 2], [3, 4]]))) # ----------------------------------------------------------------------------------- if obj_type == 'mlcontext': return sc._jvm.org.apache.sysml.api.mlcontext.MLContext(sc._jsc) elif obj_type == 'dummy': return sc._jvm.org.apache.sysml.utils.SystemMLLoaderUtils() else: - raise ValueError('Incorrect usage: supported values: mlcontext or dummy') + raise ValueError( + 'Incorrect usage: supported values: mlcontext or dummy') + def _getJarFileNames(sc): - import imp, fnmatch + import imp + import fnmatch jar_file_name = '_ignore.jar' java_dir = os.path.join(imp.find_module("systemml")[1], "systemml-java") jar_file_names = [] for file in os.listdir(java_dir): - if fnmatch.fnmatch(file, 'systemml-*-SNAPSHOT.jar') or fnmatch.fnmatch(file, 'systemml-*.jar'): - jar_file_names = jar_file_names + [ os.path.join(java_dir, file) ] + if fnmatch.fnmatch( + file, 'systemml-*-SNAPSHOT.jar') or fnmatch.fnmatch(file, 'systemml-*.jar'): + jar_file_names = jar_file_names + [os.path.join(java_dir, file)] return jar_file_names + def _getLoaderInstance(sc, jar_file_name, className, hint): err_msg = 'Unable to load systemml-*.jar into current pyspark session.' if os.path.isfile(jar_file_name): @@ -158,16 +183,21 @@ def _getLoaderInstance(sc, jar_file_name, className, hint): url_class = sc._jvm.java.net.URL jar_file_url_arr = sc._gateway.new_array(url_class, 1) jar_file_url_arr[0] = jar_file_url - url_class_loader = sc._jvm.java.net.URLClassLoader(jar_file_url_arr, sc._jsc.getClass().getClassLoader()) + url_class_loader = sc._jvm.java.net.URLClassLoader( + jar_file_url_arr, sc._jsc.getClass().getClassLoader()) c1 = sc._jvm.java.lang.Class.forName(className, True, url_class_loader) return c1.newInstance() else: - raise ImportError(err_msg + ' Hint: Download the jar from http://systemml.apache.org/download and ' + hint ) + raise ImportError( + err_msg + + ' Hint: Download the jar from http://systemml.apache.org/download and ' + + hint) + def createJavaObject(sc, obj_type): """ Performs appropriate check if SystemML.jar is available and returns the handle to MLContext object on JVM - + Parameters ---------- sc: SparkContext @@ -182,13 +212,23 @@ def createJavaObject(sc, obj_type): hint = 'Provide the following argument to pyspark: --driver-class-path ' jar_file_names = _getJarFileNames(sc) if len(jar_file_names) != 2: - raise ImportError('Expected only systemml and systemml-extra jars, but found ' + str(jar_file_names)) + raise ImportError( + 'Expected only systemml and systemml-extra jars, but found ' + + str(jar_file_names)) for jar_file_name in jar_file_names: if 'extra' in jar_file_name: - x = _getLoaderInstance(sc, jar_file_name, 'org.apache.sysml.api.dl.Caffe2DMLLoader', hint + 'systemml-*-extra.jar') + x = _getLoaderInstance( + sc, + jar_file_name, + 'org.apache.sysml.api.dl.Caffe2DMLLoader', + hint + 'systemml-*-extra.jar') x.loadCaffe2DML(jar_file_name) else: - x = _getLoaderInstance(sc, jar_file_name, 'org.apache.sysml.utils.SystemMLLoaderUtils', hint + 'systemml-*.jar') + x = _getLoaderInstance( + sc, + jar_file_name, + 'org.apache.sysml.utils.SystemMLLoaderUtils', + hint + 'systemml-*.jar') x.loadSystemML(jar_file_name) try: ret = _createJavaObject(sc, obj_type) 
http://git-wip-us.apache.org/repos/asf/systemml/blob/9e7ee19a/src/main/python/systemml/converters.py ---------------------------------------------------------------------- diff --git a/src/main/python/systemml/converters.py b/src/main/python/systemml/converters.py index aaa9f88..5954a30 100644 --- a/src/main/python/systemml/converters.py +++ b/src/main/python/systemml/converters.py @@ -1,4 +1,4 @@ -#------------------------------------------------------------- +# ------------------------------------------------------------- # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file @@ -17,9 +17,19 @@ # specific language governing permissions and limitations # under the License. # -#------------------------------------------------------------- - -__all__ = [ 'getNumCols', 'convertToMatrixBlock', 'convert_caffemodel', 'convert_lmdb_to_jpeg', 'convertToNumPyArr', 'convertToPandasDF', 'SUPPORTED_TYPES' , 'convertToLabeledDF', 'convertImageToNumPyArr', 'getDatasetMean'] +# ------------------------------------------------------------- + +__all__ = [ + 'getNumCols', + 'convertToMatrixBlock', + 'convert_caffemodel', + 'convert_lmdb_to_jpeg', + 'convertToNumPyArr', + 'convertToPandasDF', + 'SUPPORTED_TYPES', + 'convertToLabeledDF', + 'convertImageToNumPyArr', + 'getDatasetMean'] import numpy as np import pandas as pd @@ -32,7 +42,8 @@ from .classloader import * SUPPORTED_TYPES = (np.ndarray, pd.DataFrame, spmatrix) -DATASET_MEAN = {'VGG_ILSVRC_19_2014':[103.939, 116.779, 123.68]} +DATASET_MEAN = {'VGG_ILSVRC_19_2014': [103.939, 116.779, 123.68]} + def getNumCols(numPyArr): if numPyArr.ndim == 1: @@ -40,9 +51,11 @@ def getNumCols(numPyArr): else: return numPyArr.shape[1] + def get_pretty_str(key, value): return '\t"' + key + '": ' + str(value) + ',\n' + def save_tensor_csv(tensor, file_path, shouldTranspose): w = w.reshape(w.shape[0], -1) if shouldTranspose: @@ -53,9 +66,12 @@ def save_tensor_csv(tensor, file_path, shouldTranspose): file.write(get_pretty_str('rows', w.shape[0])) file.write(get_pretty_str('cols', w.shape[1])) file.write(get_pretty_str('nnz', np.count_nonzero(w))) - file.write('\t"format": "csv",\n\t"description": {\n\t\t"author": "SystemML"\n\t}\n}\n') + file.write( + '\t"format": "csv",\n\t"description": {\n\t\t"author": "SystemML"\n\t}\n}\n') -def convert_caffemodel(sc, deploy_file, caffemodel_file, output_dir, format="binary", is_caffe_installed=False): + +def convert_caffemodel(sc, deploy_file, caffemodel_file, + output_dir, format="binary", is_caffe_installed=False): """ Saves the weights and bias in the caffemodel file to output_dir in the specified format. This method does not requires caffe to be installed. @@ -82,7 +98,10 @@ def convert_caffemodel(sc, deploy_file, caffemodel_file, output_dir, format="bin """ if is_caffe_installed: if format != 'csv': - raise ValueError('The format ' + str(format) + ' is not supported when caffe is installed. Hint: Please specify format=csv') + raise ValueError( + 'The format ' + + str(format) + + ' is not supported when caffe is installed. 
Hint: Please specify format=csv') import caffe net = caffe.Net(deploy_file, caffemodel_file, caffe.TEST) for layerName in net.params.keys(): @@ -91,21 +110,48 @@ def convert_caffemodel(sc, deploy_file, caffemodel_file, output_dir, format="bin continue elif num_parameters == 2: # Weights and Biases - layerType = net.layers[list(net._layer_names).index(layerName)].type + layerType = net.layers[list( + net._layer_names).index(layerName)].type shouldTranspose = True if layerType == 'InnerProduct' else False - save_tensor_csv(net.params[layerName][0].data, os.path.join(output_dir, layerName + '_weight.mtx'), shouldTranspose) - save_tensor_csv(net.params[layerName][1].data, os.path.join(output_dir, layerName + '_bias.mtx'), shouldTranspose) + save_tensor_csv( + net.params[layerName][0].data, + os.path.join( + output_dir, + layerName + + '_weight.mtx'), + shouldTranspose) + save_tensor_csv( + net.params[layerName][1].data, + os.path.join( + output_dir, + layerName + + '_bias.mtx'), + shouldTranspose) elif num_parameters == 1: # Only Weight - layerType = net.layers[list(net._layer_names).index(layerName)].type + layerType = net.layers[list( + net._layer_names).index(layerName)].type shouldTranspose = True if layerType == 'InnerProduct' else False - save_tensor_csv(net.params[layerName][0].data, os.path.join(output_dir, layerName + '_weight.mtx'), shouldTranspose) + save_tensor_csv( + net.params[layerName][0].data, + os.path.join( + output_dir, + layerName + + '_weight.mtx'), + shouldTranspose) else: - raise ValueError('Unsupported number of parameters:' + str(num_parameters)) + raise ValueError( + 'Unsupported number of parameters:' + + str(num_parameters)) else: createJavaObject(sc, 'dummy') utilObj = sc._jvm.org.apache.sysml.api.dl.Utils() - utilObj.saveCaffeModelFile(sc._jsc, deploy_file, caffemodel_file, output_dir, format) + utilObj.saveCaffeModelFile( + sc._jsc, + deploy_file, + caffemodel_file, + output_dir, + format) def convert_lmdb_to_jpeg(lmdb_img_file, output_dir): @@ -121,7 +167,9 @@ def convert_lmdb_to_jpeg(lmdb_img_file, output_dir): output_dir: string Output directory for images (local filesystem) """ - import lmdb, caffe, cv2 + import lmdb + import caffe + import cv2 lmdb_cursor = lmdb.open(lmdb_file, readonly=True).begin().cursor() datum = caffe.proto.caffe_pb2.Datum() i = 1 @@ -129,7 +177,7 @@ def convert_lmdb_to_jpeg(lmdb_img_file, output_dir): datum.ParseFromString(value) data = caffe.io.datum_to_array(datum) output_file_path = os.path.join(output_dir, 'file_' + str(i) + '.jpg') - image = np.transpose(data, (1,2,0)) # CxHxW to HxWxC in cv2 + image = np.transpose(data, (1, 2, 0)) # CxHxW to HxWxC in cv2 cv2.imwrite(output_file_path, image) i = i + 1 @@ -153,8 +201,9 @@ def convertToLabeledDF(sparkSession, X, y=None): else: return out.select('features') + def _convertSPMatrixToMB(sc, src): - src = coo_matrix(src, dtype=np.float64) + src = coo_matrix(src, dtype=np.float64) numRows = src.shape[0] numCols = src.shape[1] data = src.data @@ -165,7 +214,9 @@ def _convertSPMatrixToMB(sc, src): buf2 = bytearray(row.tostring()) buf3 = bytearray(col.tostring()) createJavaObject(sc, 'dummy') - return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertSciPyCOOToMB(buf1, buf2, buf3, numRows, numCols, nnz) + return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertSciPyCOOToMB( + buf1, buf2, buf3, numRows, numCols, nnz) + def _convertDenseMatrixToMB(sc, src): numCols = getNumCols(src) @@ -173,15 +224,25 @@ def 
_convertDenseMatrixToMB(sc, src): arr = src.ravel().astype(np.float64) buf = bytearray(arr.tostring()) createJavaObject(sc, 'dummy') - return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertPy4JArrayToMB(buf, numRows, numCols) + return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertPy4JArrayToMB( + buf, numRows, numCols) -def _copyRowBlock(i, sc, ret, src, numRowsPerBlock, rlen, clen): + +def _copyRowBlock(i, sc, ret, src, numRowsPerBlock, rlen, clen): rowIndex = int(i / numRowsPerBlock) - tmp = src[i:min(i+numRowsPerBlock, rlen),] - mb = _convertSPMatrixToMB(sc, tmp) if isinstance(src, spmatrix) else _convertDenseMatrixToMB(sc, tmp) - sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.copyRowBlocks(mb, rowIndex, ret, numRowsPerBlock, rlen, clen) + tmp = src[i:min(i + numRowsPerBlock, rlen), ] + mb = _convertSPMatrixToMB( + sc, + tmp) if isinstance( + src, + spmatrix) else _convertDenseMatrixToMB( + sc, + tmp) + sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.copyRowBlocks( + mb, rowIndex, ret, numRowsPerBlock, rlen, clen) return i + def convertToMatrixBlock(sc, src, maxSizeBlockInMB=8): if not isinstance(sc, SparkContext): raise TypeError('sc needs to be of type SparkContext') @@ -189,33 +250,49 @@ def convertToMatrixBlock(sc, src, maxSizeBlockInMB=8): src = np.asarray(src, dtype=np.float64) if not isSparse else src if len(src.shape) != 2: src_type = str(type(src).__name__) - raise TypeError('Expected 2-dimensional ' + src_type + ', instead passed ' + str(len(src.shape)) + '-dimensional ' + src_type) + raise TypeError('Expected 2-dimensional ' + + src_type + + ', instead passed ' + + str(len(src.shape)) + + '-dimensional ' + + src_type) # Ignoring sparsity for computing numRowsPerBlock for now - numRowsPerBlock = int(math.ceil((maxSizeBlockInMB*1000000) / (src.shape[1]*8))) + numRowsPerBlock = int( + math.ceil((maxSizeBlockInMB * 1000000) / (src.shape[1] * 8))) multiBlockTransfer = False if numRowsPerBlock >= src.shape[0] else True if not multiBlockTransfer: - return _convertSPMatrixToMB(sc, src) if isSparse else _convertDenseMatrixToMB(sc, src) + return _convertSPMatrixToMB( + sc, src) if isSparse else _convertDenseMatrixToMB(sc, src) else: # Since coo_matrix does not have range indexing src = csr_matrix(src) if isSparse else src rlen = int(src.shape[0]) clen = int(src.shape[1]) - ret = sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.allocateDenseOrSparse(rlen, clen, isSparse) - [ _copyRowBlock(i, sc, ret, src, numRowsPerBlock, rlen, clen) for i in range(0, src.shape[0], numRowsPerBlock) ] - sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.postProcessAfterCopying(ret) + ret = sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.allocateDenseOrSparse( + rlen, clen, isSparse) + [_copyRowBlock(i, sc, ret, src, numRowsPerBlock, rlen, clen) + for i in range(0, src.shape[0], numRowsPerBlock)] + sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.postProcessAfterCopying( + ret) return ret + def convertToNumPyArr(sc, mb): if isinstance(sc, SparkContext): numRows = mb.getNumRows() numCols = mb.getNumColumns() createJavaObject(sc, 'dummy') - buf = sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertMBtoPy4JDenseArr(mb) - return np.frombuffer(buf, count=numRows*numCols, dtype=np.float64).reshape((numRows, numCols)) + 
buf = sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertMBtoPy4JDenseArr( + mb) + return np.frombuffer(buf, count=numRows * numCols, + dtype=np.float64).reshape((numRows, numCols)) else: - raise TypeError('sc needs to be of type SparkContext') # TODO: We can generalize this by creating py4j gateway ourselves + # TODO: We can generalize this by creating py4j gateway ourselves + raise TypeError('sc needs to be of type SparkContext') # Returns the mean of a model if defined otherwise None + + def getDatasetMean(dataset_name): """ Parameters @@ -230,7 +307,7 @@ def getDatasetMean(dataset_name): try: mean = DATASET_MEAN[dataset_name.upper()] - except: + except BaseException: mean = None return mean @@ -238,14 +315,15 @@ def getDatasetMean(dataset_name): # Example usage: convertImageToNumPyArr(im, img_shape=(3, 224, 224), add_rotated_images=True, add_mirrored_images=True) # The above call returns a numpy array of shape (6, 50176) in NCHW format def convertImageToNumPyArr(im, img_shape=None, add_rotated_images=False, add_mirrored_images=False, - color_mode = 'RGB', mean=None): + color_mode='RGB', mean=None): - ## Input Parameters + # Input Parameters # color_mode: In case of VGG models which expect image data in BGR format instead of RGB for other most models, # color_mode parameter is used to process image data in BGR format. - # mean: mean value is used to subtract from input data from every pixel value. By default value is None, so mean value not subtracted. + # mean: mean value is used to subtract from input data from every pixel + # value. By default value is None, so mean value not subtracted. if img_shape is not None: num_channels = img_shape[0] @@ -275,7 +353,7 @@ def convertImageToNumPyArr(im, img_shape=None, add_rotated_images=False, add_mir # RGB -> BGR if color_mode == 'BGR': - im = im[...,::-1] + im = im[..., ::-1] # Subtract Mean if mean is not None: @@ -288,13 +366,23 @@ def convertImageToNumPyArr(im, img_shape=None, add_rotated_images=False, add_mir ret = _im2NumPy(im) if add_rotated_images: - ret = np.vstack((ret, _im2NumPy(im.rotate(90)), _im2NumPy(im.rotate(180)), _im2NumPy(im.rotate(270)) )) + ret = np.vstack( + (ret, _im2NumPy( + im.rotate(90)), _im2NumPy( + im.rotate(180)), _im2NumPy( + im.rotate(270)))) if add_mirrored_images: - ret = np.vstack((ret, _im2NumPy(im.transpose(Image.FLIP_LEFT_RIGHT)), _im2NumPy(im.transpose(Image.FLIP_TOP_BOTTOM)))) + ret = np.vstack( + (ret, _im2NumPy( + im.transpose( + Image.FLIP_LEFT_RIGHT)), _im2NumPy( + im.transpose( + Image.FLIP_TOP_BOTTOM)))) return ret def convertToPandasDF(X): if not isinstance(X, pd.DataFrame): - return pd.DataFrame(X, columns=['C' + str(i) for i in range(getNumCols(X))]) + return pd.DataFrame(X, columns=['C' + str(i) + for i in range(getNumCols(X))]) return X http://git-wip-us.apache.org/repos/asf/systemml/blob/9e7ee19a/src/main/python/systemml/defmatrix.py ---------------------------------------------------------------------- diff --git a/src/main/python/systemml/defmatrix.py b/src/main/python/systemml/defmatrix.py index c177a05..cbdec4d 100644 --- a/src/main/python/systemml/defmatrix.py +++ b/src/main/python/systemml/defmatrix.py @@ -1,4 +1,4 @@ -#------------------------------------------------------------- +# ------------------------------------------------------------- # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. 
See the NOTICE file @@ -17,9 +17,19 @@ # specific language governing permissions and limitations # under the License. # -#------------------------------------------------------------- - -__all__ = [ 'setSparkContext', 'matrix', 'eval', 'solve', 'DMLOp', 'set_lazy', 'debug_array_conversion', 'load', 'full', 'seq' ] +# ------------------------------------------------------------- + +__all__ = [ + 'setSparkContext', + 'matrix', + 'eval', + 'solve', + 'DMLOp', + 'set_lazy', + 'debug_array_conversion', + 'load', + 'full', + 'seq'] import numpy as np import pandas as pd @@ -31,11 +41,13 @@ try: from pyspark.sql import DataFrame, SparkSession import pyspark.mllib.common except ImportError: - raise ImportError('Unable to import `pyspark`. Hint: Make sure you are running with PySpark.') + raise ImportError( + 'Unable to import `pyspark`. Hint: Make sure you are running with PySpark.') from . import MLContext, pydml, _java2py, Matrix from .converters import * + def setSparkContext(sc): """ Before using the matrix, the user needs to invoke this function if SparkContext is not previously created in the session. @@ -55,43 +67,48 @@ def check_MLContext(): if SparkContext._active_spark_context is not None: setSparkContext(SparkContext._active_spark_context) else: - raise Exception('Expected setSparkContext(sc) to be called, where sc is active SparkContext.') + raise Exception( + 'Expected setSparkContext(sc) to be called, where sc is active SparkContext.') + +########################## AST related operations ######################## -########################## AST related operations ################################## class DMLOp(object): """ Represents an intermediate node of Abstract syntax tree created to generate the PyDML script """ + def __init__(self, inputs, dml=None): self.inputs = inputs self.dml = dml self.ID = None self.depth = 1 for m in self.inputs: - m.referenced = m.referenced + [ self ] + m.referenced = m.referenced + [self] if isinstance(m, matrix) and m.op is not None: self.depth = max(self.depth, m.op.depth + 1) MAX_DEPTH = 0 - + def _visit(self, execute=True): matrix.dml = matrix.dml + self.dml def _print_ast(self, numSpaces): ret = [] for m in self.inputs: - ret = [ m._print_ast(numSpaces+2) ] + ret = [m._print_ast(numSpaces + 2)] return ''.join(ret) + # Special object used internally to specify the placeholder which will be replaced by output ID # This helps to provide dml containing output ID in construct_intermediate_node OUTPUT_ID = '$$OutputID$$' + def set_lazy(isLazy): """ This method allows users to set whether the matrix operations should be executed in lazy manner. - + Parameters ---------- isLazy: True if matrix operations should be evaluated in lazy manner. @@ -100,7 +117,8 @@ def set_lazy(isLazy): DMLOp.MAX_DEPTH = 0 else: DMLOp.MAX_DEPTH = 1 - + + def construct_intermediate_node(inputs, dml): """ Convenient utility to create an intermediate node of AST. 
@@ -112,11 +130,12 @@ def construct_intermediate_node(inputs, dml): """ dmlOp = DMLOp(inputs) out = matrix(None, op=dmlOp) - dmlOp.dml = [out.ID if x==OUTPUT_ID else x for x in dml] + dmlOp.dml = [out.ID if x == OUTPUT_ID else x for x in dml] if DMLOp.MAX_DEPTH > 0 and out.op.depth >= DMLOp.MAX_DEPTH: out.eval() return out + def load(file, format='csv'): """ Allows user to load a matrix from filesystem @@ -126,7 +145,9 @@ def load(file, format='csv'): file: filepath format: can be csv, text or binary or mm """ - return construct_intermediate_node([], [OUTPUT_ID, ' = load(\"', file, '\", format=\"', format, '\")\n']) + return construct_intermediate_node( + [], [OUTPUT_ID, ' = load(\"', file, '\", format=\"', format, '\")\n']) + def full(shape, fill_value): """ @@ -137,7 +158,9 @@ def full(shape, fill_value): shape: tuple of length 2 fill_value: float or int """ - return construct_intermediate_node([], [OUTPUT_ID, ' = full(', str(fill_value), ', rows=', str(shape[0]), ', cols=', str(shape[1]), ')\n']) + return construct_intermediate_node([], [OUTPUT_ID, ' = full(', str( + fill_value), ', rows=', str(shape[0]), ', cols=', str(shape[1]), ')\n']) + def reset(): """ @@ -150,6 +173,7 @@ def reset(): matrix.dml = [] matrix.script = pydml('') + def perform_dfs(outputs, execute): """ Traverses the forest of nodes rooted at outputs nodes and returns the DML script to execute @@ -167,8 +191,9 @@ def perform_dfs(outputs, execute): def _log_base(val, base): if not isinstance(val, str): raise ValueError('The val to _log_base should be of type string') - return '(log(' + val + ')/log(' + str(base) + '))' - + return '(log(' + val + ')/log(' + str(base) + '))' + + def _matricize(lhs, inputs): """ Utility fn to convert the supported types to matrix class or to string (if float or int) @@ -184,7 +209,8 @@ def _matricize(lhs, inputs): else: raise TypeError('Incorrect type') return lhsStr, inputs - + + def binary_op(lhs, rhs, opStr): """ Common function called by all the binary operators in matrix class @@ -192,7 +218,9 @@ def binary_op(lhs, rhs, opStr): inputs = [] lhsStr, inputs = _matricize(lhs, inputs) rhsStr, inputs = _matricize(rhs, inputs) - return construct_intermediate_node(inputs, [OUTPUT_ID, ' = ', lhsStr, opStr, rhsStr, '\n']) + return construct_intermediate_node( + inputs, [OUTPUT_ID, ' = ', lhsStr, opStr, rhsStr, '\n']) + def binaryMatrixFunction(X, Y, fnName): """ @@ -201,7 +229,9 @@ def binaryMatrixFunction(X, Y, fnName): inputs = [] lhsStr, inputs = _matricize(X, inputs) rhsStr, inputs = _matricize(Y, inputs) - return construct_intermediate_node(inputs, [OUTPUT_ID, ' = ', fnName,'(', lhsStr, ', ', rhsStr, ')\n']) + return construct_intermediate_node( + inputs, [OUTPUT_ID, ' = ', fnName, '(', lhsStr, ', ', rhsStr, ')\n']) + def unaryMatrixFunction(X, fnName): """ @@ -209,14 +239,16 @@ def unaryMatrixFunction(X, fnName): """ inputs = [] lhsStr, inputs = _matricize(X, inputs) - return construct_intermediate_node(inputs, [OUTPUT_ID, ' = ', fnName,'(', lhsStr, ')\n']) + return construct_intermediate_node( + inputs, [OUTPUT_ID, ' = ', fnName, '(', lhsStr, ')\n']) + def seq(start=None, stop=None, step=1): """ Creates a single column vector with values starting from <start>, to <stop>, in increments of <step>. Note: Unlike Numpy's arange which returns a row-vector, this returns a column vector. Also, Unlike Numpy's arange which doesnot include stop, this method includes stop in the interval. 
- + Parameters ---------- start: int or float [Optional: default = 0] @@ -228,9 +260,12 @@ def seq(start=None, stop=None, step=1): elif start is not None and stop is None: stop = start start = 0 - return construct_intermediate_node([], [OUTPUT_ID, ' = seq(', str(start), ',', str(stop), ',', str(step), ')\n']) - + return construct_intermediate_node( + [], [OUTPUT_ID, ' = seq(', str(start), ',', str(stop), ',', str(step), ')\n']) + # utility function that converts 1:3 into DML string + + def convert_seq_to_dml(s): ret = [] if s is None: @@ -239,31 +274,36 @@ def convert_seq_to_dml(s): if s.step is not None: raise ValueError('Slicing with step is not supported.') if s.start is None: - ret = ret + [ '0 : ' ] + ret = ret + ['0 : '] else: - ret = ret + [ getValue(s.start), ':' ] + ret = ret + [getValue(s.start), ':'] if s.start is None: - ret = ret + [ '' ] + ret = ret + [''] else: - ret = ret + [ getValue(s.stop) ] + ret = ret + [getValue(s.stop)] else: - ret = ret + [ getValue(s) ] + ret = ret + [getValue(s)] return ''.join(ret) # utility function that converts index (such as [1, 2:3]) into DML string + + def getIndexingDML(index): - ret = [ '[' ] + ret = ['['] if isinstance(index, tuple) and len(index) == 1: - ret = ret + [ convert_seq_to_dml(index[0]), ',' ] + ret = ret + [convert_seq_to_dml(index[0]), ','] elif isinstance(index, tuple) and len(index) == 2: - ret = ret + [ convert_seq_to_dml(index[0]), ',', convert_seq_to_dml(index[1]) ] + ret = ret + [convert_seq_to_dml(index[0]), + ',', convert_seq_to_dml(index[1])] else: - raise TypeError('matrix indexes can only be tuple of length 2. For example: m[1,1], m[0:1,], m[:, 0:1]') - return ret + [ ']' ] + raise TypeError( + 'matrix indexes can only be tuple of length 2. For example: m[1,1], m[0:1,], m[:, 0:1]') + return ret + [']'] + def convert_outputs_to_list(outputs): if isinstance(outputs, matrix): - return [ outputs ] + return [outputs] elif isinstance(outputs, list): for o in outputs: if not isinstance(o, matrix): @@ -272,10 +312,11 @@ def convert_outputs_to_list(outputs): else: raise TypeError('Only matrix or list of matrix allowed') + def reset_output_flag(outputs): for m in outputs: m.output = False - + ############################################################################### @@ -309,6 +350,7 @@ def solve(A, b): """ return binaryMatrixFunction(A, b, 'solve') + def eval(outputs, execute=True): """ Executes the unevaluated DML script and computes the matrices specified by outputs. @@ -333,35 +375,37 @@ def eval(outputs, execute=True): def debug_array_conversion(throwError): matrix.THROW_ARRAY_CONVERSION_ERROR = throwError - + + def _get_new_var_id(): matrix.systemmlVarID += 1 return 'mVar' + str(matrix.systemmlVarID) ############################################################################### + class matrix(object): """ matrix class is a python wrapper that implements basic matrix operators, matrix functions as well as converters to common Python types (for example: Numpy arrays, PySpark DataFrame - and Pandas DataFrame). - + and Pandas DataFrame). + The operators supported are: - + 1. Arithmetic operators: +, -, *, /, //, %, ** as well as dot (i.e. matrix multiplication) 2. Indexing in the matrix 3. Relational/Boolean operators: <, <=, >, >=, ==, !=, &, | - + In addition, following functions are supported for matrix: - + 1. transpose 2. Aggregation functions: sum, mean, var, sd, max, min, argmin, argmax, cumsum 3. 
Global statistical built-In functions: exp, log, abs, sqrt, round, floor, ceil, ceiling, sin, cos, tan, asin, acos, atan, sign, solve - - For all the above functions, we always return a two dimensional matrix, especially for aggregation functions with axis. + + For all the above functions, we always return a two dimensional matrix, especially for aggregation functions with axis. For example: Assuming m1 is a matrix of (3, n), NumPy returns a 1d vector of dimension (3,) for operation m1.sum(axis=1) whereas SystemML returns a 2d matrix of dimension (3, 1). - + Note: an evaluated matrix contains a data field computed by eval method as DataFrame or NumPy array. Examples @@ -396,9 +440,9 @@ class matrix(object): array([[-60.], [-60.], [-60.]]) - + Design Decisions: - + 1. Until eval() method is invoked, we create an AST (not exposed to the user) that consist of unevaluated operations and data required by those operations. As an anology, a spark user can treat eval() method similar to calling RDD.persist() followed by RDD.count(). 2. The AST consist of two kinds of nodes: either of type matrix or of type DMLOp. @@ -408,7 +452,7 @@ class matrix(object): If not evaluated, the attribute 'op' which refers to one of the intermediate node of AST and if of type DMLOp. In this case, the attribute 'data' is set to None. 4. DMLOp has an attribute 'inputs' which contains list of matrix objects or DMLOp. 5. To simplify the traversal, every matrix object is considered immutable and an matrix operations creates a new matrix object. - As an example: + As an example: `m1 = sml.matrix(np.ones((3,3)))` creates a matrix object backed by 'data=(np.ones((3,3))'. `m1 = m1 * 2` will create a new matrix object which is now backed by 'op=DMLOp( ... )' whose input is earlier created matrix object. 6. Left indexing (implemented in __setitem__ method) is a special case, where Python expects the existing object to be mutated. @@ -417,7 +461,7 @@ class matrix(object): left-indexed-matrix = new-deep-copied-matrix left-indexed-matrix[index] = value 7. Please use m.print_ast() and/or type `m` for debugging. Here is a sample session: - + >>> npm = np.ones((3,3)) >>> m1 = sml.matrix(npm + 3) >>> m2 = sml.matrix(npm + 5) @@ -430,14 +474,16 @@ class matrix(object): >>> m3.print_ast() - [mVar3] (op). - [mVar1] (data). - - [mVar2] (data). - + - [mVar2] (data). + """ - # Global variable that is used to keep track of intermediate matrix variables in the DML script + # Global variable that is used to keep track of intermediate matrix + # variables in the DML script systemmlVarID = 0 # Since joining of string is expensive operation, we collect the set of strings into list and then join - # them before execution: See matrix.script.scriptString = ''.join(matrix.dml) in eval() method + # them before execution: See matrix.script.scriptString = + # ''.join(matrix.dml) in eval() method dml = [] # Represents MLContext's script object @@ -449,7 +495,7 @@ class matrix(object): # Contains list of nodes visited in Abstract Syntax Tree. This helps to avoid computation of matrix objects # that have been previously evaluated. 
visited = [] - + def __init__(self, data, op=None): """ Constructs a lazy matrix @@ -464,13 +510,15 @@ class matrix(object): self.output = False self.ID = _get_new_var_id() self.referenced = [] - # op refers to the node of Abstract Syntax Tree created internally for lazy evaluation + # op refers to the node of Abstract Syntax Tree created internally for + # lazy evaluation self.op = op self.eval_data = data self._shape = None if isinstance(data, SUPPORTED_TYPES): self._shape = data.shape - if not (isinstance(data, SUPPORTED_TYPES) or hasattr(data, '_jdf') or (data is None and op is not None)): + if not (isinstance(data, SUPPORTED_TYPES) or hasattr( + data, '_jdf') or (data is None and op is not None)): raise TypeError('Unsupported input type') def eval(self): @@ -478,14 +526,15 @@ class matrix(object): This is a convenience function that calls the global eval method """ eval([self]) - + def toPandas(self): """ This is a convenience function that calls the global eval method and then converts the matrix object into Pandas DataFrame. """ self.eval() if isinstance(self.eval_data, py4j.java_gateway.JavaObject): - self.eval_data = _java2py(SparkContext._active_spark_context, self.eval_data) + self.eval_data = _java2py( + SparkContext._active_spark_context, self.eval_data) if isinstance(self.eval_data, Matrix): self.eval_data = self.eval_data.toNumPy() self.eval_data = convertToPandasDF(self.eval_data) @@ -497,7 +546,8 @@ class matrix(object): """ self.eval() if isinstance(self.eval_data, py4j.java_gateway.JavaObject): - self.eval_data = _java2py(SparkContext._active_spark_context, self.eval_data) + self.eval_data = _java2py( + SparkContext._active_spark_context, self.eval_data) if isinstance(self.eval_data, Matrix): self.eval_data = self.eval_data.toNumPy() return self.eval_data @@ -519,7 +569,8 @@ class matrix(object): if isinstance(self.eval_data, DataFrame): return self.eval_data if isinstance(self.eval_data, py4j.java_gateway.JavaObject): - self.eval_data = _java2py(SparkContext._active_spark_context, self.eval_data) + self.eval_data = _java2py( + SparkContext._active_spark_context, self.eval_data) if isinstance(self.eval_data, Matrix): self.eval_data = self.eval_data.toDF() return self.eval_data @@ -529,36 +580,40 @@ class matrix(object): def save(self, file, format='csv'): """ Allows user to save a matrix to filesystem - + Parameters ---------- file: filepath format: can be csv, text or binary or mm """ - tmp = construct_intermediate_node([self], ['save(', self.ID , ',\"', file, '\", format=\"', format, '\")\n']) - construct_intermediate_node([tmp], [OUTPUT_ID, ' = full(0, rows=1, cols=1)\n']).eval() - + tmp = construct_intermediate_node( + [self], ['save(', self.ID, ',\"', file, '\", format=\"', format, '\")\n']) + construct_intermediate_node( + [tmp], [OUTPUT_ID, ' = full(0, rows=1, cols=1)\n']).eval() + def _mark_as_visited(self): self.visited = True # for cleanup - matrix.visited = matrix.visited + [ self ] + matrix.visited = matrix.visited + [self] return self def _register_as_input(self, execute): # TODO: Remove this when automatic registration of frame is resolved - matrix.dml = [ self.ID, ' = load(\" \", format=\"csv\")\n'] + matrix.dml + matrix.dml = [self.ID, ' = load(\" \", format=\"csv\")\n'] + matrix.dml if isinstance(self.eval_data, SUPPORTED_TYPES) and execute: - matrix.script.input(self.ID, convertToMatrixBlock(matrix.sc, self.eval_data)) + matrix.script.input( + self.ID, convertToMatrixBlock( + matrix.sc, self.eval_data)) elif execute: matrix.script.input(self.ID, 
self.toDF()) return self def _register_as_output(self, execute): # TODO: Remove this when automatic registration of frame is resolved - matrix.dml = matrix.dml + ['save(', self.ID, ', \" \")\n'] + matrix.dml = matrix.dml + ['save(', self.ID, ', \" \")\n'] if execute: matrix.script.output(self.ID) - + def _visit(self, execute=True): """ This function is called for two scenarios: @@ -585,7 +640,7 @@ class matrix(object): def print_ast(self): """ Please use m.print_ast() and/or type `m` for debugging. Here is a sample session: - + >>> npm = np.ones((3,3)) >>> m1 = sml.matrix(npm + 3) >>> m2 = sml.matrix(npm + 5) @@ -601,15 +656,15 @@ class matrix(object): - [mVar2] (data). """ return self._print_ast(0) - + def _print_ast(self, numSpaces): - head = ''.join([ ' ' ]*numSpaces + [ '- [', self.ID, '] ' ]) + head = ''.join([' '] * numSpaces + ['- [', self.ID, '] ']) if self.eval_data is not None: out = head + '(data).\n' elif self.op is not None: - ret = [ head, '(op).\n' ] + ret = [head, '(op).\n'] for m in self.op.inputs: - ret = ret + [ m._print_ast(numSpaces + 2) ] + ret = ret + [m._print_ast(numSpaces + 2)] out = ''.join(ret) else: raise ValueError('Either op or data needs to be set') @@ -623,66 +678,81 @@ class matrix(object): This function helps to debug matrix class and also examine the generated PyDML script """ if self.eval_data is None: - print('# This matrix (' + self.ID + ') is backed by below given PyDML script (which is not yet evaluated). To fetch the data of this matrix, invoke toNumPy() or toDF() or toPandas() methods.\n' + eval([self], execute=False)) + print( + '# This matrix (' + + self.ID + + ') is backed by below given PyDML script (which is not yet evaluated). To fetch the data of this matrix, invoke toNumPy() or toDF() or toPandas() methods.\n' + + eval( + [self], + execute=False)) else: - print('# This matrix (' + self.ID + ') is backed by ' + str(type(self.eval_data)) + '. To fetch the DataFrame or NumPy array, invoke toDF() or toNumPy() method respectively.') + print('# This matrix (' + self.ID + ') is backed by ' + str(type(self.eval_data)) + + '. To fetch the DataFrame or NumPy array, invoke toDF() or toNumPy() method respectively.') return '' - - ######################### NumPy related methods ###################################### - + + ######################### NumPy related methods ########################## + __array_priority__ = 10.2 ndim = 2 - + THROW_ARRAY_CONVERSION_ERROR = False - + def __array__(self, dtype=np.double): """ As per NumPy from Python, This method is called to obtain an ndarray object when needed. You should always guarantee this returns an actual ndarray object. - + Using this method, you get back a ndarray object, and subsequent operations on the returned ndarray object will be singlenode. 
""" if not isinstance(self.eval_data, SUPPORTED_TYPES): - # Only warn if there is an unevaluated operation (which could potentially generate large matrix or if data is non-supported singlenode formats) + # Only warn if there is an unevaluated operation (which could + # potentially generate large matrix or if data is non-supported + # singlenode formats) import inspect - frame,filename,line_number,function_name,lines,index = inspect.stack()[1] - msg = 'Conversion from SystemML matrix to NumPy array (occurs in ' + str(filename) + ':' + str(line_number) + ' ' + function_name + ")" + frame, filename, line_number, function_name, lines, index = inspect.stack()[ + 1] + msg = 'Conversion from SystemML matrix to NumPy array (occurs in ' + str( + filename) + ':' + str(line_number) + ' ' + function_name + ")" if matrix.THROW_ARRAY_CONVERSION_ERROR: raise Exception('[ERROR]:' + msg) else: print('[WARN]:' + msg) return np.array(self.toNumPy(), dtype) - + def astype(self, t): # TODO: Throw error if incorrect type return self - + def asfptype(self): return self - - def set_shape(self,shape): + + def set_shape(self, shape): raise NotImplementedError('Reshaping is not implemented') - + def get_shape(self): if self._shape is None: lhsStr, inputs = _matricize(self, []) rlen_ID = _get_new_var_id() clen_ID = _get_new_var_id() multiline_dml = [rlen_ID, ' = ', lhsStr, '.shape(0)\n'] - multiline_dml = multiline_dml + [clen_ID, ' = ', lhsStr, '.shape(1)\n'] - multiline_dml = multiline_dml + [OUTPUT_ID, ' = full(0, rows=2, cols=1)\n'] - multiline_dml = multiline_dml + [ OUTPUT_ID, '[0,0] = ', rlen_ID, '\n' ] - multiline_dml = multiline_dml + [ OUTPUT_ID, '[1,0] = ', clen_ID, '\n' ] + multiline_dml = multiline_dml + \ + [clen_ID, ' = ', lhsStr, '.shape(1)\n'] + multiline_dml = multiline_dml + \ + [OUTPUT_ID, ' = full(0, rows=2, cols=1)\n'] + multiline_dml = multiline_dml + \ + [OUTPUT_ID, '[0,0] = ', rlen_ID, '\n'] + multiline_dml = multiline_dml + \ + [OUTPUT_ID, '[1,0] = ', clen_ID, '\n'] ret = construct_intermediate_node(inputs, multiline_dml).toNumPy() self._shape = tuple(np.array(ret, dtype=int).flatten()) - return self._shape - + return self._shape + shape = property(fget=get_shape, fset=set_shape) - + def __numpy_ufunc__(self, func, method, pos, inputs, **kwargs): """ This function enables systemml matrix to be compatible with NumPy's ufuncs. - + Parameters ---------- func: ufunc object that was called. @@ -699,7 +769,7 @@ class matrix(object): return NotImplemented if len(inputs) == 2: return fn(inputs[0], inputs[1]) - elif len(inputs) == 1: + elif len(inputs) == 1: return fn(inputs[0]) else: raise ValueError('Unsupported number of inputs') @@ -709,98 +779,111 @@ class matrix(object): Stack matrices horizontally (column wise). Invokes cbind internally. """ return binaryMatrixFunction(self, other, 'cbind') - + def vstack(self, other): """ Stack matrices vertically (row wise). Invokes rbind internally. 
""" return binaryMatrixFunction(self, other, 'rbind') - - ######################### Arithmetic operators ###################################### + + ######################### Arithmetic operators ########################### def negative(self): lhsStr, inputs = _matricize(self, []) - return construct_intermediate_node(inputs, [OUTPUT_ID, ' = -', lhsStr, '\n']) - + return construct_intermediate_node( + inputs, [OUTPUT_ID, ' = -', lhsStr, '\n']) + def remainder(self, other): inputs = [] lhsStr, inputs = _matricize(self, inputs) rhsStr, inputs = _matricize(other, inputs) - return construct_intermediate_node(inputs, [OUTPUT_ID, ' = floor(', lhsStr, '/', rhsStr, ') * ', rhsStr, '\n']) - + return construct_intermediate_node( + inputs, [OUTPUT_ID, ' = floor(', lhsStr, '/', rhsStr, ') * ', rhsStr, '\n']) + def ldexp(self, other): inputs = [] lhsStr, inputs = _matricize(self, inputs) rhsStr, inputs = _matricize(other, inputs) - return construct_intermediate_node(inputs, [OUTPUT_ID, ' = ', lhsStr, '* (2**', rhsStr, ')\n']) - + return construct_intermediate_node( + inputs, [OUTPUT_ID, ' = ', lhsStr, '* (2**', rhsStr, ')\n']) + def mod(self, other): inputs = [] lhsStr, inputs = _matricize(self, inputs) rhsStr, inputs = _matricize(other, inputs) - return construct_intermediate_node(inputs, [OUTPUT_ID, ' = ', lhsStr, ' - floor(', lhsStr, '/', rhsStr, ') * ', rhsStr, '\n']) - + return construct_intermediate_node(inputs, [ + OUTPUT_ID, ' = ', lhsStr, ' - floor(', lhsStr, '/', rhsStr, ') * ', rhsStr, '\n']) + def logaddexp(self, other): inputs = [] lhsStr, inputs = _matricize(self, inputs) rhsStr, inputs = _matricize(other, inputs) - return construct_intermediate_node(inputs, [OUTPUT_ID, ' = log(exp(', lhsStr, ') + exp(', rhsStr, '))\n']) - + return construct_intermediate_node( + inputs, [OUTPUT_ID, ' = log(exp(', lhsStr, ') + exp(', rhsStr, '))\n']) + def logaddexp2(self, other): inputs = [] lhsStr, inputs = _matricize(self, inputs) rhsStr, inputs = _matricize(other, inputs) - opStr = _log_base('2**' + lhsStr + '2**' + rhsStr, 2) - return construct_intermediate_node(inputs, [OUTPUT_ID, ' = ', opStr, '\n']) + opStr = _log_base('2**' + lhsStr + '2**' + rhsStr, 2) + return construct_intermediate_node( + inputs, [OUTPUT_ID, ' = ', opStr, '\n']) def log1p(self): inputs = [] lhsStr, inputs = _matricize(self, inputs) - return construct_intermediate_node(inputs, [OUTPUT_ID, ' = log(1 + ', lhsStr, ')\n']) - + return construct_intermediate_node( + inputs, [OUTPUT_ID, ' = log(1 + ', lhsStr, ')\n']) + def exp(self): return unaryMatrixFunction(self, 'exp') def exp2(self): inputs = [] lhsStr, inputs = _matricize(self, inputs) - return construct_intermediate_node(inputs, [OUTPUT_ID, ' = 2**', lhsStr, '\n']) - + return construct_intermediate_node( + inputs, [OUTPUT_ID, ' = 2**', lhsStr, '\n']) + def square(self): inputs = [] lhsStr, inputs = _matricize(self, inputs) - return construct_intermediate_node(inputs, [OUTPUT_ID, ' = ', lhsStr, '**2\n']) - + return construct_intermediate_node( + inputs, [OUTPUT_ID, ' = ', lhsStr, '**2\n']) + def reciprocal(self): inputs = [] lhsStr, inputs = _matricize(self, inputs) - return construct_intermediate_node(inputs, [OUTPUT_ID, ' = 1/', lhsStr, '\n']) - + return construct_intermediate_node( + inputs, [OUTPUT_ID, ' = 1/', lhsStr, '\n']) + def expm1(self): inputs = [] lhsStr, inputs = _matricize(self, inputs) - return construct_intermediate_node(inputs, [OUTPUT_ID, ' = exp(', lhsStr, ') - 1\n']) - + return construct_intermediate_node( + inputs, [OUTPUT_ID, ' = exp(', lhsStr, ') - 1\n']) + 
def ones_like(self): inputs = [] lhsStr, inputs = _matricize(self, inputs) rlen = lhsStr + '.shape(axis=0)' clen = lhsStr + '.shape(axis=1)' - return construct_intermediate_node(inputs, [OUTPUT_ID, ' = full(1, rows=', rlen, ', cols=', clen, ')\n']) - + return construct_intermediate_node( + inputs, [OUTPUT_ID, ' = full(1, rows=', rlen, ', cols=', clen, ')\n']) + def zeros_like(self): inputs = [] lhsStr, inputs = _matricize(self, inputs) rlen = lhsStr + '.shape(axis=0)' clen = lhsStr + '.shape(axis=1)' - return construct_intermediate_node(inputs, [OUTPUT_ID, ' = full(0, rows=', rlen, ', cols=', clen, ')\n']) - + return construct_intermediate_node( + inputs, [OUTPUT_ID, ' = full(0, rows=', rlen, ', cols=', clen, ')\n']) + def log2(self): return self.log(2) - + def log10(self): return self.log(10) - + def log(self, y=None): if y is None: return unaryMatrixFunction(self, 'log') @@ -833,7 +916,7 @@ class matrix(object): def tan(self): return unaryMatrixFunction(self, 'tan') - + def sinh(self): return unaryMatrixFunction(self, 'sinh') @@ -842,7 +925,7 @@ class matrix(object): def tanh(self): return unaryMatrixFunction(self, 'tanh') - + def arcsin(self): return self.asin() @@ -851,7 +934,7 @@ class matrix(object): def arctan(self): return self.atan() - + def asin(self): return unaryMatrixFunction(self, 'asin') @@ -868,8 +951,9 @@ class matrix(object): inputs = [] lhsStr, inputs = _matricize(self, inputs) # 180/pi = 57.2957795131 - return construct_intermediate_node(inputs, [OUTPUT_ID, ' = ', lhsStr, '*57.2957795131\n']) - + return construct_intermediate_node( + inputs, [OUTPUT_ID, ' = ', lhsStr, '*57.2957795131\n']) + def deg2rad(self): """ Convert angles from degrees to radians. @@ -877,10 +961,11 @@ class matrix(object): inputs = [] lhsStr, inputs = _matricize(self, inputs) # pi/180 = 0.01745329251 - return construct_intermediate_node(inputs, [OUTPUT_ID, ' = ', lhsStr, '*0.01745329251\n']) - + return construct_intermediate_node( + inputs, [OUTPUT_ID, ' = ', lhsStr, '*0.01745329251\n']) + def sign(self): - return unaryMatrixFunction(self, 'sign') + return unaryMatrixFunction(self, 'sign') def __add__(self, other): return binary_op(self, other, ' + ') @@ -951,8 +1036,7 @@ class matrix(object): """ return binaryMatrixFunction(self, other, 'dot') - - ######################### Relational/Boolean operators ###################################### + ######################### Relational/Boolean operators ################### def __lt__(self, other): return binary_op(self, other, ' < ') @@ -964,14 +1048,14 @@ class matrix(object): return binary_op(self, other, ' > ') def __ge__(self, other): - return binary_op(self, other,' >= ') + return binary_op(self, other, ' >= ') def __eq__(self, other): return binary_op(self, other, ' == ') def __ne__(self, other): return binary_op(self, other, ' != ') - + # TODO: Cast the output back into scalar and return boolean results def __and__(self, other): return binary_op(other, self, ' & ') @@ -982,12 +1066,13 @@ class matrix(object): def logical_not(self): inputs = [] lhsStr, inputs = _matricize(self, inputs) - return construct_intermediate_node(inputs, [OUTPUT_ID, ' = !', lhsStr, '\n']) - + return construct_intermediate_node( + inputs, [OUTPUT_ID, ' = !', lhsStr, '\n']) + def remove_empty(self, axis=None): """ Removes all empty rows or columns from the input matrix target X according to specified axis. 
-
+
         Parameters
         ----------
         axis : int (0 or 1)
@@ -995,16 +1080,19 @@ class matrix(object):
         if axis is None:
             raise ValueError('axis is a mandatory argument for remove_empty')
         if axis == 0:
-            return self._parameterized_helper_fn(self, 'removeEmpty', { 'target':self, 'margin':'rows' })
+            return self._parameterized_helper_fn(
+                self, 'removeEmpty', {'target': self, 'margin': 'rows'})
         elif axis == 1:
-            return self._parameterized_helper_fn(self, 'removeEmpty', { 'target':self, 'margin':'cols' })
+            return self._parameterized_helper_fn(
+                self, 'removeEmpty', {'target': self, 'margin': 'cols'})
         else:
-            raise ValueError('axis for remove_empty needs to be either 0 or 1.')
-
+            raise ValueError(
+                'axis for remove_empty needs to be either 0 or 1.')
+
     def replace(self, pattern=None, replacement=None):
         """
         Removes all empty rows or columns from the input matrix target X according to specified axis.
-
+
         Parameters
         ----------
         pattern : float or int
@@ -1014,43 +1102,44 @@ class matrix(object):
             raise ValueError('pattern should be of type float or int')
         if replacement is None or not isinstance(replacement, (float, int)):
             raise ValueError('replacement should be of type float or int')
-        return self._parameterized_helper_fn(self, 'replace', { 'target':self, 'pattern':pattern, 'replacement':replacement })
-
+        return self._parameterized_helper_fn(
+            self, 'replace', {'target': self, 'pattern': pattern, 'replacement': replacement})
+
     def _parameterized_helper_fn(self, fnName, **kwargs):
         """
         Helper to invoke parameterized builtin function
         """
         dml_script = ''
         lhsStr, inputs = _matricize(self, [])
-        dml_script = [OUTPUT_ID, ' = ', fnName, '(', lhsStr ]
+        dml_script = [OUTPUT_ID, ' = ', fnName, '(', lhsStr]
         first_arg = True
         for key in kwargs:
             if first_arg:
                 first_arg = False
             else:
-                dml_script = dml_script + [ ', ' ]
+                dml_script = dml_script + [', ']
             v = kwargs[key]
             if isinstance(v, str):
-                dml_script = dml_script + [key, '=\"', v, '\"' ]
+                dml_script = dml_script + [key, '=\"', v, '\"']
             elif isinstance(v, matrix):
                 dml_script = dml_script + [key, '=', v.ID]
             else:
-                dml_script = dml_script + [key, '=', str(v) ]
-        dml_script = dml_script + [ ')\n' ]
+                dml_script = dml_script + [key, '=', str(v)]
+        dml_script = dml_script + [')\n']
         return construct_intermediate_node(inputs, dml_script)
-
-    ######################### Aggregation functions ######################################
+
+    ######################### Aggregation functions ##########################
     def prod(self):
         """
         Return the product of all cells in matrix
         """
         return self._aggFn('prod', None)
-
+
     def sum(self, axis=None):
         """
         Compute the sum along the specified axis
-
+
         Parameters
         ----------
         axis : int, optional
@@ -1060,7 +1149,7 @@ class matrix(object):
     def mean(self, axis=None):
         """
         Compute the arithmetic mean along the specified axis
-
+
         Parameters
         ----------
         axis : int, optional
@@ -1071,17 +1160,17 @@ class matrix(object):
         """
         Compute the variance along the specified axis.
         We assume that delta degree of freedom is 1 (unlike NumPy which assumes ddof=0).
-
+
         Parameters
         ----------
         axis : int, optional
         """
         return self._aggFn('var', axis)
-
+
     def moment(self, moment=1, axis=None):
         """
         Calculates the nth moment about the mean
-
+
         Parameters
         ----------
         moment : int
@@ -1095,31 +1184,60 @@ class matrix(object):
         elif moment == 3 or moment == 4:
             return self._moment_helper(moment, axis)
         else:
-            raise ValueError('The specified moment is not supported:' + str(moment))
-
+            raise ValueError(
+                'The specified moment is not supported:' +
+                str(moment))
+
     def _moment_helper(self, k, axis=0):
         dml_script = ''
         lhsStr, inputs = _matricize(self, [])
-        dml_script = [OUTPUT_ID, ' = moment(', lhsStr, ', ', str(k), ')\n' ]
-        dml_script = [OUTPUT_ID, ' = moment(', lhsStr, ', ', str(k), ')\n' ]
+        dml_script = [OUTPUT_ID, ' = moment(', lhsStr, ', ', str(k), ')\n']
+        dml_script = [OUTPUT_ID, ' = moment(', lhsStr, ', ', str(k), ')\n']
         if axis is None:
-            dml_script = [OUTPUT_ID, ' = moment(full(', lhsStr, ', rows=length(', lhsStr, '), cols=1), ', str(k), ')\n' ]
+            dml_script = [
+                OUTPUT_ID,
+                ' = moment(full(',
+                lhsStr,
+                ', rows=length(',
+                lhsStr,
+                '), cols=1), ',
+                str(k),
+                ')\n']
         elif axis == 0:
-            dml_script = [OUTPUT_ID, ' = full(0, rows=nrow(', lhsStr, '), cols=1)\n' ]
-            dml_script = dml_script + [ 'parfor(i in 1:nrow(', lhsStr, '), check=0):\n' ]
-            dml_script = dml_script + [ '\t', OUTPUT_ID, '[i-1, 0] = moment(full(', lhsStr, '[i-1,], rows=ncol(', lhsStr, '), cols=1), ', str(k), ')\n\n' ]
+            dml_script = [
+                OUTPUT_ID, ' = full(0, rows=nrow(', lhsStr, '), cols=1)\n']
+            dml_script = dml_script + \
+                ['parfor(i in 1:nrow(', lhsStr, '), check=0):\n']
+            dml_script = dml_script + ['\t',
+                                       OUTPUT_ID,
+                                       '[i-1, 0] = moment(full(',
+                                       lhsStr,
+                                       '[i-1,], rows=ncol(',
+                                       lhsStr,
+                                       '), cols=1), ',
+                                       str(k),
+                                       ')\n\n']
         elif axis == 1:
-            dml_script = [OUTPUT_ID, ' = full(0, rows=1, cols=ncol(', lhsStr, '))\n' ]
-            dml_script = dml_script + [ 'parfor(i in 1:ncol(', lhsStr, '), check=0):\n' ]
-            dml_script = dml_script + [ '\t', OUTPUT_ID, '[0, i-1] = moment(', lhsStr, '[,i-1], ', str(k), ')\n\n' ]
+            dml_script = [
+                OUTPUT_ID, ' = full(0, rows=1, cols=ncol(', lhsStr, '))\n']
+            dml_script = dml_script + \
+                ['parfor(i in 1:ncol(', lhsStr, '), check=0):\n']
+            dml_script = dml_script + \
+                ['\t',
+                 OUTPUT_ID,
+                 '[0, i-1] = moment(',
+                 lhsStr,
+                 '[,i-1], ',
+                 str(k),
+                 ')\n\n']
         else:
             raise ValueError('Incorrect axis:' + axis)
         return construct_intermediate_node(inputs, dml_script)
-
+
     def sd(self, axis=None):
         """
         Compute the standard deviation along the specified axis
-
+
         Parameters
         ----------
         axis : int, optional
@@ -1129,7 +1247,7 @@ class matrix(object):
     def max(self, other=None, axis=None):
         """
         Compute the maximum value along the specified axis
-
+
         Parameters
         ----------
         other: matrix or numpy array (& other supported types) or scalar
@@ -1145,7 +1263,7 @@ class matrix(object):
     def min(self, other=None, axis=None):
         """
         Compute the minimum value along the specified axis
-
+
         Parameters
         ----------
         other: matrix or numpy array (& other supported types) or scalar
@@ -1161,7 +1279,7 @@ class matrix(object):
     def argmin(self, axis=None):
         """
         Returns the indices of the minimum values along an axis.
-
+
         Parameters
         ----------
         axis : int, optional (only axis=1, i.e. rowIndexMax is supported in this version)
@@ -1171,7 +1289,7 @@ class matrix(object):
     def argmax(self, axis=None):
         """
         Returns the indices of the maximum values along an axis.
-
+
         Parameters
         ----------
         axis : int, optional (only axis=1, i.e. rowIndexMax is supported in this version)
@@ -1181,7 +1299,7 @@ class matrix(object):
     def cumsum(self, axis=None):
         """
         Returns the indices of the maximum values along an axis.
-
+
         Parameters
         ----------
         axis : int, optional (only axis=0, i.e. cumsum along the rows is supported in this version)
@@ -1209,26 +1327,29 @@ class matrix(object):
         if axis is None:
             dml_script = [OUTPUT_ID, ' = ', fnName, '(', lhsStr, ')\n']
         else:
-            dml_script = [OUTPUT_ID, ' = ', fnName, '(', lhsStr, ', axis=', str(axis) ,')\n']
+            dml_script = [OUTPUT_ID, ' = ', fnName,
+                          '(', lhsStr, ', axis=', str(axis), ')\n']
         return construct_intermediate_node(inputs, dml_script)

-    ######################### Indexing operators ######################################
+    ######################### Indexing operators #############################
     def __getitem__(self, index):
         """
         Implements evaluation of right indexing operations such as m[1,1], m[0:1,], m[:, 0:1]
         """
-        return construct_intermediate_node([self], [OUTPUT_ID, ' = ', self.ID ] + getIndexingDML(index) + [ '\n' ])
+        return construct_intermediate_node(
+            [self], [OUTPUT_ID, ' = ', self.ID] + getIndexingDML(index) + ['\n'])

     # Performs deep copy if the matrix is backed by data
     def _prepareForInPlaceUpdate(self):
         temp = matrix(self.eval_data, op=self.op)
         for op in self.referenced:
-            op.inputs = [temp if x.ID==self.ID else x for x in op.inputs]
-        self.ID, temp.ID = temp.ID, self.ID # Copy even the IDs as the IDs might be used to create DML
+            op.inputs = [temp if x.ID == self.ID else x for x in op.inputs]
+        # Copy even the IDs as the IDs might be used to create DML
+        self.ID, temp.ID = temp.ID, self.ID
         self.op = DMLOp([temp], dml=[self.ID, " = ", temp.ID])
         self.eval_data = None
-        temp.referenced = self.referenced + [ self.op ]
+        temp.referenced = self.referenced + [self.op]
         self.referenced = []

     def __setitem__(self, index, value):
@@ -1237,10 +1358,64 @@ class matrix(object):
         """
         self._prepareForInPlaceUpdate()
         if isinstance(value, matrix) or isinstance(value, DMLOp):
-            self.op.inputs = self.op.inputs + [ value ]
+            self.op.inputs = self.op.inputs + [value]
         if isinstance(value, matrix):
-            value.referenced = value.referenced + [ self.op ]
-        self.op.dml = self.op.dml + [ '\n', self.ID ] + getIndexingDML(index) + [ ' = ', getValue(value), '\n']
-
-    # Not implemented: conj, hyperbolic/inverse-hyperbolic functions(i.e. sinh, arcsinh, cosh, ...), bitwise operator, xor operator, isreal, iscomplex, isfinite, isinf, isnan, copysign, nextafter, modf, frexp, trunc
-    _numpy_to_systeml_mapping = {np.add: __add__, np.subtract: __sub__, np.multiply: __mul__, np.divide: __div__, np.logaddexp: logaddexp, np.true_divide: __truediv__, np.floor_divide: __floordiv__, np.negative: negative, np.power: __pow__, np.remainder: remainder, np.mod: mod, np.fmod: __mod__, np.absolute: abs, np.rint: round, np.sign: sign, np.exp: exp, np.exp2: exp2, np.log: log, np.log2: log2, np.log10: log10, np.expm1: expm1, np.log1p: log1p, np.sqrt: sqrt, np.square: square, np.reciprocal: reciprocal, np.ones_like: ones_like, np.zeros_like: zeros_like, np.sin: sin, np.cos: cos, np.tan: tan, np.arcsin: arcsin, np.arccos: arccos, np.arctan: arctan, np.deg2rad: deg2rad, np.rad2deg: rad2deg, np.greater: __gt__, np.greater_equal: __ge__, np.less: __lt__, np.less_equal: __le__, np.not_equal: __ne__, np.equal: __eq__, np.logical_not: logical_not, np.logical_and: __and__, np.logical_or: __or__, np.maximum: max, np.minimum: min, np.signbit: sign, np.ldexp: ldexp, np.dot:dot}
+            value.referenced = value.referenced + [self.op]
+        self.op.dml = self.op.dml + \
+            ['\n', self.ID] + \
+            getIndexingDML(index) + [' = ', getValue(value), '\n']
+
+    # Not implemented: conj, hyperbolic/inverse-hyperbolic functions(i.e.
+    # sinh, arcsinh, cosh, ...), bitwise operator, xor operator, isreal,
+    # iscomplex, isfinite, isinf, isnan, copysign, nextafter, modf, frexp,
+    # trunc
+    _numpy_to_systeml_mapping = {
+        np.add: __add__,
+        np.subtract: __sub__,
+        np.multiply: __mul__,
+        np.divide: __div__,
+        np.logaddexp: logaddexp,
+        np.true_divide: __truediv__,
+        np.floor_divide: __floordiv__,
+        np.negative: negative,
+        np.power: __pow__,
+        np.remainder: remainder,
+        np.mod: mod,
+        np.fmod: __mod__,
+        np.absolute: abs,
+        np.rint: round,
+        np.sign: sign,
+        np.exp: exp,
+        np.exp2: exp2,
+        np.log: log,
+        np.log2: log2,
+        np.log10: log10,
+        np.expm1: expm1,
+        np.log1p: log1p,
+        np.sqrt: sqrt,
+        np.square: square,
+        np.reciprocal: reciprocal,
+        np.ones_like: ones_like,
+        np.zeros_like: zeros_like,
+        np.sin: sin,
+        np.cos: cos,
+        np.tan: tan,
+        np.arcsin: arcsin,
+        np.arccos: arccos,
+        np.arctan: arctan,
+        np.deg2rad: deg2rad,
+        np.rad2deg: rad2deg,
+        np.greater: __gt__,
+        np.greater_equal: __ge__,
+        np.less: __lt__,
+        np.less_equal: __le__,
+        np.not_equal: __ne__,
+        np.equal: __eq__,
+        np.logical_not: logical_not,
+        np.logical_and: __and__,
+        np.logical_or: __or__,
+        np.maximum: max,
+        np.minimum: min,
+        np.signbit: sign,
+        np.ldexp: ldexp,
+        np.dot: dot}
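
[Reviewer note, not part of the patch] The hunks above only re-wrap the public matrix API (remove_empty, replace, the aggregation functions, and the indexing operators) without changing behaviour. The short sketch below may help reviewers eyeball the reformatted signatures. It is illustrative only: it assumes a PySpark shell with a live SparkContext `sc`, and it uses setSparkContext() and toNumPy() from other parts of defmatrix.py that are not shown in this diff; the sample data and variable names are made up.

    # Hedged sketch, not part of this commit: exercises the methods whose
    # formatting changed above, assuming PySpark with a live SparkContext `sc`.
    import numpy as np
    import systemml as sml

    sml.setSparkContext(sc)  # register the running SparkContext with SystemML

    X = sml.matrix(np.array([[1., 0., 2.],
                             [0., 0., 0.],
                             [3., 0., 4.]]))

    col_sums = X.sum(axis=0)                           # lazy aggregation via _aggFn
    no_empty = X.remove_empty(axis=0)                  # removeEmpty with margin='rows'
    relabeled = X.replace(pattern=0, replacement=-1)   # parameterized builtin 'replace'
    row_arg = X.argmax(axis=1)                         # only axis=1 is supported here
    second = X.moment(moment=2, axis=None)             # moment about the mean

    top_left = X[0:2, 0:2]   # __getitem__ builds right-indexing DML
    X[0, 0] = 42.0           # __setitem__ goes through _prepareForInPlaceUpdate()

    # NumPy ufuncs such as np.add are associated with the corresponding
    # operators (here __add__) via _numpy_to_systeml_mapping.
    print(no_empty.toNumPy())
    print(relabeled.toNumPy())

Because these hunks only re-wrap lines, move comments, and adjust whitespace, the sketch should produce identical results before and after the patch.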
