Github user mktal commented on a diff in the pull request:
https://github.com/apache/incubator-madlib/pull/10#discussion_r52060951
--- Diff: src/ports/postgres/modules/svm/kernel_approximation.py_in ---
@@ -0,0 +1,481 @@
+from __future__ import division
+
+import plpy
+
+from utilities.utilities import unique_string
+from utilities.utilities import extract_keyvalue_params
+from utilities.utilities import num_features
+
+from math import sqrt
+from math import pi
+
+
+class GaussianKernelBase(object):
+ """docstring for gaussianKernel"""
+ def __init__(self, gamma, n_components, random_state,
+ random_weights, random_offset, id_col, val_col,
+ orig_data, **kwargs):
+ self.kernel_func = 'gaussian'
+ self.gamma = gamma
+ self.n_components = n_components
+ # int32 seed used by boost::minstd_rand
+ self.random_state = random_state
+ # random operators
+ self.rd_weights = random_weights
+ self.rd_offset = random_offset
+ # val column in random operators
+ self.rd_val = val_col
+ # id column in random operators
+ self.rd_id = id_col
+ self.transformed_table = dict()
+ self.original_table = dict()
+ # indicate whether rd_weights and rd_offset is view or table
+ # store the original data table name if they are view
+ # None if they are table
+ self.orig_data = orig_data
+
+ def clear(self):
+ data_type = 'view' if self.orig_data else 'table'
+ if self.rd_weights:
+ plpy.execute("drop {data_type} if exists {data};".format(
+ data=self.rd_weights,
+ data_type=data_type))
+ if self.rd_offset:
+ plpy.execute("drop {data_type} if exists {data};".format(
+ data=self.rd_offset,
+ data_type=data_type))
+
+ def __del__(self):
+ self.clear()
+
+ def saveAs(self, name):
+ if self.orig_data:
+ plpy.warning("Gaussian Kernel Warning: no need to save."
+ "Original data table exists: {0}"
+ .format(self.orig_data))
+ return
+
+ run_sql = """
+ create table {name} as
+ select
+ {id} as id, {val} as val,
+ 'offsets' as desp
+ from {rd_offset}
+ union
+ select
+ {id} as id, {val} as val,
+ 'weights' as desp
+ from {rd_weights}
+ """.format(name=name,
+ id=self.rd_id,
+ val=self.rd_val,
+ rd_offset=self.rd_offset,
+ rd_weights=self.rd_weights)
+ plpy.execute(run_sql)
+
+ @classmethod
+ def parse_params(cls, kernel_params='', n_features=10):
+ params_default = {
+ 'in_memory': 1,
+ 'gamma': 1/n_features,
+ 'random_state': 1,
+ 'n_components': 2*n_features}
+ params_types = {
+ 'in_memory': int,
+ 'gamma': float,
+ 'random_state': int,
+ 'n_components': int}
+ return extract_keyvalue_params(kernel_params,
+ params_types,
+ params_default)
+
+ @classmethod
+ def create(cls, schema_madlib, n_features, kernel_params):
+ params = cls.parse_params(kernel_params, n_features)
+ in_memory = params.pop('in_memory', True)
+ # according to the 1gb limit on each entry of the table
+ nelems = params['n_components']*n_features
+ if in_memory and nelems <= 1e8:
+ return GaussianKernelInMemory(schema_madlib, **params)
+ else:
+ return GaussianKernel(schema_madlib, **params)
+
+ @classmethod
+ def loadFrom(cls, schema_madlib, data, kernel_params=''):
+ rd_weights = unique_string(desp='random_weights')
+ rd_offset = unique_string(desp='random_offsets')
+ rd_val = unique_string(desp='val')
+ rd_id = unique_string(desp='id')
+ plpy.execute("""
+ drop view if exists {rd_weights};
+ create temp view {rd_weights} as
+ select id as {rd_id}, val as {rd_val} from {data}
+ where desp = 'weights';
+
+ drop view if exists {rd_offset};
+ create temp view {rd_offset} as
+ select id as {rd_id}, val as {rd_val} from {data}
+ where desp = 'offsets';
+ """.format(**locals()))
+ params = cls.parse_params(kernel_params)
+ in_memory = params.pop('in_memory', True)
--- End diff --
I understand. I used **kwargs in the base class, but in the derived class I
am trying to make the interface more restrictive so that no unexpected
variables get passed to the base class through, for example,
super().__init__(**locals()). Plus, this way it is clearer to the caller
which arguments are expected when creating the object.
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---