Repository: madlib Updated Branches: refs/heads/master 53db7363a -> 657cf4aa4
SVM: Add minibatch as a new solver This work is based on the original work by Xiaocheng Tang <xiaochen...@gmail.com> in #75. This PR adds two main features: - A Minibatch solver that takes as input a batch of data - SVM code that takes advantage of the minibatch Closes #229 Co-authored by: Nikhil Kak <n...@pivotal.io> Co-authored by: Xiaocheng Tang <xiaochen...@gmail.com> Project: http://git-wip-us.apache.org/repos/asf/madlib/repo Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/657cf4aa Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/657cf4aa Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/657cf4aa Branch: refs/heads/master Commit: 657cf4aa4823731353aaa47db47b9a9b241edaaf Parents: 53db736 Author: Rahul Iyer <ri...@apache.org> Authored: Fri Feb 2 14:15:01 2018 -0800 Committer: Rahul Iyer <ri...@apache.org> Committed: Fri Feb 2 16:40:49 2018 -0800 ---------------------------------------------------------------------- .gitignore | 1 + src/dbal/EigenIntegration/HandleMap_proto.hpp | 3 +- src/modules/convex/algo/igd.hpp | 81 ++++++++- src/modules/convex/linear_svm_igd.cpp | 159 +++++++++++++++++ src/modules/convex/linear_svm_igd.hpp | 6 + src/modules/convex/task/linear_svm.hpp | 67 +++++++- src/modules/convex/type/model.hpp | 71 +++++--- src/modules/convex/type/state.hpp | 79 +++++++++ src/modules/convex/type/tuple.hpp | 3 + src/ports/postgres/modules/svm/svm.py_in | 170 +++++++++++++------ src/ports/postgres/modules/svm/svm.sql_in | 117 +++++++++++-- src/ports/postgres/modules/svm/test/svm.sql_in | 104 ++++++++++++ .../modules/utilities/validate_args.py_in | 19 +-- 13 files changed, 774 insertions(+), 106 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/madlib/blob/657cf4aa/.gitignore ---------------------------------------------------------------------- diff --git a/.gitignore b/.gitignore index 00dc016..a073fbd 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,7 @@ auto *.swp *.fdb_latexmk *.swo # vim swap file +\#*\# # emacs backup file # Biblatex temporary files *-blx.bib http://git-wip-us.apache.org/repos/asf/madlib/blob/657cf4aa/src/dbal/EigenIntegration/HandleMap_proto.hpp ---------------------------------------------------------------------- diff --git a/src/dbal/EigenIntegration/HandleMap_proto.hpp b/src/dbal/EigenIntegration/HandleMap_proto.hpp index 4bfe7c5..c7b6cb2 100644 --- a/src/dbal/EigenIntegration/HandleMap_proto.hpp +++ b/src/dbal/EigenIntegration/HandleMap_proto.hpp @@ -19,6 +19,7 @@ namespace eigen_integration { template <class EigenType, class Handle, int MapOptions = Eigen::Unaligned> class HandleMap : public Eigen::Map<EigenType, MapOptions> { public: + typedef EigenType PlainEigenType; typedef Eigen::Map<EigenType, MapOptions> Base; typedef typename Base::Scalar Scalar; typedef typename Base::Index Index; @@ -57,7 +58,7 @@ public: * * For example, this allows construction of MappedColumnVector from * MappedMatrix::col(int) or NativeColumnVector, etc. - */ + */ template <class Derived> HandleMap(const Eigen::MapBase<Derived>& inMappedData, typename boost::enable_if_c<Derived::IsVectorAtCompileTime>::type* = 0) http://git-wip-us.apache.org/repos/asf/madlib/blob/657cf4aa/src/modules/convex/algo/igd.hpp ---------------------------------------------------------------------- diff --git a/src/modules/convex/algo/igd.hpp b/src/modules/convex/algo/igd.hpp index cd17e64..3ae4c13 100644 --- a/src/modules/convex/algo/igd.hpp +++ b/src/modules/convex/algo/igd.hpp @@ -34,7 +34,9 @@ public: typedef typename Task::model_type model_type; static void transition(state_type &state, const tuple_type &tuple); + static void transitionInMiniBatch(state_type &state, const tuple_type &tuple); static void merge(state_type &state, const_state_type &otherState); + static void mergeInPlace(state_type &state, const_state_type &otherState); static void final(state_type &state); }; @@ -56,6 +58,62 @@ IGD<State, ConstState, Task>::transition(state_type &state, state.task.stepsize * tuple.weight); } +/** + * @brief Update the transition state in mini-batches + * + * Note: We assume that + * 1. Task defines a model_eigen_type + * 2. A batch of tuple.indVar is a Matrix + * 3. A batch of tuple.depVar is a ColumnVector + * 4. Task defines a getLossAndUpdateModel method + * + */ + template <class State, class ConstState, class Task> + void + IGD<State, ConstState, Task>::transitionInMiniBatch( + state_type &state, + const tuple_type &tuple) { + + madlib_assert(tuple.indVar.rows() == tuple.depVar.rows(), + std::runtime_error("Invalid data. Independent and dependent " + "batches don't have same number of rows.")); + + int batch_size = state.algo.batchSize; + int n_epochs = state.algo.nEpochs; + + // n_rows/n_ind_cols are the rows/cols in a transition tuple. + int n_rows = tuple.indVar.rows(); + int n_ind_cols = tuple.indVar.cols(); + int n_batches = n_rows < batch_size ? 1 : + n_rows / batch_size + + int(n_rows%batch_size > 0); + + for (int curr_epoch=0; curr_epoch < n_epochs; curr_epoch++) { + double loss = 0.0; + for (int curr_batch=0, curr_batch_row_index=0; curr_batch < n_batches; + curr_batch++, curr_batch_row_index += batch_size) { + Matrix X_batch; + ColumnVector y_batch; + if (curr_batch == n_batches-1) { + // last batch + X_batch = tuple.indVar.bottomRows(n_rows-curr_batch_row_index); + y_batch = tuple.depVar.tail(n_rows-curr_batch_row_index); + } else { + X_batch = tuple.indVar.block(curr_batch_row_index, 0, batch_size, n_ind_cols); + y_batch = tuple.depVar.segment(curr_batch_row_index, batch_size); + } + loss += Task::getLossAndUpdateModel( + state.task.model, X_batch, y_batch, state.task.stepsize); + } + + // The first epoch will most likely have the highest loss. + // Being pessimistic, use the total loss only from the first epoch. + if (curr_epoch==0) state.algo.loss += loss; + } + return; + } + + template <class State, class ConstState, class Task> void IGD<State, ConstState, Task>::merge(state_type &state, @@ -86,11 +144,32 @@ IGD<State, ConstState, Task>::merge(state_type &state, template <class State, class ConstState, class Task> void +IGD<State, ConstState, Task>::mergeInPlace(state_type &state, + const_state_type &otherState) { + // avoid division by zero + if (state.algo.numRows == 0) { + state.task.model = otherState.task.model; + return; + } else if (otherState.algo.numRows == 0) { + return; + } + + // model averaging, weighted by rows seen + double leftRows = static_cast<double>(state.algo.numRows + state.algo.numRows); + double rightRows = static_cast<double>(otherState.algo.numRows + otherState.algo.numRows); + double totalNumRows = leftRows + rightRows; + state.task.model *= leftRows / rightRows; + state.task.model += otherState.task.model; + state.task.model *= rightRows / totalNumRows; +} + +template <class State, class ConstState, class Task> +void IGD<State, ConstState, Task>::final(state_type &state) { // The reason that we have to keep the task.model untouched in transition // funtion: loss computation needs the model from last iteration cleanly - state.task.model = state.algo.incrModel; + } } // namespace convex http://git-wip-us.apache.org/repos/asf/madlib/blob/657cf4aa/src/modules/convex/linear_svm_igd.cpp ---------------------------------------------------------------------- diff --git a/src/modules/convex/linear_svm_igd.cpp b/src/modules/convex/linear_svm_igd.cpp index f396250..90882a3 100644 --- a/src/modules/convex/linear_svm_igd.cpp +++ b/src/modules/convex/linear_svm_igd.cpp @@ -32,6 +32,10 @@ typedef IGD<GLMIGDState<MutableArrayHandle<double> >, GLMIGDState<ArrayHandle<double> >, LinearSVM<GLMModel, GLMTuple > > LinearSVMIGDAlgorithm; +typedef IGD<SVMMinibatchState<MutableArrayHandle<double> >, + SVMMinibatchState<ArrayHandle<double> >, + LinearSVM<GLMModel, SVMMiniBatchTuple > > LinearSVMIGDAlgoMiniBatch; + typedef Loss<GLMIGDState<MutableArrayHandle<double> >, GLMIGDState<ArrayHandle<double> >, LinearSVM<GLMModel, GLMTuple > > LinearSVMLossAlgorithm; @@ -121,6 +125,98 @@ linear_svm_igd_transition::run(AnyType &args) { } /** + * @brief Perform the linear support vector machine transition step + * + * Called for each tuple. + */ +AnyType +linear_svm_igd_minibatch_transition::run(AnyType &args) { + // The real state. + // For the first tuple: args[0] is nothing more than a marker that + // indicates that we should do some initial operations. + // For other tuples: args[0] holds the computation state until last tuple + SVMMinibatchState<MutableArrayHandle<double> > state = args[0]; + + // initialize the state if first tuple + if (state.algo.numRows == 0) { + + LinearSVM<GLMModel, GLMTuple >::epsilon = args[9].getAs<double>();; + LinearSVM<GLMModel, GLMTuple >::is_svc = args[10].getAs<bool>();; + if (!args[3].isNull()) { + SVMMinibatchState<ArrayHandle<double> > previousState = args[3]; + state.allocate(*this, previousState.task.nFeatures); + state = previousState; + } else { + // configuration parameters + uint32_t dimension = args[4].getAs<uint32_t>(); + state.allocate(*this, dimension); // with zeros + } + // resetting in either case + // state.reset(); + state.task.stepsize = args[5].getAs<double>(); + const double lambda = args[6].getAs<double>(); + const bool isL2 = args[7].getAs<bool>(); + const int nTuples = args[8].getAs<int>(); + + // The regularization operations called below (scaling and clipping) + // need these class variables to be set. + L1<GLMModel>::n_tuples = nTuples; + L2<GLMModel>::n_tuples = nTuples; + if (isL2) + L2<GLMModel>::lambda = lambda; + else + L1<GLMModel>::lambda = lambda; + } + + state.algo.nEpochs = args[12].getAs<int>(); + state.algo.batchSize = args[13].getAs<int>(); + + // Skip the current record if args[1] (features) contains NULL values, + // or args[2] is NULL + try { + args[1].getAs<MappedMatrix>(); + } catch (const ArrayWithNullException &e) { + return args[0]; + } + if (args[2].isNull()) + return args[0]; + + // tuple + using madlib::dbal::eigen_integration::MappedColumnVector; + + MappedMatrix x(NULL); + MappedColumnVector y(NULL); + try { + new (&x) MappedMatrix(args[1].getAs<MappedMatrix>()); + new (&y) MappedColumnVector(args[2].getAs<MappedColumnVector>()); + } catch (const ArrayWithNullException &e) { + return args[0]; + } + SVMMiniBatchTuple tuple; + tuple.indVar = trans(x); + tuple.depVar = y; + + // each tuple can be weighted - this can be combination of the sample weight + // and the class weight. Calling function is responsible for combining the two + // into a single tuple weight. The default value for this parameter is 1, set + // into the definition of "tuple". + // The weight is used to increase the value of a particular tuple for the online + // learning. The weight is not used for the loss computation. + tuple.weight = args[11].getAs<double>(); + + + // Now do the transition step + // apply Minibatching with regularization + L2<GLMModel>::scaling(state.task.model, state.task.stepsize); + LinearSVMIGDAlgoMiniBatch::transitionInMiniBatch(state, tuple); + L1<GLMModel>::clipping(state.task.model, state.task.stepsize); + + state.algo.numRows += x.cols(); + return state; +} + + +/** * @brief Perform the perliminary aggregation function: Merge transition states */ AnyType @@ -146,6 +242,30 @@ linear_svm_igd_merge::run(AnyType &args) { } /** + * @brief Perform the perliminary aggregation function: Merge transition states + */ +AnyType +linear_svm_igd_minibatch_merge::run(AnyType &args) { + SVMMinibatchState<MutableArrayHandle<double> > stateLeft = args[0]; + SVMMinibatchState<ArrayHandle<double> > stateRight = args[1]; + + // We first handle the trivial case where this function is called with one + // of the states being the initial state + if (stateLeft.algo.numRows == 0) { return stateRight; } + else if (stateRight.algo.numRows == 0) { return stateLeft; } + + // Merge states together + LinearSVMIGDAlgoMiniBatch::mergeInPlace(stateLeft, stateRight); + + // The following numRows update, cannot be put above, because the model + // averaging depends on their original values + stateLeft.algo.numRows += stateRight.algo.numRows; + stateLeft.algo.loss += stateRight.algo.loss; + + return stateLeft; +} + +/** * @brief Perform the linear support vector machine final step */ AnyType @@ -172,6 +292,29 @@ linear_svm_igd_final::run(AnyType &args) { } /** + * @brief Perform the linear support vector machine final step + */ +AnyType +linear_svm_igd_minibatch_final::run(AnyType &args) { + // We request a mutable object. Depending on the backend, this might perform + // a deep copy. + SVMMinibatchState<MutableArrayHandle<double> > state = args[0]; + // Aggregates that haven't seen any data just return Null. + if (state.algo.numRows == 0) { return Null(); } + state.algo.loss = state.algo.loss / state.algo.numRows; + return state; +} + +AnyType +internal_linear_svm_igd_minibatch_distance::run(AnyType &args) { + SVMMinibatchState<ArrayHandle<double> > stateLeft = args[0]; + SVMMinibatchState<ArrayHandle<double> > stateRight = args[1]; + + return std::abs((stateLeft.algo.loss - stateRight.algo.loss) + / stateLeft.algo.loss); +} + +/** * @brief Return the difference in RMSE between two states */ AnyType @@ -199,6 +342,22 @@ internal_linear_svm_igd_result::run(AnyType &args) { return tuple; } +/** + * @brief Return the coefficients and diagnostic statistics of the state + */ +AnyType +internal_linear_svm_igd_minibatch_result::run(AnyType &args) { + SVMMinibatchState<ArrayHandle<double> > state = args[0]; + + AnyType tuple; + tuple << state.task.model + << static_cast<double>(state.algo.loss) + << 0. + << static_cast<int64_t>(state.algo.numRows); + + return tuple; +} + } // namespace convex } // namespace modules http://git-wip-us.apache.org/repos/asf/madlib/blob/657cf4aa/src/modules/convex/linear_svm_igd.hpp ---------------------------------------------------------------------- diff --git a/src/modules/convex/linear_svm_igd.hpp b/src/modules/convex/linear_svm_igd.hpp index afe169e..0958b0f 100644 --- a/src/modules/convex/linear_svm_igd.hpp +++ b/src/modules/convex/linear_svm_igd.hpp @@ -8,26 +8,32 @@ * @brief Linear support vector machine (incremental gradient): Transition function */ DECLARE_UDF(convex, linear_svm_igd_transition) +DECLARE_UDF(convex, linear_svm_igd_minibatch_transition) /** * @brief Linear support vector machine (incremental gradient): State merge function */ DECLARE_UDF(convex, linear_svm_igd_merge) +DECLARE_UDF(convex, linear_svm_igd_minibatch_merge) /** * @brief Linear support vector machine (incremental gradient): Final function */ DECLARE_UDF(convex, linear_svm_igd_final) +DECLARE_UDF(convex, linear_svm_igd_minibatch_final) /** * @brief Linear support vector machine (incremental gradient): Difference in * log-likelihood between two transition states */ DECLARE_UDF(convex, internal_linear_svm_igd_distance) +DECLARE_UDF(convex, internal_linear_svm_igd_minibatch_distance) + /** * @brief Linear support vector machine (incremental gradient): Convert * transition state to result tuple */ DECLARE_UDF(convex, internal_linear_svm_igd_result) +DECLARE_UDF(convex, internal_linear_svm_igd_minibatch_result) http://git-wip-us.apache.org/repos/asf/madlib/blob/657cf4aa/src/modules/convex/task/linear_svm.hpp ---------------------------------------------------------------------- diff --git a/src/modules/convex/task/linear_svm.hpp b/src/modules/convex/task/linear_svm.hpp index 136d73b..7146432 100644 --- a/src/modules/convex/task/linear_svm.hpp +++ b/src/modules/convex/task/linear_svm.hpp @@ -22,10 +22,16 @@ class LinearSVM { public: typedef Model model_type; typedef Tuple tuple_type; - typedef typename Tuple::independent_variables_type - independent_variables_type; + + typedef typename Tuple::independent_variables_type independent_variables_type; typedef typename Tuple::dependent_variable_type dependent_variable_type; + // Model is assumed to be base Eigen type or Eigen map and the 'EigenType' + // variable infers the actual type from the Model definition. + // For eg. SVMModel is defined as a ColumnVectorTransparentHandleMap which + // has a ColumnVector as its EigenType. + typedef typename model_type::PlainEigenType coefficient_type; + static double epsilon; static bool is_svc; @@ -41,6 +47,12 @@ public: const dependent_variable_type &y, const double &stepsize); + static double getLossAndUpdateModel( + model_type &model, + const independent_variables_type &x, + const dependent_variable_type &y, + const double &stepsize); + static double loss( const model_type &model, const independent_variables_type &x, @@ -101,6 +113,57 @@ LinearSVM<Model, Tuple>::gradientInPlace( } } +/** +* @brief This function will update the model for a single batch and return the loss +* @param model Model to update +* @param x Batch of independent variables +* @param y Batch of dependent variables +* @param stepsize Learning rate for model update +* @return Total loss in the batch +*/ +template <class Model, class Tuple> +double +LinearSVM<Model, Tuple>::getLossAndUpdateModel( + model_type &model, + const independent_variables_type &x, + const dependent_variable_type &y, + const double &stepsize){ + + // This function is called by the minibatch transition function to update + // the model for each batch. x and y in the function signature are defined + // as generic variables to ensure a consistent interface across all modules. + + // ASSUMPTION: 'gradient' will always be of the same type as the + // coefficients. In SVM, the model is just the coefficients, but can be + // more complex with other modules like MLP. + coefficient_type gradient = model; + gradient.setZero(); + coefficient_type w_transpose_x = x * model; + double loss = 0.0; + int batch_size = x.rows(); + double dist_from_hyperplane = 0.0; + double c = 0.0; + int n_points_with_positive_dist = 0; + for (int i = 0; i < batch_size; i++) { + if (is_svc) { + c = -y(i); // minus for "-loglik" + dist_from_hyperplane = 1.0 - w_transpose_x(i) * y(i); + } else { + double wx_y = w_transpose_x(i) - y(i); + c = wx_y > 0 ? 1.0 : -1.0; + dist_from_hyperplane = c * wx_y - epsilon; + } + if (dist_from_hyperplane > 0.) { + gradient += c * x.row(i); + loss += dist_from_hyperplane; + n_points_with_positive_dist++; + } + } + gradient.array() /= n_points_with_positive_dist; + model -= stepsize * gradient; + return loss; +} + template <class Model, class Tuple> double LinearSVM<Model, Tuple>::loss( http://git-wip-us.apache.org/repos/asf/madlib/blob/657cf4aa/src/modules/convex/type/model.hpp ---------------------------------------------------------------------- diff --git a/src/modules/convex/type/model.hpp b/src/modules/convex/type/model.hpp index 679dab4..4f534e4 100644 --- a/src/modules/convex/type/model.hpp +++ b/src/modules/convex/type/model.hpp @@ -93,17 +93,20 @@ struct LMFModel { } }; -// Generalized Linear Models (GLMs): Logistic regression, Linear SVM typedef HandleTraits<MutableArrayHandle<double> >::ColumnVectorTransparentHandleMap GLMModel; +typedef HandleTraits<MutableArrayHandle<double> >::ColumnVectorTransparentHandleMap + SVMModel; + // The necessity of this wrapper is to allow classes in algo/ and task/ to // have a type that they can template over template <class Handle> struct MLPModel { - typename HandleTraits<Handle>::ReferenceToUInt16 is_classification; - typename HandleTraits<Handle>::ReferenceToUInt16 activation; - std::vector<Eigen::Map<Matrix > > u; + typename HandleTraits<Handle>::ReferenceToDouble is_classification; + typename HandleTraits<Handle>::ReferenceToDouble activation; + // std::vector<Eigen::Map<Matrix > > u; + std::vector<MutableMappedMatrix> u; /** * @brief Space needed. @@ -120,8 +123,8 @@ struct MLPModel { size_t N = inNumberOfStages; const double *n = inNumbersOfUnits; size_t k; - for (k = 1; k <= N; k ++) { - size += (n[k-1] + 1) * (n[k]); + for (k = 0; k < N; k ++) { + size += (n[k] + 1) * (n[k+1]); } return size; // weights (u) } @@ -140,71 +143,87 @@ struct MLPModel { uint32_t sizeOfU = 0; u.clear(); - for (k = 1; k <= N; k ++) { - u.push_back(Eigen::Map<Matrix >( - const_cast<double*>(data + sizeOfU), - n[k-1] + 1, n[k])); - sizeOfU += (n[k-1] + 1) * (n[k]); + for (k = 0; k < N; k ++) { + // u.push_back(Eigen::Map<Matrix >( + // const_cast<double*>(data + sizeOfU), + // n[k] + 1, n[k+1])); + u.push_back(MutableMappedMatrix()); + u[k].rebind(const_cast<double *>(data + sizeOfU), n[k] + 1, n[k+1]); + sizeOfU += (n[k] + 1) * (n[k+1]); } return sizeOfU; } + void initialize(const uint16_t &inNumberOfStages, + const double *inNumbersOfUnits){ + size_t N = inNumberOfStages; + const double *n = inNumbersOfUnits; + size_t k; + double span; + for (k =0; k < N; ++k){ + // Initalize according to Glorot and Bengio (2010) + // See design doc for more info + span = sqrt(6.0 / (n[k] + n[k+1])); + u[k] << span * Matrix::Random(u[k].rows(), u[k].cols()); + } + } + double norm() const { double norm = 0.; size_t k; for (k = 0; k < u.size(); k ++) { - norm+=u[k].bottomRows(u[k].rows()-1).squaredNorm(); + norm += u[k].bottomRows(u[k].rows()-1).squaredNorm(); } return std::sqrt(norm); } void setZero(){ size_t k; - for (k = 1; k <= u.size(); k ++) { - u[k-1].setZero(); + for (k = 0; k < u.size(); k ++) { + u[k].setZero(); } } /* * Some operator wrappers for u. */ - MLPModel &operator*=(const double &c) { + MLPModel& operator*=(const double &c) { // Note that when scaling the model, you should // not update the bias. size_t k; - for (k = 1; k <= u.size(); k ++) { - u[k-1] *= c; + for (k = 0; k < u.size(); k ++) { + u[k] *= c; } return *this; } template<class OtherHandle> - MLPModel &operator-=(const MLPModel<OtherHandle> &inOtherModel) { + MLPModel& operator-=(const MLPModel<OtherHandle> &inOtherModel) { size_t k; - for (k = 1; k <= u.size() && k <= inOtherModel.u.size(); k ++) { - u[k-1] -= inOtherModel.u[k-1]; + for (k = 0; k < u.size() && k < inOtherModel.u.size(); k ++) { + u[k] -= inOtherModel.u[k]; } return *this; } template<class OtherHandle> - MLPModel &operator+=(const MLPModel<OtherHandle> &inOtherModel) { + MLPModel& operator+=(const MLPModel<OtherHandle> &inOtherModel) { size_t k; - for (k = 1; k <= u.size() && k <= inOtherModel.u.size(); k ++) { - u[k-1] += inOtherModel.u[k-1]; + for (k = 0; k < u.size() && k < inOtherModel.u.size(); k ++) { + u[k] += inOtherModel.u[k]; } return *this; } template<class OtherHandle> - MLPModel &operator=(const MLPModel<OtherHandle> &inOtherModel) { + MLPModel& operator=(const MLPModel<OtherHandle> &inOtherModel) { size_t k; - for (k = 1; k <= u.size() && k <= inOtherModel.u.size(); k ++) { - u[k-1] = inOtherModel.u[k-1]; + for (k = 0; k < u.size() && k < inOtherModel.u.size(); k ++) { + u[k] = inOtherModel.u[k]; } is_classification = inOtherModel.is_classification; activation = inOtherModel.activation; http://git-wip-us.apache.org/repos/asf/madlib/blob/657cf4aa/src/modules/convex/type/state.hpp ---------------------------------------------------------------------- diff --git a/src/modules/convex/type/state.hpp b/src/modules/convex/type/state.hpp index 2cb2643..f846e8f 100644 --- a/src/modules/convex/type/state.hpp +++ b/src/modules/convex/type/state.hpp @@ -295,6 +295,85 @@ public: } algo; }; +template <class Handle> +class SVMMinibatchState { + template <class OtherHandle> + friend class SVMMinibatchState; + +public: + SVMMinibatchState(const AnyType &inArray) : mStorage(inArray.getAs<Handle>()) { + rebind(); + } + + /** + * @brief Convert to backend representation + * + * We define this function so that we can use State in the + * argument list and as a return type. + */ + inline operator AnyType() const { + return mStorage; + } + + /** + * @brief Allocating the state. + */ + inline void allocate(const Allocator &inAllocator, uint32_t nFeatures) { + mStorage = inAllocator.allocateArray<double, dbal::AggregateContext, + dbal::DoZero, dbal::ThrowBadAlloc>(arraySize(nFeatures)); + + rebind(); + task.nFeatures = nFeatures; + rebind(); + } + + /** + * @brief We need to support assigning the previous state + */ + template <class OtherHandle> + SVMMinibatchState &operator=(const SVMMinibatchState<OtherHandle> &inOtherState) { + for (size_t i = 0; i < mStorage.size(); i++) { + mStorage[i] = inOtherState.mStorage[i]; + } + + return *this; + } + + static inline uint32_t arraySize(const uint32_t nFeatures) { + return 8 + nFeatures; + } + +protected: + void rebind() { + task.nFeatures.rebind(&mStorage[0]); + task.stepsize.rebind(&mStorage[1]); + algo.numRows.rebind(&mStorage[2]); + algo.loss.rebind(&mStorage[3]); + task.reg.rebind(&mStorage[4]); + algo.batchSize.rebind(&mStorage[5]); + algo.nEpochs.rebind(&mStorage[6]); + task.model.rebind(&mStorage[8], task.nFeatures); + } + + Handle mStorage; + +public: + struct TaskState { + typename HandleTraits<Handle>::ReferenceToUInt32 nFeatures; + typename HandleTraits<Handle>::ReferenceToDouble stepsize; + typename HandleTraits<Handle>::ReferenceToDouble reg; + typename HandleTraits<Handle>::ColumnVectorTransparentHandleMap model; + } task; + + struct AlgoState { + typename HandleTraits<Handle>::ReferenceToUInt64 numRows; + typename HandleTraits<Handle>::ReferenceToDouble loss; + typename HandleTraits<Handle>::ReferenceToUInt32 batchSize; + typename HandleTraits<Handle>::ReferenceToUInt32 nEpochs; + } algo; +}; + + /** * @brief Inter- (Task State) and intra-iteration (Algo State) state of * Conjugate Gradient for generalized linear models http://git-wip-us.apache.org/repos/asf/madlib/blob/657cf4aa/src/modules/convex/type/tuple.hpp ---------------------------------------------------------------------- diff --git a/src/modules/convex/type/tuple.hpp b/src/modules/convex/type/tuple.hpp index 824ed90..ac070b6 100644 --- a/src/modules/convex/type/tuple.hpp +++ b/src/modules/convex/type/tuple.hpp @@ -61,6 +61,9 @@ using madlib::dbal::eigen_integration::MappedColumnVector; // Generalized Linear Models (GLMs): Logistic regression, Linear SVM typedef ExampleTuple<MappedColumnVector, double> GLMTuple; +typedef ExampleTuple<MappedColumnVector, double> SVMTuple; +typedef ExampleTuple<Matrix, ColumnVector> SVMMiniBatchTuple; + // madlib::modules::convex::MatrixIndex typedef ExampleTuple<MatrixIndex, double> LMFTuple; http://git-wip-us.apache.org/repos/asf/madlib/blob/657cf4aa/src/ports/postgres/modules/svm/svm.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/svm/svm.py_in b/src/ports/postgres/modules/svm/svm.py_in index 4760f36..a57a135 100644 --- a/src/ports/postgres/modules/svm/svm.py_in +++ b/src/ports/postgres/modules/svm/svm.py_in @@ -36,36 +36,60 @@ def _compute_svm(args): """ init_stepsize = args['init_stepsize'] args['stepsize'] = init_stepsize + batch_size = args['batch_size'] + args['dist_func'] = ('internal_linear_svm_igd_distance' if batch_size == 1 + else 'internal_linear_svm_igd_minibatch_distance') iterationCtrl = GroupIterationController(args) with iterationCtrl as it: it.iteration = 0 has_converged = False while not has_converged: - it.update( - """ - {schema_madlib}.linear_svm_igd_step( - ({col_ind_var})::FLOAT8[], - ({col_dep_var_trans})::FLOAT8, - {rel_state}.{col_grp_state}, - {n_features}::INT4, - {stepsize}::FLOAT8, - {lambda}::FLOAT8, - {is_l2}::BOOLEAN, - {col_n_tuples}, - ({select_epsilon})::FLOAT8, - {is_svc}::BOOLEAN, - {class_weight_sql}::FLOAT8 - ) - """) + if batch_size == 1: + it.update( + """ + {schema_madlib}.linear_svm_igd_step( + ({col_ind_var})::FLOAT8[], + ({col_dep_var_trans})::FLOAT8, + {rel_state}.{col_grp_state}, + {n_features}::INT4, + {stepsize}::FLOAT8, + {lambda}::FLOAT8, + {is_l2}::BOOLEAN, + {col_n_tuples}, + ({select_epsilon})::FLOAT8, + {is_svc}::BOOLEAN, + {class_weight_sql}::FLOAT8 + ) + """) + else: + it.update( + """ + {schema_madlib}.linear_svm_igd_minibatch_step( + ({col_ind_var})::FLOAT8[][], + ({col_dep_var_trans})::FLOAT8[], + {rel_state}.{col_grp_state}, + {n_features}::INT4, + {stepsize}::FLOAT8, + {lambda}::FLOAT8, + {is_l2}::BOOLEAN, + {col_n_tuples}, + ({select_epsilon})::FLOAT8, + {is_svc}::BOOLEAN, + {class_weight_sql}::FLOAT8, + {n_epochs}::INTEGER, + {batch_size}::INTEGER + ) + """) it.info() if it.kwargs['decay_factor'] > 0: it.kwargs['stepsize'] *= it.kwargs['decay_factor'] else: it.kwargs['stepsize'] = init_stepsize / (it.iteration + 1) + has_converged = it.test( """ {iteration} >= {max_iter} - OR {schema_madlib}.internal_linear_svm_igd_distance( + OR {schema_madlib}.{dist_func}( _state_previous, _state_current) < {tolerance} """) it.final() @@ -89,9 +113,9 @@ def _verify_table(source_table, model_table, dependent_varname, "('{dependent_varname}') for source_table " "({source_table})!".format(dependent_varname=dependent_varname, source_table=source_table)) - dep_type = get_expr_type(dependent_varname, source_table) - if '[]' in dep_type: - plpy.error("SVM error: dependent_varname cannot be of array type!") + # dep_type = get_expr_type(dependent_varname, source_table) + # if '[]' in dep_type: + # plpy.error("SVM error: dependent_varname cannot be of array type!") # validate output tables output_tbl_valid(model_table, 'SVM') @@ -157,6 +181,8 @@ def _build_output_tables(n_iters_run, args, **kwargs): else: groupby_str, grouping_str1, using_str = "", "", "ON TRUE" # organizing results + result_func = ("internal_linear_svm_igd_result" if args['batch_size'] == 1 + else "internal_linear_svm_igd_minibatch_result") args.update(locals()) model_table_query = """ CREATE TABLE {model_table} AS @@ -173,7 +199,7 @@ def _build_output_tables(n_iters_run, args, **kwargs): FROM ( SELECT - {schema_madlib}.internal_linear_svm_igd_result( + {schema_madlib}.{result_func}( {col_grp_state} ) AS result, {col_grp_key} @@ -952,9 +978,9 @@ def svm(schema_madlib, source_table, model_table, _verify_table(source_table, model_table, dependent_varname, independent_varname) reserved_cols =['coef', 'random_feature_data', - 'random_feature_data', 'loss' - 'num_rows_processed', 'num_rows_skipped', - 'norm_of_gradient', 'num_iterations'] + 'random_feature_data', 'loss' + 'num_rows_processed', 'num_rows_skipped', + 'norm_of_gradient', 'num_iterations'] grouping_str, grouping_col = \ get_grouping_col_str(schema_madlib, 'SVM', reserved_cols, source_table, grouping_col) @@ -1151,7 +1177,15 @@ def _svm_parsed_params(schema_madlib, source_table, model_table, datasets. """ - n_features = num_features(source_table, independent_varname) + # n_features = num_features(source_table, independent_varname) + + upper_dim = 1 if params_dict['batch_size'] == 1 else 2 + n_features = plpy.execute("SELECT array_upper({0}, {2}) AS dim " + "FROM {1} LIMIT 1". + format(independent_varname, + source_table, + upper_dim))[0]['dim'] + if update_source_for_one_class: # This block is run only when the caller is svm_one_class @@ -1204,7 +1238,9 @@ def _svm_parsed_params(schema_madlib, source_table, model_table, args.update(_verify_get_params_dict(params_dict)) args.update(_process_epsilon(is_svc, args)) - args.update(_svc_or_svr(is_svc, source_table, dependent_varname)) + + is_sgd = params_dict['batch_size'] <= 1 + args.update(_svc_or_svr(is_svc, source_table, dependent_varname, is_sgd)) # place holder for compatibility plpy.execute("CREATE TABLE pg_temp.{0} AS SELECT 1".format(args['rel_args'])) @@ -1254,13 +1290,13 @@ def svm_predict(schema_madlib, model_table, new_data_table, id_col_name, input_tbl_valid(new_data_table, 'SVM') reserved_cols =['coef', 'random_feature_data', - 'random_feature_data', 'loss' - 'num_rows_processed', 'num_rows_skipped', - 'norm_of_gradient', 'num_iterations'] + 'random_feature_data', 'loss' + 'num_rows_processed', 'num_rows_skipped', + 'norm_of_gradient', 'num_iterations'] grouping_str, grouping_col = get_grouping_col_str(schema_madlib, - 'SVM', reserved_cols, - new_data_table, - grouping_col) + 'SVM', reserved_cols, + new_data_table, + grouping_col) _assert(is_var_valid(new_data_table, independent_varname), "SVM Error: independent_varname ('" + independent_varname + "') is invalid for new_data_table (" + new_data_table + ")!") @@ -1341,43 +1377,67 @@ def svm_predict(schema_madlib, model_table, new_data_table, id_col_name, # ----------------------------------------------------------------------------- -def _svc_or_svr(is_svc, source_table, dependent_varname): +def _svc_or_svr(is_svc, source_table, dependent_varname, is_sgd): # transform col_dep_var to binary (1`or -1) if classification _args = {'col_dep_var_trans': dependent_varname, 'mapping': 'NULL', 'method': 'SVR'} if is_svc: + if is_sgd: + src = source_table + else: + src = "(SELECT unnest({0}) as {0} FROM {1}) q".format(dependent_varname, source_table) # dependent variable mapping dep_labels = plpy.execute(""" SELECT {dependent_varname} AS y - FROM {source_table} + FROM {src} WHERE ({dependent_varname}) IS NOT NULL GROUP BY ({dependent_varname}) ORDER BY ({dependent_varname}) - """.format(source_table=source_table, + """.format(src=src, dependent_varname=dependent_varname)) - dep_var_mapping = ["'{0}'".format(d['y']) - if isinstance(d['y'], basestring) - else str(d['y']) for d in dep_labels] - - _assert(1 <= len(dep_var_mapping) <= 2, + _assert(1 <= len(dep_labels) <= 2, "SVM Error: Classification currently " "only supports unary or binary output!. Found values {0}". - format(dep_var_mapping)) - - col_dep_var_trans = (""" - CASE WHEN ({col_dep_var}) IS NULL THEN NULL - WHEN ({col_dep_var}) = {mapped_value_for_negative} THEN -1.0 - ELSE 1.0 - END - """.format(col_dep_var=dependent_varname, - mapped_value_for_negative=dep_var_mapping[0])) + format(dep_labels)) + + dep_labels_str = ["'{0}'".format(d['y']) + if isinstance(d['y'], basestring) + else str(d['y']) for d in dep_labels] + + # map the dependent variable labels to -1 and 1 to represent the two + # sides of the hyperplane (only supporting unary/binary for now) + if is_sgd: + col_dep_var_trans = """ + CASE WHEN ({dependent_varname}) IS NULL THEN NULL + WHEN ({dependent_varname}) = {mapped_value_for_negative} THEN -1.0 + ELSE 1.0 + END + """ + else: + # For minibatch, the dependent_varname is an array. So unnest the + # array before mapping to 1/-1. + col_dep_var_trans = """ + ARRAY(SELECT + CASE WHEN ({dependent_varname}) IS NULL THEN NULL + WHEN ({dependent_varname}) = {mapped_value_for_negative} THEN -1.0 + ELSE 1.0 + END + FROM UNNEST({dependent_varname}) as {dependent_varname} + ) + """ + + # col_dep_var_trans is used by the update query in _compute_svm to + # transform dependent variable labels to -1 and 1. + col_dep_var_trans = col_dep_var_trans.format( + dependent_varname=dependent_varname, + mapped_value_for_negative=dep_labels_str[0]) _args.update({ - 'mapped_value_for_negative': dep_var_mapping[0], + 'mapped_value_for_negative': dep_labels_str[0], 'col_dep_var_trans': col_dep_var_trans, - 'mapping': dep_var_mapping[0] + "," + dep_var_mapping[1], + 'mapping': dep_labels_str[0] + "," + dep_labels_str[1], 'method': 'SVC'}) return _args # ----------------------------------------------------------------------------- @@ -1489,7 +1549,9 @@ def _extract_params(schema_madlib, params, module='SVM'): 'validation_result': '', 'epsilon': [0.01], 'eps_table': '', - 'class_weight': ''} + 'class_weight': '', + 'n_epochs': 1, + 'batch_size': 1} params_types = { 'init_stepsize': list, @@ -1502,7 +1564,9 @@ def _extract_params(schema_madlib, params, module='SVM'): 'validation_result': str, 'epsilon': list, 'eps_table': str, - 'class_weight': str} + 'class_weight': str, + 'n_epochs': int, + 'batch_size': int} params_vals = extract_keyvalue_params(params, params_types, params_default) if params_vals['n_folds'] < 0: http://git-wip-us.apache.org/repos/asf/madlib/blob/657cf4aa/src/ports/postgres/modules/svm/svm.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/svm/svm.sql_in b/src/ports/postgres/modules/svm/svm.sql_in index f3948a8..78d7b76 100644 --- a/src/ports/postgres/modules/svm/svm.sql_in +++ b/src/ports/postgres/modules/svm/svm.sql_in @@ -79,7 +79,7 @@ svm_classification( <DD>TEXT. Expression list to evaluate for the independent variables. An intercept variable should not be included as part of this expression. See 'fit_intercept' in the kernel params for info on - intercepts. Please note that expression should be able to be cast + intercepts. Please note that expression should be able to be cast to DOUBLE PRECISION[]. <DT>kernel_func (optional)</DT> @@ -339,7 +339,7 @@ the parameter is ignored. Hyperparameter optimization can be carried out using the built-in cross validation mechanism, which is activated by assigning a value greater than 1 to -the parameter \e n_folds in \e params. +the parameter \e n_folds in \e params. Please note that cross validation is not supported if grouping is used. @@ -442,7 +442,7 @@ while the other k - 1 folds form the training set. </DD> <DT>class_weight</dt> -<DD>Default: 1 for classification, 'balanced' for one-class novelty detection, +<DD>Default: 1 for classification, 'balanced' for one-class novelty detection, n/a for regression. Set the weight for the positive and negative classes. If not given, all classes @@ -495,8 +495,8 @@ table name is already in use, then an error is returned. Table contains:</DD> </tr> <tr> <th>prediction</th> - <td>Provides the prediction for each row in new_data_table. - For regression this would be the same as decision_function. For classification, + <td>Provides the prediction for each row in new_data_table. + For regression this would be the same as decision_function. For classification, this will be one of the dependent variable values.</td> </tr> <tr> @@ -645,9 +645,9 @@ num_rows_skipped | -1 dep_var_mapping | {-1,1} </pre> -# Now let's look at the prediction functions. We want to predict if house price -is less than $100,000. In the following examples we will +is less than $100,000. In the following examples we will use the training data set for prediction as well, which is not usual but serves to -show the syntax. The predicted results are in the \e prediction column and the +show the syntax. The predicted results are in the \e prediction column and the actual data is in the \e target column. For the linear model: <pre class="example"> @@ -657,7 +657,7 @@ SELECT *, price < 100000 AS target FROM houses JOIN houses_pred USING (id) ORDER </pre> Result: <pre class="result"> - id | tax | bedroom | bath | price | size | lot | prediction | decision_function | target + id | tax | bedroom | bath | price | size | lot | prediction | decision_function | target ----+------+---------+------+--------+------+-------+------------+--------------------+-------- 1 | 590 | 2 | 1 | 50000 | 770 | 22100 | t | 104.685894748292 | t 2 | 1050 | 3 | 2 | 85000 | 1410 | 12000 | t | 200.592436923938 | t @@ -683,7 +683,7 @@ SELECT *, price < 100000 AS target FROM houses JOIN houses_pred_gaussian USING ( </pre> This produces a more accurate result than the linear case for this small data set: <pre class="result"> - id | tax | bedroom | bath | price | size | lot | prediction | decision_function | target + id | tax | bedroom | bath | price | size | lot | prediction | decision_function | target ----+------+---------+------+--------+------+-------+------------+-------------------+-------- 1 | 590 | 2 | 1 | 50000 | 770 | 22100 | t | 1.00338548176312 | t 2 | 1050 | 3 | 2 | 85000 | 1410 | 12000 | t | 1.00000000098154 | t @@ -709,7 +709,7 @@ SELECT * FROM houses JOIN houses_regr USING (id) ORDER BY id; </pre> Result for the linear regression model: <pre class="result"> - id | tax | bedroom | bath | price | size | lot | prediction | decision_function + id | tax | bedroom | bath | price | size | lot | prediction | decision_function ----+------+---------+------+--------+------+-------+------------------+------------------- 1 | 590 | 2 | 1 | 50000 | 770 | 22100 | 55288.6992755623 | 55288.6992755623 2 | 1050 | 3 | 2 | 85000 | 1410 | 12000 | 99978.8137019119 | 99978.8137019119 @@ -733,7 +733,7 @@ DROP TABLE IF EXISTS houses_gaussian_regr; SELECT madlib.svm_predict('houses_svm_gaussian_regression', 'houses', 'id', 'houses_gaussian_regr'); SELECT * FROM houses JOIN houses_gaussian_regr USING (id) ORDER BY id; </pre> --# For the novelty detection using one-class, let's create a test data set using +-# For the novelty detection using one-class, let's create a test data set using the last 3 values from the training set plus an outlier at the end (10x price): <pre class="example"> DROP TABLE IF EXISTS houses_one_class_test; @@ -754,7 +754,7 @@ SELECT * FROM houses_one_class_test JOIN houses_one_class_pred USING (id) ORDER </pre> Result showing the last row predicted to be novel: <pre class="result"> - id | tax | bedroom | bath | price | size | lot | prediction | decision_function + id | tax | bedroom | bath | price | size | lot | prediction | decision_function ----+------+---------+------+--------+------+-------+------------+--------------------- 1 | 3100 | 3 | 2 | 140000 | 1760 | 38000 | 1 | 0.111497008121437 2 | 2070 | 2 | 3 | 148000 | 1550 | 14000 | 1 | 0.0996021345169148 @@ -938,6 +938,86 @@ CREATE AGGREGATE MADLIB_SCHEMA.linear_svm_igd_step( INITCOND='{0,0,0,0,0,0,0}' ); +-------------------------------------------------------------------------- +-- create SQL functions for IGD optimizer +-------------------------------------------------------------------------- +-- cannot be labeled as STRICT because we set previous_state NULL initially +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linear_svm_igd_minibatch_transition( + state double precision[], + ind_var double precision[][], + dep_var double precision[], + previous_state double precision[], + dimension integer, + stepsize double precision, + reg double precision, + is_l2 boolean, + n_tuples integer, + epsilon double precision, + is_svc boolean, + tuple_weight double precision, + batch_size integer, + n_epochs integer +) +RETURNS double precision[] AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL'); + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linear_svm_igd_minibatch_merge( + state1 double precision[], + state2 double precision[]) +RETURNS double precision[] AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL'); + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linear_svm_igd_minibatch_final( + state double precision[]) +RETURNS double precision[] AS 'MODULE_PATHNAME' +LANGUAGE C IMMUTABLE STRICT +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL'); + +/** + * @internal + * @brief Perform one iteration of the incremental gradient + * method for computing linear support vector machine + */ +DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.linear_svm_igd_minibatch_step( + /*+ ind_var */ double precision[][], + /*+ dep_var */ double precision[], + /*+ previous_state */ double precision[], + /*+ dimension */ integer, + /*+ stepsize */ double precision, + /*+ reg */ double precision, + /*+ is_l2 */ boolean, + /*+ n_tuples */ integer, + /*+ epsilon */ double precision, + /*+ is_svc */ boolean, + /*+ tuple_weight */ double precision, + /*+ batch_size */ integer, + /*+ n_epochs */ integer +); +CREATE AGGREGATE MADLIB_SCHEMA.linear_svm_igd_minibatch_step( + /*+ ind_var */ double precision[][], + /*+ dep_var */ double precision[], + /*+ previous_state */ double precision[], + /*+ dimension */ integer, + /*+ stepsize */ double precision, + /*+ reg */ double precision, + /*+ is_l2 */ boolean, + /*+ n_tuples */ integer, + /*+ epsilon */ double precision, + /*+ is_svc */ boolean, + /*+ tuple_weight */ double precision, + /*+ batch_size */ integer, + /*+ n_epochs */ integer + ) ( + STYPE=double precision[], + SFUNC=MADLIB_SCHEMA.linear_svm_igd_minibatch_transition, + m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.linear_svm_igd_minibatch_merge,') + FINALFUNC=MADLIB_SCHEMA.linear_svm_igd_minibatch_final, + INITCOND='{0,0,0,0,0,0,0,0,0}' +); + + CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_linear_svm_igd_distance( /*+ state1 */ double precision[], /*+ state2 */ double precision[]) @@ -945,12 +1025,25 @@ RETURNS double precision AS 'MODULE_PATHNAME' LANGUAGE c IMMUTABLE STRICT m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL'); +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_linear_svm_igd_minibatch_distance( + /*+ state1 */ double precision[], + /*+ state2 */ double precision[]) +RETURNS double precision AS 'MODULE_PATHNAME' +LANGUAGE c IMMUTABLE STRICT +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL'); + CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_linear_svm_igd_result( /*+ state */ double precision[]) RETURNS MADLIB_SCHEMA.linear_svm_result AS 'MODULE_PATHNAME' LANGUAGE c IMMUTABLE STRICT m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL'); +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_linear_svm_igd_minibatch_result( + /*+ state */ double precision[]) +RETURNS MADLIB_SCHEMA.linear_svm_result AS 'MODULE_PATHNAME' +LANGUAGE c IMMUTABLE STRICT +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL'); + CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression( source_table text, http://git-wip-us.apache.org/repos/asf/madlib/blob/657cf4aa/src/ports/postgres/modules/svm/test/svm.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/svm/test/svm.sql_in b/src/ports/postgres/modules/svm/test/svm.sql_in index 60d280e..d9e9383 100644 --- a/src/ports/postgres/modules/svm/test/svm.sql_in +++ b/src/ports/postgres/modules/svm/test/svm.sql_in @@ -903,3 +903,107 @@ SELECT 'The dimension of the coefficients must be equal to n_components (3)!') FROM m9; */ + + +-- minibatch ----------------------------------------------------------- +drop table if exists svm_minibatch_train; +CREATE TABLE svm_minibatch_train ( + id integer, + x double precision[][], + rings integer[], + sex text[] +); + +COPY svm_minibatch_train (id, x, rings, sex) FROM stdin DELIMITER '|'; +0|{{0.53,0.42,0.17,0.828,0.41,0.208,0.1505},{0.27,0.195,0.06,0.073,0.0285,0.0235,0.03},{0.31,0.23,0.07,0.1245,0.0505,0.0265,0.038},{0.36,0.27,0.085,0.2185,0.1065,0.038,0.062},{0.32,0.24,0.08,0.18,0.08,0.0385,0.055},{0.3,0.22,0.08,0.121,0.0475,0.042,0.035},{0.5,0.39,0.135,0.6595,0.3145,0.1535,0.1565},{0.295,0.215,0.07,0.121,0.047,0.0155,0.0405},{0.375,0.28,0.08,0.226,0.105,0.047,0.065}}|{6,5,6,6,6,5,6,6,6}|{F,M,F,F,F,M,F,F,F} +1|{{0.415,0.31,0.105,0.3595,0.167,0.083,0.0915},{0.35,0.25,0.07,0.1605,0.0715,0.0335,0.046},{0.415,0.33,0.09,0.3595,0.17,0.081,0.09},{0.66,0.475,0.18,1.3695,0.641,0.294,0.335},{0.415,0.31,0.09,0.2815,0.1245,0.0615,0.085},{0.35,0.265,0.09,0.2265,0.0995,0.0575,0.065},{0.215,0.155,0.06,0.0525,0.021,0.0165,0.015},{0.35,0.27,0.075,0.215,0.1,0.036,0.065},{0.255,0.18,0.065,0.079,0.034,0.014,0.025},{0.28,0.22,0.08,0.1315,0.066,0.024,0.03}}|{6,6,6,6,6,6,5,6,5,5}|{F,F,F,F,F,F,M,F,M,M} +2|{{0.27,0.19,0.08,0.081,0.0265,0.0195,0.03},{0.375,0.29,0.095,0.2875,0.123,0.0605,0.08},{0.27,0.2,0.08,0.1205,0.0465,0.028,0.04},{0.235,0.175,0.065,0.0615,0.0205,0.02,0.019},{0.24,0.17,0.05,0.0545,0.0205,0.016,0.0155},{0.34,0.255,0.085,0.204,0.097,0.021,0.05},{0.275,0.22,0.08,0.1365,0.0565,0.0285,0.042},{0.385,0.28,0.09,0.228,0.1025,0.042,0.0655},{0.355,0.27,0.075,0.1775,0.079,0.0315,0.054},{0.27,0.205,0.05,0.084,0.03,0.0185,0.029}}|{6,6,6,6,5,6,6,5,6,6}|{F,F,F,F,M,F,F,M,F,F} +3|{{0.335,0.26,0.085,0.192,0.097,0.03,0.054},{0.26,0.215,0.08,0.099,0.037,0.0255,0.045},{0.315,0.21,0.06,0.125,0.06,0.0375,0.035},{0.585,0.45,0.125,0.874,0.3545,0.2075,0.225},{0.44,0.345,0.13,0.4495,0.209,0.0835,0.134},{0.315,0.23,0.08,0.1375,0.0545,0.031,0.0445},{0.35,0.25,0.07,0.18,0.0655,0.048,0.054},{0.31,0.24,0.105,0.2885,0.118,0.065,0.083},{0.325,0.23,0.09,0.147,0.06,0.034,0.045},{0.28,0.21,0.075,0.1195,0.053,0.0265,0.03}}|{6,5,5,6,6,5,6,6,4,6}|{F,M,M,F,F,M,F,F,F,F} +4|{{0.41,0.31,0.09,0.3335,0.1635,0.061,0.091},{0.595,0.475,0.16,1.1405,0.547,0.231,0.271},{0.385,0.3,0.09,0.247,0.1225,0.044,0.0675},{0.455,0.335,0.105,0.422,0.229,0.0865,0.1},{0.245,0.18,0.065,0.0635,0.0245,0.0135,0.02},{0.155,0.115,0.025,0.024,0.009,0.005,0.0075},{0.28,0.215,0.08,0.132,0.072,0.022,0.033},{0.335,0.25,0.08,0.1695,0.0695,0.044,0.0495},{0.175,0.125,0.05,0.0235,0.008,0.0035,0.008},{0.275,0.205,0.075,0.1105,0.045,0.0285,0.035}}|{6,6,5,6,4,5,5,6,5,6}|{F,F,M,F,F,M,M,F,M,F} +5|{{0.41,0.325,0.1,0.394,0.208,0.0655,0.106},{0.4,0.295,0.095,0.252,0.1105,0.0575,0.066},{0.26,0.2,0.07,0.092,0.037,0.02,0.03},{0.445,0.335,0.11,0.4355,0.2025,0.1095,0.1195},{0.255,0.185,0.07,0.075,0.028,0.018,0.025},{0.385,0.3,0.115,0.3435,0.1645,0.085,0.1025},{0.325,0.27,0.1,0.185,0.08,0.0435,0.065},{0.28,0.205,0.1,0.1165,0.0545,0.0285,0.03},{0.275,0.2,0.065,0.092,0.0385,0.0235,0.027},{0.38,0.275,0.095,0.2505,0.0945,0.0655,0.075}}|{6,6,6,6,6,6,6,5,5,6}|{F,F,F,F,F,F,F,M,M,F} +6|{{0.365,0.255,0.08,0.1985,0.0785,0.0345,0.053},{0.175,0.135,0.04,0.0305,0.011,0.0075,0.01},{0.515,0.375,0.11,0.6065,0.3005,0.131,0.15},{0.23,0.18,0.05,0.064,0.0215,0.0135,0.02},{0.185,0.135,0.04,0.027,0.0105,0.0055,0.009},{0.33,0.24,0.075,0.163,0.0745,0.033,0.048},{0.37,0.265,0.075,0.214,0.09,0.051,0.07},{0.325,0.245,0.07,0.161,0.0755,0.0255,0.045},{0.19,0.13,0.045,0.0265,0.009,0.005,0.009},{0.325,0.245,0.075,0.1495,0.0605,0.033,0.045}}|{5,5,6,5,5,6,6,6,5,5}|{M,M,F,M,M,F,F,F,M,M} +7|{{0.44,0.34,0.105,0.369,0.164,0.08,0.1015},{0.27,0.195,0.08,0.1,0.0385,0.0195,0.03},{0.32,0.235,0.08,0.1485,0.064,0.031,0.045},{0.53,0.41,0.14,0.681,0.3095,0.1415,0.1835},{0.405,0.285,0.09,0.2645,0.1265,0.0505,0.075},{0.45,0.33,0.11,0.3685,0.16,0.0885,0.102},{0.245,0.175,0.055,0.0785,0.04,0.018,0.02},{0.38,0.275,0.095,0.2425,0.106,0.0485,0.21},{0.47,0.36,0.11,0.4965,0.237,0.127,0.13},{0.37,0.27,0.095,0.2175,0.097,0.046,0.065}}|{5,6,6,6,6,6,5,6,6,6}|{M,F,F,F,F,F,M,F,F,F} +8|{{0.35,0.265,0.08,0.192,0.081,0.0465,0.053},{0.28,0.2,0.075,0.1225,0.0545,0.0115,0.035},{0.385,0.3,0.1,0.2725,0.1115,0.057,0.08},{0.335,0.245,0.09,0.1665,0.0595,0.04,0.06},{0.285,0.21,0.07,0.109,0.044,0.0265,0.033},{0.415,0.305,0.1,0.325,0.156,0.0505,0.091},{0.43,0.335,0.105,0.378,0.188,0.0785,0.09},{0.175,0.125,0.04,0.024,0.0095,0.006,0.005},{0.4,0.315,0.085,0.2675,0.116,0.0585,0.0765},{0.375,0.285,0.09,0.2545,0.119,0.0595,0.0675}}|{6,5,6,6,5,6,6,4,6,6}|{F,M,F,F,M,F,F,F,F,F} +9|{{0.365,0.27,0.105,0.2155,0.0915,0.0475,0.063},{0.36,0.27,0.09,0.2075,0.098,0.039,0.062},{0.33,0.23,0.085,0.1695,0.079,0.026,0.0505},{0.34,0.26,0.085,0.1885,0.0815,0.0335,0.06},{0.285,0.215,0.075,0.106,0.0415,0.023,0.035},{0.195,0.145,0.05,0.032,0.01,0.008,0.012},{0.32,0.24,0.07,0.133,0.0585,0.0255,0.041},{0.495,0.4,0.155,0.8085,0.2345,0.1155,0.35},{0.475,0.36,0.12,0.5915,0.3245,0.11,0.127},{0.395,0.27,0.1,0.2985,0.1445,0.061,0.082}}|{6,6,6,6,5,4,6,6,6,5}|{F,F,F,F,M,F,F,F,F,M} +\. + +DROP TABLE IF EXISTS svm_minibatch_test; +CREATE TABLE svm_minibatch_test AS +SELECT id, + ARRAY[round(length::numeric, 4), + round(diameter::numeric, 4), + round(height::numeric, 4), + round(whole::numeric, 4), + round(shucked::numeric, 4), + round(viscera::numeric, 4), + round(shell::numeric, 4)] as x, + rings, + CASE WHEN sex = 'I' then 'F' else sex end as sex +FROM abalone_train_small_tmp; + +------ Regression------- +DROP TABLE IF EXISTS svm_minibatch_reg_out, svm_minibatch_reg_out_summary; +SELECT svm_regression( + 'svm_minibatch_train', + 'svm_minibatch_reg_out', + 'rings', + 'x', + 'linear', + NULL, + NULL, + 'max_iter=10, init_stepsize=0.2, batch_size=3, n_epochs=3' +); + +DROP TABLE IF EXISTS svm_predict_reg_minibatch_out; +SELECT svm_predict('svm_minibatch_reg_out', 'svm_minibatch_test', 'id', 'svm_predict_reg_minibatch_out'); + +SELECT assert(error < 1, 'Training error ' || error || ' with SVM regression minibatch is too high (>1)') +from (SELECT avg((rings-prediction)^2) as error +FROM svm_minibatch_test JOIN svm_predict_reg_minibatch_out +using (id)) q; + + +-- testing for batch_size bigger than datapoints ------------------------------ +-- setting batch_size = 30 with max of 11 rows in svm_minibatch_train +-- this should not error and just treat the whole matrix as a single batch +DROP TABLE IF EXISTS svm_minibatch_reg_out, svm_minibatch_reg_out_summary; +SELECT svm_regression( + 'svm_minibatch_train', + 'svm_minibatch_reg_out', + 'rings', + 'x', + 'linear', + NULL, + NULL, + 'max_iter=10, init_stepsize=0.2, batch_size=30, n_epochs=1' +); + +------ Classification ------- +DROP TABLE IF EXISTS svm_minibatch_classification_out, svm_minibatch_classification_out_summary; +\timing on +SELECT svm_classification( + 'svm_minibatch_train', + 'svm_minibatch_classification_out', + 'sex', + 'x', + 'linear', + NULL, + NULL, + 'max_iter=10, init_stepsize=0.2, batch_size=3, n_epochs=2' +); +\timing off + +DROP TABLE IF EXISTS svm_predict_classification_minibatch_out; +SELECT svm_predict('svm_minibatch_classification_out', + 'svm_minibatch_test', + 'id', + 'svm_predict_classification_minibatch_out'); + +SELECT assert(accuracy >= 0.70, + 'Training accuracy '|| accuracy || + ' with SVM classification minibatch is too low (<0.7)') +FROM (SELECT count(*)/99. as accuracy + FROM svm_minibatch_test JOIN svm_predict_classification_minibatch_out + USING (id) + WHERE sex = prediction + ) q; http://git-wip-us.apache.org/repos/asf/madlib/blob/657cf4aa/src/ports/postgres/modules/utilities/validate_args.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/utilities/validate_args.py_in b/src/ports/postgres/modules/utilities/validate_args.py_in index 2b9c6d7..a2f43fd 100644 --- a/src/ports/postgres/modules/utilities/validate_args.py_in +++ b/src/ports/postgres/modules/utilities/validate_args.py_in @@ -475,9 +475,9 @@ def array_col_dimension(tbl, col): if col is None: plpy.error('Input error: Column name is invalid') dim = plpy.execute(""" - SELECT max(array_upper({col}, 1)) AS dim - FROM {tbl} - """.format(col=col, tbl=tbl))[0]["dim"] + SELECT max(array_upper({col}, 1)) AS dim + FROM {tbl} + """.format(col=col, tbl=tbl))[0]["dim"] return dim # ------------------------------------------------------------------------ @@ -491,15 +491,12 @@ def array_col_has_same_dimension(tbl, col): if col is None or col.lower() == 'null': plpy.error('Input error: Column name is invalid') - max_dim = plpy.execute(""" - SELECT max(array_upper({col}, 1)) AS max_dim - FROM {tbl} - """.format(col=col, tbl=tbl))[0]["max_dim"] - min_dim = plpy.execute(""" - SELECT min(array_upper({col}, 1)) AS min_dim + results = plpy.execute(""" + SELECT min(array_upper({col}, 1)) AS min_dim, + max(array_upper({col}, 1)) AS max_dim FROM {tbl} - """.format(col=col, tbl=tbl))[0]["min_dim"] - return max_dim == min_dim + """.format(col=col, tbl=tbl))[0] + return results['max_dim'] == results['min_dim'] # ------------------------------------------------------------------------