This is an automated email from the git hooks/post-receive script. sebastic-guest pushed a commit to branch upstream-master in repository pktools.
commit 094c17f13074e221f93e7c4f4e560952c79e7f82 Author: Pieter Kempeneers <kempe...@gmail.com> Date: Wed Jan 2 18:18:19 2013 +0100 added feature selection --- ChangeLog | 3 + src/algorithms/FeatureSelector.h | 777 +++++++++++++++++++++++++++++++++++++++ src/algorithms/Makefile.am | 4 +- src/algorithms/Makefile.in | 14 +- src/apps/Makefile.am | 3 +- src/apps/Makefile.in | 27 +- src/apps/pkclassify_svm.cc | 5 +- src/apps/pkfs_svm.cc | 549 +++++++++++++++++++++++++++ 8 files changed, 1362 insertions(+), 20 deletions(-) diff --git a/ChangeLog b/ChangeLog index 3cfd117..6364779 100755 --- a/ChangeLog +++ b/ChangeLog @@ -61,6 +61,9 @@ version 2.4 options min and max are now set with -min (--min) and -max (--max) - pkclassify_svm do not output input file if no input data was defined in verbose mode + update of header information + - pkfs_svm + feature selection tool for svm classification (added) - pkascii2ogr tool to create simple vector files from coordinates in ASCII file (points or polygon) - pksensormodel diff --git a/src/algorithms/FeatureSelector.h b/src/algorithms/FeatureSelector.h new file mode 100644 index 0000000..503dfac --- /dev/null +++ b/src/algorithms/FeatureSelector.h @@ -0,0 +1,777 @@ +/********************************************************************** +FeatureSelector.h: select features, typical use: feature selection for classification +Copyright (C) 2008-2012 Pieter Kempeneers + +This file is part of pktools + +pktools is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +pktools is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with pktools. If not, see <http://www.gnu.org/licenses/>. +***********************************************************************/ +#ifndef _FEATURESELECTOR_H_ +#define _FEATURESELECTOR_H_ + +#include <math.h> +#include <vector> +#include <list> +#include <map> +#include <algorithm> +#include <iostream> +#include <iomanip> +#include "PosValue.h" +#include "Vector2d.h" +#include "gsl/gsl_combination.h" + +using namespace std; + +class FeatureSelector +{ + public: + FeatureSelector(){}; + ~FeatureSelector(){}; + template<class T> double forwardUnivariate(Vector2d<T>& v1, Vector2d<T>& v2, double (*getCost)(const Vector2d<T>&, const Vector2d<T>&, double prior1, double prior2), list<int>& subset, int maxFeatures, double prior1=0.5, double prior2=0.5, bool verbose=false); + template<class T> double forwardUnivariate(vector< Vector2d<T> >& v, double (*getCost)(const vector< Vector2d<T> >&), list<int>& subset, int maxFeatures, bool verbose=false); + template<class T> double forwardUnivariate(map<string, Vector2d<T> > &v, map<string, vector<T> > &y, double (*getCost)(map<string, Vector2d<T> >&, map<string, vector<T> >&), list<int>& subset, int maxFeatures, bool verbose=false); + template<class T> double forward(Vector2d<T>& v1, Vector2d<T>& v2, double (*getCost)(const Vector2d<T>&, const Vector2d<T>&, double prior1, double prior2), list<int>& subset, int maxFeatures, double prior1=0.5, double prior2=0.5, bool verbose=false); + template<class T> double forward(vector< Vector2d<T> >& v, double (*getCost)(const vector< Vector2d<T> >&), list<int>& subset, int maxFeatures, bool verbose=false); + template<class T> double forward(map<string, Vector2d<T> > &v, map<string, vector<T> > &y, double (*getCost)(map<string, Vector2d<T> >&, map<string, vector<T> >&), list<int>& subset, int maxFeatures, bool verbose=false); + template<class T> double backward(vector< Vector2d<T> >& v, double (*getCost)(const vector< Vector2d<T> >&), list<int>& subset, int minFeatures, bool verbose=false); + template<class T> double backward(Vector2d<T>& v1, Vector2d<T>& v2, double (*getCost)(const Vector2d<T>&, const Vector2d<T>&, double prior1, double prior2), list<int>& subset, int minFeatures, double prior1=0.5, double prior2=0.5,bool verbose=false); + template<class T> double backward(map<string, Vector2d<T> > &v, map<string, vector<T> > &y, double (*getCost)(map<string, Vector2d<T> >&, map<string, vector<T> >&), list<int>& subset, int minFeatures, bool verbose=false); + template<class T> double floating(vector< Vector2d<T> >& v, double (*getCost)(const vector< Vector2d<T> >&), list<int>& subset, int maxFeatures=0, bool verbose=false); + template<class T> double floating(Vector2d<T>& v1, Vector2d<T>& v2, double (*getCost)(const Vector2d<T>&, const Vector2d<T>&, double prior1, double prior2), list<int>& subset, int maxFeatures=0, double prior1=0.5, double prior2=0.5,bool verbose=false); + template<class T> double floating(map<string, Vector2d<T> > &v, map<string, vector<T> > &y, double (*getCost)(map<string, Vector2d<T> >&, map<string, vector<T> >&), list<int>& subset, int maxFeatures, bool verbose=false); + template<class T> double bruteForce(Vector2d<T>& v1, Vector2d<T>& v2, double (*getCost)(const Vector2d<T>&, const Vector2d<T>&, double prior1, double prior2), list<int>& subset, int maxFeatures=0, double prior1=0.5, double prior2=0.5, bool verbose=false); + + private: + template<class T> double addFeature(vector< Vector2d<T> >& v, double (*getCost)(const vector< Vector2d<T> >&), list<int>& subset, bool verbose=false); + template<class T> double addFeature(Vector2d<T>& v1, Vector2d<T>& v2, double (*getCost)(const Vector2d<T>&, const Vector2d<T>&, double prior1, double prior2), list<int>& subset, double prior1=0.5, double prior2=0.5, bool verbose=false); + template<class T> double addFeature(map<string, Vector2d<T> > &v, map<string, vector<T> > &y, double (*getCost)(map<string, Vector2d<T> >&, map<string, vector<T> >&), list<int>& subset, bool verbose=false); + template<class T> double removeFeature(vector< Vector2d<T> >& v, double (*getCost)(const vector< Vector2d<T> >&), list<int>& subset, int& r, bool verbose=false); + template<class T> double removeFeature(Vector2d<T>& v1, Vector2d<T>& v2, double (*getCost)(const Vector2d<T>&, const Vector2d<T>&, double prior1, double prior2), list<int>& subset,int& r, double prior1=0.5, double prior2=0.5, bool verbose=false); + template<class T> double removeFeature(map<string, Vector2d<T> > &v, map<string, vector<T> > &y, double (*getCost)(map<string, Vector2d<T> >&, map<string, vector<T> >&), list<int>& subset,int& r, bool verbose=false); +}; + +//sequential forward selection Univariate (N single best features) +template<class T> double FeatureSelector::forwardUnivariate(Vector2d<T>& v1, Vector2d<T>& v2, double (*getCost)(const Vector2d<T>&, const Vector2d<T>&, double prior1, double prior2), list<int>& subset, int maxFeatures, double prior1, double prior2, bool verbose){ + int maxLevels=v1[0].size(); + if(!maxFeatures) + maxFeatures=maxLevels; + int k=subset.size(); + if(k>=maxFeatures) + return -1; + vector<PosValue> cost(maxLevels); + list<int> tmpset=subset;//temporary set of selected features (levels) + Vector2d<T> tmp1(v1.size()); + Vector2d<T> tmp2(v2.size()); + for(int ilevel=0;ilevel<maxLevels;++ilevel){ + if(find(tmpset.begin(),tmpset.end(),ilevel)==tmpset.end()){ + tmpset.push_back(ilevel); + v1.selectCols(tmpset,tmp1); + v2.selectCols(tmpset,tmp2); + try{ + PosValue pv; + pv.position=ilevel; + pv.value=getCost(tmp1,tmp2,prior1,prior2); + cost[ilevel]=pv; + } + catch(...){ + PosValue pv; + pv.position=ilevel; + pv.value=-1; + cost[ilevel]=pv; + } + tmpset.pop_back(); + } + } + sort(cost.begin(),cost.end(),Compare_PosValue());//decreasing order + int ilevel=0; + while((subset.size()<maxFeatures)&&(ilevel<maxLevels)){ + if(cost[ilevel].value>0) + subset.push_back(cost[ilevel].position); + if(verbose) + cout << "feature " << subset.back() << " has cost: " << cost[ilevel].value << endl; + ++ilevel; + } + double maxCost=-1; + while(subset.size()){ + v1.selectCols(subset,tmp1); + v2.selectCols(subset,tmp2); + try{ + maxCost=getCost(tmp1,tmp2,prior1,prior2); + } + catch(...){ + subset.pop_back(); + continue; + } + return maxCost; + } +} + +template<class T> double FeatureSelector::forwardUnivariate(vector< Vector2d<T> >& v, double (*getCost)(const vector< Vector2d<T> >&), list<int>& subset, int maxFeatures=0, bool verbose){ + int maxLevels=v[0][0].size(); + if(!maxFeatures) + maxFeatures=maxLevels; + int k=subset.size(); + if(k>=maxFeatures) + return -1; + vector<PosValue> cost(maxLevels); + list<int> tmpset=subset;//temporary set of selected features (levels) + vector< Vector2d<T> > tmp(v.size()); + for(int ilevel=0;ilevel<maxLevels;++ilevel){ + if(find(tmpset.begin(),tmpset.end(),ilevel)==tmpset.end()){ + tmpset.push_back(ilevel); + for(int iclass=0;iclass<v.size();++iclass){ +// tmp[iclass].resize(v[iclass].size()); + v[iclass].selectCols(tmpset,tmp[iclass]); + } + try{ + PosValue pv; + pv.position=ilevel; + pv.value=getCost(tmp); + cost[ilevel]=pv; + } + catch(...){ + PosValue pv; + pv.position=ilevel; + pv.value=-1; + cost[ilevel]=pv; + } + tmpset.pop_back(); + } + } + sort(cost.begin(),cost.end(),Compare_PosValue());//decreasing order + int ilevel=0; + while((subset.size()<maxFeatures)&&(ilevel<maxLevels)){ + if(cost[ilevel].value>0) + subset.push_back(cost[ilevel].position); + if(verbose) + cout << "feature " << subset.back() << " has cost: " << cost[ilevel].value << endl; + ++ilevel; + } + double maxCost=-1; + while(subset.size()){ + for(int iclass=0;iclass<v.size();++iclass){ +// tmp[iclass].resize(v[iclass].size()); + v[iclass].selectCols(subset,tmp[iclass]); + } + try{ + maxCost=getCost(tmp); + } + catch(...){ + subset.pop_back(); + continue; + } + return maxCost; + } +} + +//sequential forward selection Multivariate (Combination of N best features) +template<class T> double FeatureSelector::forward(Vector2d<T>& v1, Vector2d<T>& v2, double (*getCost)(const Vector2d<T>&, const Vector2d<T>&, double prior1, double prior2), list<int>& subset, int maxFeatures, double prior1, double prior2, bool verbose){ + //Select feature with the best value (get maximal cost for 1 feature) + double maxCost=0; + int maxLevels=v1[0].size(); + if(!maxFeatures) + maxFeatures=maxLevels; + while(subset.size()<maxFeatures){ + maxCost=addFeature(v1,v2,*getCost,subset,verbose); + if(verbose) + cout << "added " << subset.back() << ", " << subset.size() << "/" << maxFeatures << " features selected with cost: " << maxCost << endl; + }//while + return maxCost; +} + + +//sequential forward selection Multivariate (Combination of N best features) +template<class T> double FeatureSelector::forward(vector< Vector2d<T> >& v, double (*getCost)(const vector< Vector2d<T> >&), list<int>& subset, int maxFeatures=0, bool verbose){ + //Select feature with the best value (get maximal cost for 1 feature) + double maxCost=0; + int maxLevels=v[0][0].size(); + if(!maxFeatures) + maxFeatures=maxLevels; + while(subset.size()<maxFeatures){ + maxCost=addFeature(v,*getCost,subset,verbose); + if(verbose) + cout << "added " << subset.back() << ", " << subset.size() << "/" << maxFeatures << " features selected with cost: " << maxCost << endl; + }//while + return maxCost; +} + +template<class T> double FeatureSelector::forward(map<string, Vector2d<T> > &v, map<string, vector<T> > &y, double (*getCost)(map<string, Vector2d<T> >&, map<string, vector<T> >&), list<int>& subset, int maxFeatures, bool verbose) +{ + //Select feature with the best value (get maximal cost for 1 feature) + double maxCost=0; + int maxLevels=(v.begin()->second)[0].size(); + if(!maxFeatures) + maxFeatures=maxLevels; + while(subset.size()<maxFeatures){ + maxCost=addFeature(v,y,*getCost,subset,verbose); + if(verbose) + cout << "added " << subset.back() << ", " << subset.size() << "/" << maxFeatures << " features selected with cost: " << maxCost << endl; + }//while + return maxCost; +} + +//sequential backward selection +template<class T> double FeatureSelector::backward(Vector2d<T>& v1, Vector2d<T>& v2, double (*getCost)(const Vector2d<T>&, const Vector2d<T>&, double prior1, double prior2), list<int>& subset, int minFeatures, double prior1, double prior2, bool verbose){ + //Select features with least effect on cost when removed (obtain minFeatures eventually) + double maxCost=0; + int removedFeature; + if(subset.empty()){ + for(int iFeature=0;iFeature<v1[0].size();++iFeature) + subset.push_back(iFeature); + }//if + if(subset.size()==minFeatures) + maxCost=getCost(v1,v2,prior1,prior2); + while(subset.size()>minFeatures){ + maxCost=removeFeature(v1,v2,*getCost,subset,removedFeature,verbose); + if(verbose) + cout << "removed " << removedFeature << ", " << subset.size() << "/" << minFeatures << " features remain with cost: " << maxCost << endl; + }//while + return maxCost; +} + +//sequential backward selection +template<class T> double FeatureSelector::backward(vector< Vector2d<T> >& v, double (*getCost)(const vector< Vector2d<T> >&), list<int>& subset, int minFeatures, bool verbose){ + //Select features with least effect on cost when removed (obtain minFeatures eventually) + double maxCost=0; + int removedFeature; + if(subset.empty()){ + for(int iFeature=0;iFeature<v[0][0].size();++iFeature) + subset.push_back(iFeature); + }//if + if(subset.size()==minFeatures) + maxCost=getCost(v); + while(subset.size()>minFeatures){ + maxCost=removeFeature(v,*getCost,subset,removedFeature,verbose); + if(verbose) + cout << "removed " << removedFeature << ", " << subset.size() << "/" << minFeatures << " features remain with cost: " << maxCost << endl; + }//while + return maxCost; +} + +template<class T> double FeatureSelector::backward(map<string, Vector2d<T> > &v, map<string, vector<T> > &y, double (*getCost)(map<string, Vector2d<T> >&, map<string, vector<T> >&), list<int>& subset, int minFeatures, bool verbose){ + //Select features with least effect on cost when removed (obtain minFeatures eventually) + double maxCost=0; + int removedFeature; + if(subset.empty()){ + for(int iFeature=0;iFeature<(v.begin()->second)[0].size();++iFeature) + subset.push_back(iFeature); + }//if + if(subset.size()==minFeatures) + maxCost=getCost(v,y); + while(subset.size()>minFeatures){ + maxCost=removeFeature(v,y,*getCost,subset,removedFeature,verbose); + if(verbose) + cout << "removed " << removedFeature << ", " << subset.size() << "/" << minFeatures << " features remain with cost: " << maxCost << endl; + }//while + return maxCost; +} + +//floating search +template<class T> double FeatureSelector::floating(Vector2d<T>& v1, Vector2d<T>& v2, double (*getCost)(const Vector2d<T>&, const Vector2d<T>&, double prior1, double prior2), list<int>& subset, int maxFeatures, double prior1, double prior2, bool verbose){ + vector<T> cost; + int maxLevels=v1[0].size(); + if(maxFeatures<1) + maxFeatures=maxLevels; +// int k=0; + int initialK=subset.size(); + int k=initialK; + int stopK=initialK+1; + if(initialK>=maxFeatures) + return -1; + cost.push_back(-1);//init 0 features as cost -1 + cost.push_back(addFeature(v1,v2,*getCost,subset,verbose)); + ++k; + assert(subset.size()==k); + if(verbose) + cout << "added " << subset.back() << ", " << subset.size() << "/" << maxFeatures << " features selected with cost: " << cost.back() << endl; + while(k<maxFeatures){ + cost.push_back(addFeature(v1,v2,*getCost,subset,verbose)); + ++k; + assert(subset.size()==k); + if(verbose) + cout << "added " << subset.back() << ", " << subset.size() << "/" << maxFeatures << " features selected with cost: " << cost.back() << endl; +// while(k>1){ + while(k>stopK){ + int x_r; + double cost_r=removeFeature(v1,v2,*getCost,subset,x_r,verbose); + if(cost_r>cost[k-1]){ + --k; + assert(subset.size()==k); + cost[k]=cost_r; + cost.pop_back(); + if(verbose) + cout << "removed " << x_r << ", " << subset.size() << "/" << maxFeatures << " features remain with cost: " << cost_r << endl; + continue; + } + else if(cost_r>=0){ + subset.push_back(x_r); + break; + } + else if(verbose) + cout << "could not remove any feature" << endl; + cost.pop_back(); + } + } + return cost.back(); +} + +//floating search +template<class T> double FeatureSelector::floating(vector< Vector2d<T> >& v, double (*getCost)(const vector< Vector2d<T> >&), list<int>& subset, int maxFeatures, bool verbose){ + vector<T> cost; + int maxLevels=v[0][0].size(); + if(maxFeatures<1) + maxFeatures=maxLevels; + int k=subset.size(); + if(k>=maxFeatures) + return -1; + cost.push_back(-1);//init 0 features as cost -1 + cost.push_back(addFeature(v,*getCost,subset,verbose)); + ++k; + if(verbose) + cout << "added " << subset.back() << ", " << cost.size()-1 << "/" << maxFeatures << " features selected with cost: " << cost.back() << endl; + while(k<maxFeatures){ + cost.push_back(addFeature(v,*getCost,subset,verbose)); + ++k; + if(verbose) + cout << "added " << subset.back() << ", " << cost.size()-1 << "/" << maxFeatures << " features selected with cost: " << cost.back() << endl; + while(k>1){ + int x_r; + double cost_r=removeFeature(v,*getCost,subset,x_r,verbose); + if(cost_r>cost[k-1]){ + --k; + cost[k]=cost_r; + cost.pop_back(); + if(verbose) + cout << "removed " << x_r << ", " << cost.size()-1 << "/" << maxFeatures << " features remain with cost: " << cost_r << endl; + continue; + } + else if(cost_r>=0){ + subset.push_back(x_r); + break; + } + else if(verbose) + cout << "could not remove any feature" << endl; + cost.pop_back(); + } + } + return cost.back(); +} + +//floating search +template<class T> double FeatureSelector::floating(map<string, Vector2d<T> > &v, map<string, vector<T> > &y, double (*getCost)(map<string, Vector2d<T> >&, map<string, vector<T> >&), list<int>& subset, int maxFeatures, bool verbose){ + vector<T> cost; + int maxLevels=(v.begin()->second)[0].size(); + if(maxFeatures<1) + maxFeatures=maxLevels; + int k=subset.size(); + if(k>=maxFeatures) + return -1; + cost.push_back(-1);//init 0 features as cost -1 + cost.push_back(addFeature(v,y,*getCost,subset,verbose)); + ++k; + if(verbose) + cout << "added " << subset.back() << ", " << cost.size()-1 << "/" << maxFeatures << " features selected with cost: " << cost.back() << endl; + while(k<maxFeatures){ + cost.push_back(addFeature(v,y,*getCost,subset,verbose)); + ++k; + if(verbose) + cout << "added " << subset.back() << ", " << cost.size()-1 << "/" << maxFeatures << " features selected with cost: " << cost.back() << endl; + while(k>1){ + int x_r; + double cost_r=removeFeature(v,y,*getCost,subset,x_r,verbose); + if(cost_r>cost[k-1]){ + --k; + cost[k]=cost_r; + cost.pop_back(); + if(verbose) + cout << "removed " << x_r << ", " << cost.size()-1 << "/" << maxFeatures << " features remain with cost: " << cost_r << endl; + continue; + } + else if(cost_r>=0){ + subset.push_back(x_r); + break; + } + else if(verbose) + cout << "could not remove any feature" << endl; + cost.pop_back(); + } + } + return cost.back(); +} + +//brute force search (search for all possible combinations and select best) +template<class T> double FeatureSelector::bruteForce(Vector2d<T>& v1, Vector2d<T>& v2, double (*getCost)(const Vector2d<T>&, const Vector2d<T>&, double prior1, double prior2), list<int>& subset, int maxFeatures, double prior1, double prior2, bool verbose){ + int maxLevels=v1[0].size(); + if(maxFeatures<1) + maxFeatures=maxLevels; + int k=subset.size(); + if(k>=maxFeatures) + return -1; +// gslmm::combination c(v1[0].size(),maxFeatures,false); + gsl_combination *c; + c=gsl_combination_calloc(v1[0].size(),maxFeatures); + + list<int> tmpset;//temporary set of selected features (levels) + Vector2d<T> tmp1(v1.size()); + Vector2d<T> tmp2(v2.size()); + list<int> catchset;//restore set in case of catch all the way to last level (no better cost) + //initialize maxCost with actual cost for current subset (-1 if empty subset) + double maxCost=-1; + double cost; + if(subset.size()>=maxLevels) + return maxCost; +// c.first(); + gsl_combination_next(c); + do{ + for(int ifeature=0;ifeature<maxFeatures;++ifeature) + tmpset.push_back(c->data[ifeature]); + v1.selectCols(tmpset,tmp1); + v2.selectCols(tmpset,tmp2); + try{ + cost=getCost(tmp1,tmp2,prior1,prior2); + } + catch(...){ //singular matrix encountered + catchset=tmpset;//this tmpset resulted in failure of getCost + if(verbose){ + cout << "Could not get cost from set: " << endl; + gsl_combination_fprintf(stdout,c," %u"); + printf("\n"); +// c.print(stdout); + } + tmpset.clear(); + continue; + } + if(maxCost<cost){ //set with better cost is found + maxCost=cost; + subset=tmpset; + } + tmpset.clear(); + }while(gsl_combination_next(c)==GSL_SUCCESS); + gsl_combination_free(c); +// }while(c.next()); + if(maxCost<0) //no level added to better maxCost than current subset (catchset) + subset=catchset; + //test: assert list contains no duplicate elements + for(list<int>::iterator lit=subset.begin();lit!=subset.end();++lit){ + list<int>::iterator lit2=lit;//start searching from next element + assert(find(++lit2,subset.end(),*lit)==subset.end()); + } + return maxCost; +} + +template<class T> double FeatureSelector::addFeature(Vector2d<T>& v1, Vector2d<T>& v2, double (*getCost)(const Vector2d<T>&, const Vector2d<T>&, double prior1, double prior2), list<int>& subset, double prior1, double prior2, bool verbose){ + //select feature with the best value (get maximal cost for 1 feature) + list<int> tmpset=subset;//temporary set of selected features (levels) + list<int> catchset;//restore set in case of catch all the way to last level (no better cost) + //initialize maxCost with actual cost for current subset (-1 if empty subset) + double maxCost=-1; +// if(subset.size()){ +// try{ +// v1.selectCols(subset,tmp1); +// v2.selectCols(subset,tmp2); +// maxCost=getCost(tmp1,tmp2); +// } +// catch(double determinant){ +// return -1; +// } +// } + double cost; + int maxLevels=v1[0].size(); + if(subset.size()>=maxLevels) + return maxCost; + for(int ilevel=0;ilevel<maxLevels;++ilevel){ + Vector2d<T> tmp1(v1.size()); + Vector2d<T> tmp2(v2.size()); + if(find(tmpset.begin(),tmpset.end(),ilevel)!=tmpset.end()) + continue; + tmpset.push_back(ilevel); + v1.selectCols(tmpset,tmp1); + v2.selectCols(tmpset,tmp2); + try{ + cost=getCost(tmp1,tmp2,prior1,prior2); + } + catch(...){ + catchset=tmpset;//this tmpset resulted in singular matrix + if(verbose) + cout << "Could not add feature " << tmpset.back() << endl; + tmpset.pop_back(); + continue; + } + if(maxCost<cost){ //level with better cost is found + maxCost=cost; + subset=tmpset; + } + tmpset.pop_back(); + } + if(maxCost<0) //no level added to better maxCost than current subset (catchset) + subset=catchset; + //test: assert list contains no duplicate elements + for(list<int>::iterator lit=subset.begin();lit!=subset.end();++lit){ + list<int>::iterator lit2=lit;//start searching from next element + assert(find(++lit2,subset.end(),*lit)==subset.end()); + } + return maxCost; +} + +template<class T> double FeatureSelector::addFeature(vector< Vector2d<T> >& v, double (*getCost)(const vector< Vector2d<T> >&), list<int>& subset, bool verbose){ + //select feature with the best value (get maximal cost for 1 feature) + list<int> tmpset=subset;//temporary set of selected features (levels) + vector< Vector2d<T> > tmp(v.size()); + list<int> catchset;//restore set in case of catch all the way to last level (no better cost) + //initialize maxCost with actual cost for current subset (-1 if empty subset) + double maxCost=-1; + double cost; + int maxLevels=v[0][0].size(); + if(subset.size()>=maxLevels) + return maxCost; + for(int ilevel=0;ilevel<maxLevels;++ilevel){ + if(find(tmpset.begin(),tmpset.end(),ilevel)!=tmpset.end()) + continue; + tmpset.push_back(ilevel); + for(int iclass=0;iclass<v.size();++iclass){ +// tmp[iclass].resize(v[iclass].size()); + v[iclass].selectCols(tmpset,tmp[iclass]); + } + try{ + cost=getCost(tmp); + } + catch(...){ + catchset=tmpset;//this tmpset resulted in singular matrix + if(verbose) + cout << "Could not add feature " << tmpset.back() << endl; + tmpset.pop_back(); + continue; + } + if(maxCost<cost){ //level with better cost is found + maxCost=cost; + subset=tmpset; + } + tmpset.pop_back(); + } + if(maxCost<0) //no level added to better maxCost than current subset (catchset) + subset=catchset; + //test: assert list contains no duplicate elements + for(list<int>::iterator lit=subset.begin();lit!=subset.end();++lit){ + list<int>::iterator lit2=lit;//start searching from next element + assert(find(++lit2,subset.end(),*lit)==subset.end()); + } + return maxCost; +} + +template<class T> double FeatureSelector::addFeature(map<string, Vector2d<T> > &v, map<string, vector<T> > &y, double (*getCost)(map<string, Vector2d<T> >&, map<string, vector<T> >&), list<int>& subset, bool verbose){ + //select feature with the best value (get maximal cost for 1 feature) + list<int> tmpset=subset;//temporary set of selected features (levels) + map<string, Vector2d<double> > tmp; + list<int> catchset;//restore set in case of catch all the way to last level (no better cost) + //initialize maxCost with actual cost for current subset (-1 if empty subset) + map<string, Vector2d<double> >::const_iterator mit; + double maxCost=-1; + double cost; + int maxLevels=(v.begin()->second)[0].size(); + if(subset.size()>=maxLevels) + return maxCost; + for(int ilevel=0;ilevel<maxLevels;++ilevel){ + if(find(tmpset.begin(),tmpset.end(),ilevel)!=tmpset.end()) + continue; + tmpset.push_back(ilevel); + for(mit=v.begin();mit!=v.end();++mit){ + string theName=mit->first; + v[theName].selectCols(tmpset,tmp[theName]); + } + try{ + cost=getCost(tmp,y); + } + catch(...){ + catchset=tmpset;//this tmpset resulted in singular matrix + if(verbose) + cout << "Could not add feature " << tmpset.back() << endl; + tmpset.pop_back(); + continue; + } + if(maxCost<cost){ //level with better cost is found + maxCost=cost; + subset=tmpset; + } + tmpset.pop_back(); + } + if(maxCost<0) //no level added to better maxCost than current subset (catchset) + subset=catchset; + //test: assert list contains no duplicate elements + for(list<int>::iterator lit=subset.begin();lit!=subset.end();++lit){ + list<int>::iterator lit2=lit;//start searching from next element + assert(find(++lit2,subset.end(),*lit)==subset.end()); + } + return maxCost; +} + +template<class T> double FeatureSelector::removeFeature(Vector2d<T>& v1, Vector2d<T>& v2, double (*getCost)(const Vector2d<T>&, const Vector2d<T>&, double prior1, double prior2), list<int>& subset, int& r, double prior1, double prior2, bool verbose){ + //find the feature that has the least effect on the cost when it is removed from subset + list<int> tmpset=subset;//temporary set of selected features (levels) + Vector2d<T> tmp1(v1.size()); + Vector2d<T> tmp2(v2.size()); + int nFeatures=subset.size(); + list<int> catchset;//restore set in case of catch all the way to last level (no better cost) + //initialize maxCost with actual cost for current subset (-1 if empty subset) + double maxCost=-1; +// if(subset.size()){ +// try{ +// v1.selectCols(subset,tmp1); +// v2.selectCols(subset,tmp2); +// maxCost=getCost(tmp1,tmp2); +// } +// catch(double determinant){ +// return -1; +// } +// } + int last; + double cost; + int maxLevels=v1[0].size(); + if(subset.size()>maxLevels||subset.empty()){ + return maxCost; + } + list<int>::const_iterator it; + for(int i=0;i<nFeatures;++i){ + last=tmpset.back(); + tmpset.pop_back(); + v1.selectCols(tmpset,tmp1); + v2.selectCols(tmpset,tmp2); + try{ + cost=getCost(tmp1,tmp2,prior1,prior2); + } + catch(...){ + catchset=tmpset;//this tmpset resulted in singular matrix + if(verbose) + cout << "Could not remove feature " << last << endl; + tmpset.push_front(last); + continue; + } + if(maxCost<cost){ //level with better cost is found + maxCost=cost; + subset=tmpset; + r=last; + } + tmpset.push_front(last); + } + if(maxCost<0){//all levels removed were catched + subset=catchset; + } + //test: assert list contains no duplicate elements + for(list<int>::iterator lit=subset.begin();lit!=subset.end();++lit){ + list<int>::iterator lit2=lit;//start searching from next element + assert(find(++lit2,subset.end(),*lit)==subset.end()); + } + return maxCost; +} + +template<class T> double FeatureSelector::removeFeature(vector< Vector2d<T> >& v, double (*getCost)(const vector< Vector2d<T> >&), list<int>& subset, int& r, bool verbose){ + //find the feature that has the least effect on the cost when it is removed from subset + list<int> tmpset=subset;//temporary set of selected features (levels) + vector< Vector2d<T> > tmp(v.size()); + int nFeatures=subset.size(); + list<int> catchset;//restore set in case of catch all the way to last level (no better cost) + //initialize maxCost with actual cost for current subset (-1 if empty subset) + double maxCost=-1; + int last; + double cost; + int maxLevels=v[0][0].size(); + if(subset.size()>maxLevels||subset.empty()){ + return maxCost; + } + list<int>::const_iterator it; + for(int i=0;i<nFeatures;++i){ + last=tmpset.back(); + tmpset.pop_back(); + for(int iclass=0;iclass<v.size();++iclass){ +// tmp[iclass].resize(v[iclass].size()); + v[iclass].selectCols(tmpset,tmp[iclass]); + } + try{ + cost=getCost(tmp); + } + catch(...){ + catchset=tmpset;//this tmpset resulted in singular matrix + if(verbose) + cout << "Could not remove feature " << last << endl; + tmpset.push_front(last); + continue; + } + if(maxCost<cost){ //level with better cost is found + maxCost=cost; + subset=tmpset; + r=last; + } + tmpset.push_front(last); + } + if(maxCost<0){//all levels removed were catched + subset=catchset; + } + //test: assert list contains no duplicate elements + for(list<int>::iterator lit=subset.begin();lit!=subset.end();++lit){ + list<int>::iterator lit2=lit;//start searching from next element + assert(find(++lit2,subset.end(),*lit)==subset.end()); + } + return maxCost; +} + +template<class T> double FeatureSelector::removeFeature(map<string, Vector2d<T> > &v, map<string, vector<T> > &y, double (*getCost)(map<string, Vector2d<T> >&, map<string, vector<T> >&), list<int>& subset, int& r, bool verbose){ + //find the feature that has the least effect on the cost when it is removed from subset + list<int> tmpset=subset;//temporary set of selected features (levels) + map<string, Vector2d<double> > tmp; + map<string, Vector2d<double> >::const_iterator mit; + int nFeatures=subset.size(); + list<int> catchset;//restore set in case of catch all the way to last level (no better cost) + //initialize maxCost with actual cost for current subset (-1 if empty subset) + double maxCost=-1; + int last; + double cost; + int maxLevels=(v.begin()->second)[0].size(); + if(subset.size()>maxLevels||subset.empty()){ + return maxCost; + } + list<int>::const_iterator it; + for(int i=0;i<nFeatures;++i){ + last=tmpset.back(); + tmpset.pop_back(); + for(mit=v.begin();mit!=v.end();++mit){ + string theName=mit->first; + v[theName].selectCols(tmpset,tmp[theName]); + } + try{ + cost=getCost(tmp,y); + } + catch(...){ //singular matrix encountered + catchset=tmpset;//this tmpset resulted in singular matrix + if(verbose) + cout << "Could not remove feature " << last << endl; + tmpset.push_front(last); + continue; + } + if(maxCost<cost){ //level with better cost is found + maxCost=cost; + subset=tmpset; + r=last; + } + tmpset.push_front(last); + } + if(maxCost<0){//all levels removed were catched + subset=catchset; + } + //test: assert list contains no duplicate elements + for(list<int>::iterator lit=subset.begin();lit!=subset.end();++lit){ + list<int>::iterator lit2=lit;//start searching from next element + assert(find(++lit2,subset.end(),*lit)==subset.end()); + } + return maxCost; +} +#endif /* _FEATURESELECTOR_H_ */ diff --git a/src/algorithms/Makefile.am b/src/algorithms/Makefile.am index ba7c6e3..9f0bafa 100644 --- a/src/algorithms/Makefile.am +++ b/src/algorithms/Makefile.am @@ -12,7 +12,7 @@ noinst_LIBRARIES = libalgorithms.a libalgorithms_adir = $(includedir)/algorithms # the list of header files that belong to the library (to be installed later) -libalgorithms_a_HEADERS = Egcs.h Filter2d.h Filter.h Histogram.h ConfusionMatrix.h svm.h +libalgorithms_a_HEADERS = Egcs.h Filter2d.h Filter.h Histogram.h ConfusionMatrix.h svm.h FeatureSelector.h if USE_FANN libalgorithms_a_HEADERS += myfann_cpp.h @@ -23,5 +23,5 @@ libalgorithms_a_HEADERS += SensorModel.h endif # the sources to add to the library and to add to the source distribution -libalgorithms_a_SOURCES = $(libalgorithms_a_HEADERS) Egcs.cc Filter2d.cc Filter.cc Histogram.cc ConfusionMatrix.cc svm.cpp +libalgorithms_a_SOURCES = $(libalgorithms_a_HEADERS) Egcs.cc Filter2d.cc Filter.cc Histogram.cc ConfusionMatrix.cc svm.cpp ############################################################################### diff --git a/src/algorithms/Makefile.in b/src/algorithms/Makefile.in index af8c667..a0c6645 100644 --- a/src/algorithms/Makefile.in +++ b/src/algorithms/Makefile.in @@ -53,9 +53,9 @@ ARFLAGS = cru libalgorithms_a_AR = $(AR) $(ARFLAGS) libalgorithms_a_LIBADD = am__libalgorithms_a_SOURCES_DIST = Egcs.h Filter2d.h Filter.h \ - Histogram.h ConfusionMatrix.h svm.h myfann_cpp.h SensorModel.h \ - Egcs.cc Filter2d.cc Filter.cc Histogram.cc ConfusionMatrix.cc \ - svm.cpp + Histogram.h ConfusionMatrix.h svm.h FeatureSelector.h \ + myfann_cpp.h SensorModel.h Egcs.cc Filter2d.cc Filter.cc \ + Histogram.cc ConfusionMatrix.cc svm.cpp am__objects_1 = am__objects_2 = $(am__objects_1) $(am__objects_1) am_libalgorithms_a_OBJECTS = $(am__objects_2) Egcs.$(OBJEXT) \ @@ -78,7 +78,8 @@ LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ SOURCES = $(libalgorithms_a_SOURCES) DIST_SOURCES = $(am__libalgorithms_a_SOURCES_DIST) am__libalgorithms_a_HEADERS_DIST = Egcs.h Filter2d.h Filter.h \ - Histogram.h ConfusionMatrix.h svm.h myfann_cpp.h SensorModel.h + Histogram.h ConfusionMatrix.h svm.h FeatureSelector.h \ + myfann_cpp.h SensorModel.h am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; am__vpath_adj = case $$p in \ $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ @@ -228,10 +229,11 @@ libalgorithms_adir = $(includedir)/algorithms # the list of header files that belong to the library (to be installed later) libalgorithms_a_HEADERS = Egcs.h Filter2d.h Filter.h Histogram.h \ - ConfusionMatrix.h svm.h $(am__append_1) $(am__append_2) + ConfusionMatrix.h svm.h FeatureSelector.h $(am__append_1) \ + $(am__append_2) # the sources to add to the library and to add to the source distribution -libalgorithms_a_SOURCES = $(libalgorithms_a_HEADERS) Egcs.cc Filter2d.cc Filter.cc Histogram.cc ConfusionMatrix.cc svm.cpp +libalgorithms_a_SOURCES = $(libalgorithms_a_HEADERS) Egcs.cc Filter2d.cc Filter.cc Histogram.cc ConfusionMatrix.cc svm.cpp all: all-am .SUFFIXES: diff --git a/src/apps/Makefile.am b/src/apps/Makefile.am index eb3120e..14d8f95 100644 --- a/src/apps/Makefile.am +++ b/src/apps/Makefile.am @@ -6,7 +6,7 @@ LDADD = $(GDAL_LDFLAGS) $(top_builddir)/src/algorithms/libalgorithms.a $(top_bui ############################################################################### # the program to build (the names of the final binaries) -bin_PROGRAMS = pkinfo pkcrop pkreclass pkgetmask pksetmask pkcreatect pkdumpimg pkdumpogr pksieve pkstat pkstatogr pkegcs pkextract pkfillnodata pkfilter pkdsm2shadow pkmosaic pkndvi pkpolygonize pkascii2img pkdiff pkclassify_svm pkascii2ogr #pkxcorimg pkgeom +bin_PROGRAMS = pkinfo pkcrop pkreclass pkgetmask pksetmask pkcreatect pkdumpimg pkdumpogr pksieve pkstat pkstatogr pkegcs pkextract pkfillnodata pkfilter pkdsm2shadow pkmosaic pkndvi pkpolygonize pkascii2img pkdiff pkclassify_svm pkfs_svm pkascii2ogr #pkxcorimg pkgeom if USE_FANN bin_PROGRAMS += pkclassify_nn pkclassify_nn_SOURCES = $(top_srcdir)/src/algorithms/myfann_cpp.h pkclassify_nn.h pkclassify_nn.cc @@ -49,6 +49,7 @@ pkndvi_SOURCES = pkndvi.cc pkpolygonize_SOURCES = pkpolygonize.cc pkdiff_SOURCES = pkdiff.cc pkclassify_svm_SOURCES = $(top_srcdir)/src/algorithms/svm.h $(top_srcdir)/src/algorithms/svm.cpp pkclassify_svm.cc +pkfs_svm_SOURCES = $(top_srcdir)/src/algorithms/svm.h $(top_srcdir)/src/algorithms/svm.cpp pkfs_svm.cc pkascii2img_SOURCES = pkascii2img.cc pkascii2ogr_SOURCES = pkascii2ogr.cc #pkxcorimg_SOURCES= pkxcorimg.cc diff --git a/src/apps/Makefile.in b/src/apps/Makefile.in index 61c9158..184c40a 100644 --- a/src/apps/Makefile.in +++ b/src/apps/Makefile.in @@ -39,8 +39,8 @@ bin_PROGRAMS = pkinfo$(EXEEXT) pkcrop$(EXEEXT) pkreclass$(EXEEXT) \ pkextract$(EXEEXT) pkfillnodata$(EXEEXT) pkfilter$(EXEEXT) \ pkdsm2shadow$(EXEEXT) pkmosaic$(EXEEXT) pkndvi$(EXEEXT) \ pkpolygonize$(EXEEXT) pkascii2img$(EXEEXT) pkdiff$(EXEEXT) \ - pkclassify_svm$(EXEEXT) pkascii2ogr$(EXEEXT) $(am__EXEEXT_1) \ - $(am__EXEEXT_2) $(am__EXEEXT_3) + pkclassify_svm$(EXEEXT) pkfs_svm$(EXEEXT) pkascii2ogr$(EXEEXT) \ + $(am__EXEEXT_1) $(am__EXEEXT_2) $(am__EXEEXT_3) @USE_FANN_TRUE@am__append_1 = pkclassify_nn @USE_LAS_TRUE@am__append_2 = pklas2img @USE_NLOPT_TRUE@am__append_3 = pksensormodel @@ -149,6 +149,12 @@ pkfilter_LDADD = $(LDADD) pkfilter_DEPENDENCIES = $(am__DEPENDENCIES_1) \ $(top_builddir)/src/algorithms/libalgorithms.a \ $(top_builddir)/src/imageclasses/libimageClasses.a +am_pkfs_svm_OBJECTS = svm.$(OBJEXT) pkfs_svm.$(OBJEXT) +pkfs_svm_OBJECTS = $(am_pkfs_svm_OBJECTS) +pkfs_svm_LDADD = $(LDADD) +pkfs_svm_DEPENDENCIES = $(am__DEPENDENCIES_1) \ + $(top_builddir)/src/algorithms/libalgorithms.a \ + $(top_builddir)/src/imageclasses/libimageClasses.a am_pkgetmask_OBJECTS = pkgetmask.$(OBJEXT) pkgetmask_OBJECTS = $(am_pkgetmask_OBJECTS) pkgetmask_LDADD = $(LDADD) @@ -236,18 +242,18 @@ SOURCES = $(pkascii2img_SOURCES) $(pkascii2ogr_SOURCES) \ $(pkdsm2shadow_SOURCES) $(pkdumpimg_SOURCES) \ $(pkdumpogr_SOURCES) $(pkegcs_SOURCES) $(pkextract_SOURCES) \ $(pkfillnodata_SOURCES) $(pkfilter_SOURCES) \ - $(pkgetmask_SOURCES) $(pkinfo_SOURCES) $(pklas2img_SOURCES) \ - $(pkmosaic_SOURCES) $(pkndvi_SOURCES) $(pkpolygonize_SOURCES) \ - $(pkreclass_SOURCES) $(pksensormodel_SOURCES) \ - $(pksetmask_SOURCES) $(pksieve_SOURCES) $(pkstat_SOURCES) \ - $(pkstatogr_SOURCES) + $(pkfs_svm_SOURCES) $(pkgetmask_SOURCES) $(pkinfo_SOURCES) \ + $(pklas2img_SOURCES) $(pkmosaic_SOURCES) $(pkndvi_SOURCES) \ + $(pkpolygonize_SOURCES) $(pkreclass_SOURCES) \ + $(pksensormodel_SOURCES) $(pksetmask_SOURCES) \ + $(pksieve_SOURCES) $(pkstat_SOURCES) $(pkstatogr_SOURCES) DIST_SOURCES = $(pkascii2img_SOURCES) $(pkascii2ogr_SOURCES) \ $(am__pkclassify_nn_SOURCES_DIST) $(pkclassify_svm_SOURCES) \ $(pkcreatect_SOURCES) $(pkcrop_SOURCES) $(pkdiff_SOURCES) \ $(pkdsm2shadow_SOURCES) $(pkdumpimg_SOURCES) \ $(pkdumpogr_SOURCES) $(pkegcs_SOURCES) $(pkextract_SOURCES) \ $(pkfillnodata_SOURCES) $(pkfilter_SOURCES) \ - $(pkgetmask_SOURCES) $(pkinfo_SOURCES) \ + $(pkfs_svm_SOURCES) $(pkgetmask_SOURCES) $(pkinfo_SOURCES) \ $(am__pklas2img_SOURCES_DIST) $(pkmosaic_SOURCES) \ $(pkndvi_SOURCES) $(pkpolygonize_SOURCES) $(pkreclass_SOURCES) \ $(am__pksensormodel_SOURCES_DIST) $(pksetmask_SOURCES) \ @@ -398,6 +404,7 @@ pkndvi_SOURCES = pkndvi.cc pkpolygonize_SOURCES = pkpolygonize.cc pkdiff_SOURCES = pkdiff.cc pkclassify_svm_SOURCES = $(top_srcdir)/src/algorithms/svm.h $(top_srcdir)/src/algorithms/svm.cpp pkclassify_svm.cc +pkfs_svm_SOURCES = $(top_srcdir)/src/algorithms/svm.h $(top_srcdir)/src/algorithms/svm.cpp pkfs_svm.cc pkascii2img_SOURCES = pkascii2img.cc pkascii2ogr_SOURCES = pkascii2ogr.cc all: all-am @@ -513,6 +520,9 @@ pkfillnodata$(EXEEXT): $(pkfillnodata_OBJECTS) $(pkfillnodata_DEPENDENCIES) pkfilter$(EXEEXT): $(pkfilter_OBJECTS) $(pkfilter_DEPENDENCIES) @rm -f pkfilter$(EXEEXT) $(CXXLINK) $(pkfilter_OBJECTS) $(pkfilter_LDADD) $(LIBS) +pkfs_svm$(EXEEXT): $(pkfs_svm_OBJECTS) $(pkfs_svm_DEPENDENCIES) + @rm -f pkfs_svm$(EXEEXT) + $(CXXLINK) $(pkfs_svm_OBJECTS) $(pkfs_svm_LDADD) $(LIBS) pkgetmask$(EXEEXT): $(pkgetmask_OBJECTS) $(pkgetmask_DEPENDENCIES) @rm -f pkgetmask$(EXEEXT) $(CXXLINK) $(pkgetmask_OBJECTS) $(pkgetmask_LDADD) $(LIBS) @@ -570,6 +580,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pkextract.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pkfillnodata.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pkfilter.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pkfs_svm.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pkgetmask.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pkinfo.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pklas2img.Po@am__quote@ diff --git a/src/apps/pkclassify_svm.cc b/src/apps/pkclassify_svm.cc index 43508cb..6d5e911 100644 --- a/src/apps/pkclassify_svm.cc +++ b/src/apps/pkclassify_svm.cc @@ -1,5 +1,5 @@ /********************************************************************** -pkclassify_svm.cc: classify raster image using Artificial Neural Network +pkclassify_svm.cc: classify raster image using Support Vector Machine Copyright (C) 2008-2012 Pieter Kempeneers This file is part of pktools @@ -149,7 +149,7 @@ int main(int argc, char *argv[]) exit(0); } if(help_opt[0]){ - std::cout << "usage: pkclassify_nn -i testimage -o outputimage -t training [OPTIONS]" << std::endl; + std::cout << "usage: pkclassify_svm -i testimage -o outputimage -t training [OPTIONS]" << std::endl; exit(0); } @@ -174,7 +174,6 @@ int main(int argc, char *argv[]) unsigned int totalSamples=0; int nreclass=0; vector<int> vcode;//unique class codes in recode string - // vector<FANN::neural_net> net(nbag);//the neural network vector<struct svm_model*> svm(nbag); vector<struct svm_parameter> param(nbag); diff --git a/src/apps/pkfs_svm.cc b/src/apps/pkfs_svm.cc new file mode 100644 index 0000000..03b56c8 --- /dev/null +++ b/src/apps/pkfs_svm.cc @@ -0,0 +1,549 @@ +/********************************************************************** +pkfs_svm.cc: feature selection for svm classifier +Copyright (C) 2008-2012 Pieter Kempeneers + +This file is part of pktools + +pktools is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +pktools is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with pktools. If not, see <http://www.gnu.org/licenses/>. +***********************************************************************/ +#include <vector> +#include <map> +#include <algorithm> +#include "imageclasses/ImgReaderGdal.h" +#include "imageclasses/ImgWriterGdal.h" +#include "imageclasses/ImgReaderOgr.h" +#include "imageclasses/ImgWriterOgr.h" +#include "base/Optionpk.h" +#include "algorithms/ConfusionMatrix.h" +#include "algorithms/FeatureSelector.h" +#include "algorithms/svm.h" +#include "pkclassify_nn.h" + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#define Malloc(type,n) (type *)malloc((n)*sizeof(type)) + +double getCost(const vector<Vector2d<float> > &trainingFeatures) +{ + //test + std::cout << "debug0" << std::endl; + bool verbose=true; + unsigned short nFeatures=trainingFeatures[0][0].size(); + //todo: set SVM parameters through command line options + struct svm_parameter param; + // param.svm_type = svm_type_opt[0]; + // param.kernel_type = kernel_type_opt[0]; + // param.degree = kernel_degree_opt[0]; + // param.gamma = (gamma_opt[0]>0)? gamma_opt[0] : 1.0/nFeatures; + // param.coef0 = coef0_opt[0]; + // param.nu = nu_opt[0]; + // param.cache_size = cache_opt[0]; + // param.C = ccost_opt[0]; + // param.eps = epsilon_tol_opt[0]; + // param.p = epsilon_loss_opt[0]; + // param.shrinking = (shrinking_opt[0])? 1 : 0; + // param.probability = (prob_est_opt[0])? 1 : 0; + // param.nr_weight = 0;//not used: I use priors and balancing + // param.weight_label = NULL; + // param.weight = NULL; + //todo: make parameter + unsigned int cv=2; + param.svm_type = 0; + param.kernel_type = 2; + param.degree = 3; + param.gamma = 1.0/nFeatures; + param.coef0 = 0; + param.nu = 0.5; + param.cache_size = 100.0; + param.C = 1; + param.eps = 0.001; + param.p = 0.1; + param.shrinking = 0; + param.probability = 0; + param.nr_weight = 0;//not used: I use priors and balancing + param.weight_label = NULL; + param.weight = NULL; + struct svm_model* svm; + struct svm_problem prob; + struct svm_node* x_space; + unsigned int ntraining=0; + unsigned short nclass=trainingFeatures.size(); + + //test + std::cout << "debug1" << std::endl; + + for(int iclass=0;iclass<nclass;++iclass) + ntraining+=trainingFeatures[iclass].size(); + + prob.l=ntraining; + prob.y = Malloc(double,prob.l); + prob.x = Malloc(struct svm_node *,prob.l); + //test + std::cout << "debug2" << std::endl; + x_space = Malloc(struct svm_node,(nFeatures+1)*ntraining); + unsigned long int spaceIndex=0; + int lIndex=0; + for(int iclass=0;iclass<nclass;++iclass){ + for(int isample=0;isample<trainingFeatures[iclass].size();++isample){ + //test + std::cout << "..." << iclass << std::endl; + prob.x[lIndex]=&(x_space[spaceIndex]); + for(int ifeature=0;ifeature<nFeatures;++ifeature){ + x_space[spaceIndex].index=ifeature+1; + x_space[spaceIndex].value=trainingFeatures[iclass][isample][ifeature]; + //test + std::cout << " " << x_space[spaceIndex].index << ":" << x_space[spaceIndex].value; + ++spaceIndex; + } + x_space[spaceIndex++].index=-1; + prob.y[lIndex]=iclass; + ++lIndex; + } + } + //test + std::cout << "debug3" << std::endl; + + assert(lIndex==prob.l); + if(verbose) + std::cout << "checking parameters" << std::endl; + svm_check_parameter(&prob,¶m); + if(verbose) + std::cout << "parameters ok, training" << std::endl; + svm=svm_train(&prob,¶m); + if(verbose) + std::cout << "SVM is now trained" << std::endl; + + std::cout << "Confusion matrix" << std::endl; + ConfusionMatrix cm(nclass); + double *target = Malloc(double,prob.l); + svm_cross_validation(&prob,¶m,cv,target); + assert(param.svm_type != EPSILON_SVR&¶m.svm_type != NU_SVR);//only for regression + int total_correct=0; + for(int i=0;i<prob.l;i++) + cm.incrementResult(cm.getClass(prob.y[i]),cm.getClass(target[i]),1); + assert(cm.nReference()); + // std::cout << "Kappa: " << cm.kappa() << std::endl; + // double se95_oa=0; + // double doa=0; + // doa=cm.oa_pct(&se95_oa); + // std::cout << "Overall Accuracy: " << doa << " (" << se95_oa << ")" << std::endl; + + // *NOTE* Because svm_model contains pointers to svm_problem, you can + // not free the memory used by svm_problem if you are still using the + // svm_model produced by svm_train(). + // however, we will re-train the svm later on after the feature selection + free(target); + free(prob.y); + free(prob.x); + free(x_space); + svm_free_and_destroy_model(&(svm)); + return(cm.kappa()); +} + +int main(int argc, char *argv[]) +{ + map<short,int> reclassMap; + vector<int> vreclass; + vector<double> priors; + + //--------------------------- command line options ------------------------------------ + + std::string versionString="version "; + versionString+=VERSION; + versionString+=", Copyright (C) 2008-2012 Pieter Kempeneers.\n\ + This program comes with ABSOLUTELY NO WARRANTY; for details type use option -h.\n\ + This is free software, and you are welcome to redistribute it\n\ + under certain conditions; use option --license for details."; + Optionpk<bool> version_opt("\0","version",versionString,false); + Optionpk<bool> license_opt("lic","license","show license information",false); + Optionpk<bool> help_opt("h","help","shows this help info",false); + Optionpk<bool> todo_opt("\0","todo","",false); + Optionpk<string> training_opt("t", "training", "training shape file. A single shape file contains all training features (must be set as: B0, B1, B2,...) for all classes (class numbers identified by label option). Use multiple training files for bootstrap aggregation (alternative to the bag and bsize options, where a random subset is taken from a single training file)"); + Optionpk<string> label_opt("\0", "label", "identifier for class label in training shape file.","label"); + Optionpk<unsigned short> maxFeatures_opt("n", "nf", "number of features to select (0 to select all)", 0); + Optionpk<unsigned short> reclass_opt("\0", "rc", "reclass code (e.g. --rc=12 --rc=23 to reclass first two classes to 12 and 23 resp.).", 0); + Optionpk<unsigned int> balance_opt("\0", "balance", "balance the input data to this number of samples for each class", 0); + Optionpk<int> minSize_opt("m", "min", "if number of training pixels is less then min, do not take this class into account", 0); + Optionpk<double> start_opt("s", "start", "start band sequence number (set to 0)",0); + Optionpk<double> end_opt("e", "end", "end band sequence number (set to 0 for all bands)", 0); + Optionpk<double> offset_opt("\0", "offset", "offset value for each spectral band input features: refl[band]=(DN[band]-offset[band])/scale[band]", 0.0); + Optionpk<double> scale_opt("\0", "scale", "scale value for each spectral band input features: refl=(DN[band]-offset[band])/scale[band] (use 0 if scale min and max in each band to -1.0 and 1.0)", 0.0); + Optionpk<unsigned short> aggreg_opt("a", "aggreg", "how to combine aggregated classifiers, see also rc option (0: no aggregation, 1: sum rule, 2: max rule).",0); + Optionpk<double> priors_opt("p", "prior", "prior probabilities for each class (e.g., -p 0.3 -p 0.3 -p 0.2 )", 0.0); + Optionpk<unsigned short> svm_type_opt("svmt", "svmtype", "type of SVM (0: C-SVC, 1: nu-SVC, 2: one-class SVM, 3: epsilon-SVR, 4: nu-SVR)",0); + Optionpk<unsigned short> kernel_type_opt("kt", "kerneltype", "type of kernel function (0: linear: u'*v, 1: polynomial: (gamma*u'*v + coef0)^degree, 2: radial basis function: exp(-gamma*|u-v|^2), 3: sigmoid: tanh(gamma*u'*v + coef0), 4: precomputed kernel (kernel values in training_set_file)",2); + Optionpk<unsigned short> kernel_degree_opt("kd", "kd", "degree in kernel function",3); + Optionpk<float> gamma_opt("g", "gamma", "gamma in kernel function",0); + Optionpk<float> coef0_opt("c0", "coef0", "coef0 in kernel function",0); + Optionpk<float> ccost_opt("cc", "ccost", "the parameter C of C-SVC, epsilon-SVR, and nu-SVR",1); + Optionpk<float> nu_opt("nu", "nu", "the parameter nu of nu-SVC, one-class SVM, and nu-SVR",0.5); + Optionpk<float> epsilon_loss_opt("eloss", "eloss", "the epsilon in loss function of epsilon-SVR",0.1); + Optionpk<int> cache_opt("cache", "cache", "cache memory size in MB",100); + Optionpk<float> epsilon_tol_opt("etol", "etol", "the tolerance of termination criterion",0.001); + Optionpk<bool> shrinking_opt("shrink", "shrink", "whether to use the shrinking heuristics",false); + Optionpk<bool> prob_est_opt("pe", "probest", "whether to train a SVC or SVR model for probability estimates",false); + // Optionpk<bool> weight_opt("wi", "wi", "set the parameter C of class i to weight*C, for C-SVC",true); + Optionpk<unsigned int> cv_opt("cv", "cv", "n-fold cross validation mode",2); + Optionpk<short> verbose_opt("v", "verbose", "set to: 0 (results only), 1 (confusion matrix), 2 (debug)",0); + + version_opt.retrieveOption(argc,argv); + license_opt.retrieveOption(argc,argv); + help_opt.retrieveOption(argc,argv); + todo_opt.retrieveOption(argc,argv); + training_opt.retrieveOption(argc,argv); + maxFeatures_opt.retrieveOption(argc,argv); + label_opt.retrieveOption(argc,argv); + reclass_opt.retrieveOption(argc,argv); + balance_opt.retrieveOption(argc,argv); + minSize_opt.retrieveOption(argc,argv); + start_opt.retrieveOption(argc,argv); + end_opt.retrieveOption(argc,argv); + offset_opt.retrieveOption(argc,argv); + scale_opt.retrieveOption(argc,argv); + aggreg_opt.retrieveOption(argc,argv); + priors_opt.retrieveOption(argc,argv); + svm_type_opt.retrieveOption(argc,argv); + kernel_type_opt.retrieveOption(argc,argv); + kernel_degree_opt.retrieveOption(argc,argv); + gamma_opt.retrieveOption(argc,argv); + coef0_opt.retrieveOption(argc,argv); + ccost_opt.retrieveOption(argc,argv); + nu_opt.retrieveOption(argc,argv); + epsilon_loss_opt.retrieveOption(argc,argv); + cache_opt.retrieveOption(argc,argv); + epsilon_tol_opt.retrieveOption(argc,argv); + shrinking_opt.retrieveOption(argc,argv); + prob_est_opt.retrieveOption(argc,argv); + cv_opt.retrieveOption(argc,argv); + verbose_opt.retrieveOption(argc,argv); + + if(version_opt[0]||todo_opt[0]){ + std::cout << version_opt.getHelp() << std::endl; + std::cout << "todo: " << todo_opt.getHelp() << std::endl; + exit(0); + } + if(license_opt[0]){ + std::cout << Optionpk<bool>::getGPLv3License() << std::endl; + exit(0); + } + if(help_opt[0]){ + std::cout << "usage: pkfs_svm -t training [OPTIONS]" << std::endl; + exit(0); + } + + assert(training_opt[0].size()); + if(verbose_opt[0]>=1) + std::cout << "training shape file: " << training_opt[0] << std::endl; + + unsigned int totalSamples=0; + int nreclass=0; + vector<int> vcode;//unique class codes in recode string + + unsigned short nclass=0; + int nband=0; + int startBand=2;//first two bands represent X and Y pos + + vector<double> offset; + vector<double> scale; + vector< Vector2d<float> > trainingPixels;//[class][sample][band] + + if(reclass_opt.size()>1){ + vreclass.resize(reclass_opt.size()); + for(int iclass=0;iclass<reclass_opt.size();++iclass){ + reclassMap[iclass]=reclass_opt[iclass]; + vreclass[iclass]=reclass_opt[iclass]; + } + } + if(priors_opt.size()>1){//priors from argument list + priors.resize(priors_opt.size()); + double normPrior=0; + for(int iclass=0;iclass<priors_opt.size();++iclass){ + priors[iclass]=priors_opt[iclass]; + normPrior+=priors[iclass]; + } + //normalize + for(int iclass=0;iclass<priors_opt.size();++iclass) + priors[iclass]/=normPrior; + } + + + //----------------------------------- Training ------------------------------- + struct svm_problem prob; + vector<string> fields; + //organize training data + trainingPixels.clear(); + map<int,Vector2d<float> > trainingMap; + if(verbose_opt[0]>=1) + std::cout << "reading imageShape file " << training_opt[0] << std::endl; + try{ + totalSamples=readDataImageShape(training_opt[0],trainingMap,fields,start_opt[0],end_opt[0],label_opt[0],verbose_opt[0]); + if(trainingMap.size()<2){ + string errorstring="Error: could not read at least two classes from training file"; + throw(errorstring); + } + } + catch(string error){ + cerr << error << std::endl; + exit(1); + } + catch(...){ + cerr << "error catched" << std::endl; + exit(1); + } + //delete class 0 + if(verbose_opt[0]>=1) + std::cout << "erasing class 0 from training set (" << trainingMap[0].size() << " from " << totalSamples << ") samples" << std::endl; + totalSamples-=trainingMap[0].size(); + trainingMap.erase(0); + //convert map to vector + short iclass=0; + if(reclass_opt.size()==1){//no reclass option, read classes from shape + reclassMap.clear(); + vreclass.clear(); + } + if(verbose_opt[0]>1) + std::cout << "training pixels: " << std::endl; + map<int,Vector2d<float> >::iterator mapit=trainingMap.begin(); + while(mapit!=trainingMap.end()){ + // for(map<int,Vector2d<float> >::const_iterator mapit=trainingMap.begin();mapit!=trainingMap.end();++mapit){ + //delete small classes + if((mapit->second).size()<minSize_opt[0]){ + trainingMap.erase(mapit); + continue; + //todo: beware of reclass option: delete this reclass if no samples are left in this classes!! + } + if(reclass_opt.size()==1){//no reclass option, read classes from shape + reclassMap[iclass]=(mapit->first); + vreclass.push_back(mapit->first); + } + trainingPixels.push_back(mapit->second); + if(verbose_opt[0]>1) + std::cout << mapit->first << ": " << (mapit->second).size() << " samples" << std::endl; + ++iclass; + ++mapit; + } + nclass=trainingPixels.size(); + nband=trainingPixels[0][0].size()-2;//X and Y//trainingPixels[0][0].size(); + assert(reclassMap.size()==nclass); + + //do not remove outliers here: could easily be obtained through ogr2ogr -where 'B2<110' output.shp input.shp + //balance training data + if(balance_opt[0]>0){ + if(random) + srand(time(NULL)); + totalSamples=0; + for(int iclass=0;iclass<nclass;++iclass){ + if(trainingPixels[iclass].size()>balance_opt[0]){ + while(trainingPixels[iclass].size()>balance_opt[0]){ + int index=rand()%trainingPixels[iclass].size(); + trainingPixels[iclass].erase(trainingPixels[iclass].begin()+index); + } + } + else{ + int oldsize=trainingPixels[iclass].size(); + for(int isample=trainingPixels[iclass].size();isample<balance_opt[0];++isample){ + int index = rand()%oldsize; + trainingPixels[iclass].push_back(trainingPixels[iclass][index]); + } + } + totalSamples+=trainingPixels[iclass].size(); + } + assert(totalSamples==nclass*balance_opt[0]); + } + + //set scale and offset + offset.resize(nband); + scale.resize(nband); + if(offset_opt.size()>1) + assert(offset_opt.size()==nband); + if(scale_opt.size()>1) + assert(scale_opt.size()==nband); + Histogram hist; + for(int iband=0;iband<nband;++iband){ + if(verbose_opt[0]>=1) + std::cout << "scaling for band" << iband << std::endl; + offset[iband]=(offset_opt.size()==1)?offset_opt[0]:offset_opt[iband]; + scale[iband]=(scale_opt.size()==1)?scale_opt[0]:scale_opt[iband]; + //search for min and maximum + if(scale[iband]<=0){ + float theMin=trainingPixels[0][0][iband+startBand]; + float theMax=trainingPixels[0][0][iband+startBand]; + for(int iclass=0;iclass<nclass;++iclass){ + for(int isample=0;isample<trainingPixels[iclass].size();++isample){ + if(theMin>trainingPixels[iclass][isample][iband+startBand]) + theMin=trainingPixels[iclass][isample][iband+startBand]; + if(theMax<trainingPixels[iclass][isample][iband+startBand]) + theMax=trainingPixels[iclass][isample][iband+startBand]; + } + } + offset[iband]=theMin+(theMax-theMin)/2.0; + scale[iband]=(theMax-theMin)/2.0; + if(verbose_opt[0]>=1){ + std::cout << "Extreme image values for band " << iband << ": [" << theMin << "," << theMax << "]" << std::endl; + std::cout << "Using offset, scale: " << offset[iband] << ", " << scale[iband] << std::endl; + std::cout << "scaled values for band " << iband << ": [" << (theMin-offset[iband])/scale[iband] << "," << (theMax-offset[iband])/scale[iband] << "]" << std::endl; + } + } + } + + //recode vreclass to ordered vector, starting from 0 to nreclass + vcode.clear(); + if(verbose_opt[0]>=1){ + std::cout << "before recoding: " << std::endl; + for(int iclass = 0; iclass < vreclass.size(); iclass++) + std::cout << " " << vreclass[iclass]; + std::cout << std::endl; + } + vector<int> vord=vreclass;//ordered vector, starting from 0 to nreclass + map<short,int> mreclass; + for(int ic=0;ic<vreclass.size();++ic){ + if(mreclass.find(vreclass[ic])==mreclass.end()) + mreclass[vreclass[ic]]=iclass++; + } + for(int ic=0;ic<vreclass.size();++ic) + vord[ic]=mreclass[vreclass[ic]]; + //construct uniqe class codes + while(!vreclass.empty()){ + vcode.push_back(*(vreclass.begin())); + //delete all these entries from vreclass + vector<int>::iterator vit; + while((vit=find(vreclass.begin(),vreclass.end(),vcode.back()))!=vreclass.end()) + vreclass.erase(vit); + } + if(verbose_opt[0]>=1){ + std::cout << "recode values: " << std::endl; + for(int icode=0;icode<vcode.size();++icode) + std::cout << vcode[icode] << " "; + std::cout << std::endl; + } + vreclass=vord; + if(verbose_opt[0]>=1){ + std::cout << "after recoding: " << std::endl; + for(int iclass = 0; iclass < vord.size(); iclass++) + std::cout << " " << vord[iclass]; + std::cout << std::endl; + } + + vector<int> vuniqueclass=vreclass; + //remove duplicate elements from vuniqueclass + sort( vuniqueclass.begin(), vuniqueclass.end() ); + vuniqueclass.erase( unique( vuniqueclass.begin(), vuniqueclass.end() ), vuniqueclass.end() ); + nreclass=vuniqueclass.size(); + if(verbose_opt[0]>=1){ + std::cout << "unique classes: " << std::endl; + for(int iclass = 0; iclass < vuniqueclass.size(); iclass++) + std::cout << " " << vuniqueclass[iclass]; + std::cout << std::endl; + std::cout << "number of reclasses: " << nreclass << std::endl; + } + + if(priors_opt.size()==1){//default: equal priors for each class + priors.resize(nclass); + for(int iclass=0;iclass<nclass;++iclass) + priors[iclass]=1.0/nclass; + } + assert(priors_opt.size()==1||priors_opt.size()==nclass); + + if(verbose_opt[0]>=1){ + std::cout << "number of bands: " << nband << std::endl; + std::cout << "number of classes: " << nclass << std::endl; + std::cout << "priors:"; + for(int iclass=0;iclass<nclass;++iclass) + std::cout << " " << priors[iclass]; + std::cout << std::endl; + } + + //Calculate features of trainig set + vector< Vector2d<float> > trainingFeatures(nclass); + for(int iclass=0;iclass<nclass;++iclass){ + int nctraining=0; + if(verbose_opt[0]>=1) + std::cout << "calculating features for class " << iclass << std::endl; + if(random) + srand(time(NULL)); + nctraining=trainingPixels[iclass].size();//bagSize_opt[0] given in % of training size + if(verbose_opt[0]>=1) + std::cout << "nctraining class " << iclass << ": " << nctraining << std::endl; + int index=0; + + trainingFeatures[iclass].resize(nctraining); + for(int isample=0;isample<nctraining;++isample){ + //scale pixel values according to scale and offset!!! + for(int iband=0;iband<nband;++iband){ + assert(trainingPixels[iclass].size()>isample); + assert(trainingPixels[iclass][isample].size()>iband+startBand); + assert(offset.size()>iband); + assert(scale.size()>iband); + float value=trainingPixels[iclass][isample][iband+startBand]; + trainingFeatures[iclass][isample].push_back((value-offset[iband])/scale[iband]); + } + } + assert(trainingFeatures[iclass].size()==nctraining); + } + + unsigned int ntraining=0; + for(int iclass=0;iclass<nclass;++iclass){ + std::cout << "..." << iclass << "..." << std::endl; + if(verbose_opt[0]>=1) + std::cout << "training sample size for class " << vcode[iclass] << ": " << trainingFeatures[iclass].size() << std::endl; + ntraining+=trainingFeatures[iclass].size(); + } + + int nFeatures=trainingFeatures[0][0].size(); + int maxFeatures=(maxFeatures_opt[0])? maxFeatures_opt[0] : nFeatures; + list<int> subset;//set of selected features (levels) for each class combination + double cost=0; + FeatureSelector selector; + if(maxFeatures==nFeatures){ + subset.clear(); + for(int ifeature=0;ifeature<nFeatures;++ifeature) + subset.push_back(ifeature); + // cost=getCost(trainingFeatures[iclass1],trainingFeatures[iclass2]); + } + else{ + try{ + //test + std::cout << "debug3" << std::endl; + cost=selector.floating(trainingFeatures,&getCost,subset,maxFeatures,(verbose_opt[0]>0) ? true:false); + } + catch(...){ + std::cout << "catched feature selection" << std::endl; + exit(1); + } + } + if(verbose_opt[0]){ + cout <<"cost: " << cost << endl; + cout << "levels: "; + for(list<int>::const_iterator lit=subset.begin();lit!=subset.end();++lit){ + cout << *lit; + if((*(lit))!=subset.back()) + cout <<","; + else + cout << endl; + } + } + + // *NOTE* Because svm_model contains pointers to svm_problem, you can + // not free the memory used by svm_problem if you are still using the + // svm_model produced by svm_train(). + + // free(prob.y); + // free(prob.x); + // free(x_space); + // svm_destroy_param(¶m); + return 0; +} + -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-grass/pktools.git _______________________________________________ Pkg-grass-devel mailing list Pkg-grass-devel@lists.alioth.debian.org http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/pkg-grass-devel