This is an automated email from the git hooks/post-receive script. tille pushed a commit to branch master in repository proteinortho.
commit a6e05f7c0c82073170f52b8322ca933e2c160488 Author: Andreas Tille <[email protected]> Date: Wed May 4 14:50:54 2016 +0200 Imported Upstream version 5.13+dfsg --- Makefile | 17 +++--- manual.html | 6 +- po2tree.pl | 4 +- po_tree.c | 8 +-- proteinortho5.pl | 4 +- proteinortho5_clean_edges.cpp | 136 ------------------------------------------ proteinortho5_clustering.cpp | 80 ++++++++++++++++++++++--- 7 files changed, 90 insertions(+), 165 deletions(-) diff --git a/Makefile b/Makefile index 3c8b861..648850e 100644 --- a/Makefile +++ b/Makefile @@ -1,21 +1,18 @@ INSTALLDIR=/usr/local/bin -CPP = g++ -CPPFLAGS = -Wall -O3 +CPP = g++ +CPPFLAGS += -Wall -O3 -Wno-unused-result -all: proteinortho5_clustering proteinortho5_clean_edges proteinortho5_tree +all: proteinortho5_clustering proteinortho5_tree proteinortho5_clustering: proteinortho5_clustering.cpp - $(CPP) $(CPPFLAGS) -o $@ $< + $(CPP) $(CPPFLAGS) $(LDFLAGS) -o $@ $< -proteinortho5_tree: proteinortho5_clustering.cpp - $(CPP) $(CPPFLAGS) -o $@ $< +proteinortho5_tree: po_tree.c + $(CPP) $(CPPFLAGS) $(LDFLAGS) -o $@ $< -proteinortho5_clean_edges: proteinortho5_clean_edges.cpp - $(CPP) $(CPPFLAGS) -o $@ $< - -install: proteinortho5.pl proteinortho5_clustering proteinortho5_singletons.pl proteinortho5_clean_edges2.pl ffadj_mcs.py po_tree po_tree.pl +install: proteinortho5.pl proteinortho5_clustering proteinortho5_singletons.pl proteinortho5_clean_edges2.pl ffadj_mcs.py po2tree.pl proteinortho5_tree install -v $^ $(INSTALLDIR) test: proteinortho5.pl proteinortho5_clustering diff --git a/manual.html b/manual.html index fac0228..8ae8d68 100644 --- a/manual.html +++ b/manual.html @@ -4,7 +4,7 @@ </head> <body> <h1>Proteinortho Manual / PoFF Manual</h1> -<small>This manual corresponds to version 5.12</small> +<small>This manual corresponds to version 5.13</small> <h2>Introduction</h2> Proteinortho is a tool to detect orthologous genes within different species. For doing so, it compares similarities of given gene sequences and clusters them to find significant groups. @@ -38,8 +38,8 @@ The sources come with a precompiled version of Proteinortho for 64bit Linux. If <h3>Building and installing from source</h3> <ul> <li> Fetch the latest source code archive from <a href="http://www.bioinf.uni-leipzig.de/Software/proteinortho/" target="_blank">www.bioinf.uni-leipzig.de/Software/proteinortho/</a>. -<li> Extract the files e.g. via <code>tar -xzvf proteinortho_v5.06.tar.gz</code> -<li> Change directory into the extracted folder e.g. via <code>cd proteinortho_v5.06</code> +<li> Extract the files e.g. via <code>tar -xzvf proteinortho_v5.13.tar.gz</code> +<li> Change directory into the extracted folder e.g. via <code>cd proteinortho_v5.13</code> <li> You can now run <code>./proteinortho5.pl</code> directly <li> If you want to recompile and install Proteinortho, type <code>make</code> followed <code>sudo make install</code> (requires root privileges). <li> In any case, run <code>make test</code> to make sure Proteinortho works as expected diff --git a/po2tree.pl b/po2tree.pl index 77d5435..e1a0590 100755 --- a/po2tree.pl +++ b/po2tree.pl @@ -34,7 +34,7 @@ # @email [email protected] # @company Bioinformatics, University of Leipzig # @version 3.10 -# @date 08-12-2015 +# @date 16-03-2016 # ########################################################################################## @@ -142,7 +142,7 @@ system("cat '$file.tmp.matrix2' >>'$file.tmp.matrix'"); # Run the main algorithm in C print STDERR "Calculating tree...\n"; -my $run = $scriptpath."po_tree"; +my $run = $scriptpath."proteinortho5_tree"; my $out = qx($run $ARGV[0].tmp.matrix); $out =~ s/\[.+?\]//g; $out =~ s/\s:/:/g; diff --git a/po_tree.c b/po_tree.c index 8b6604d..769cf85 100644 --- a/po_tree.c +++ b/po_tree.c @@ -29,7 +29,7 @@ * @email [email protected] * @company Bioinformatics, University of Leipzig * @version 1.10 (for PO5) - * @date 2016-02-22 + * @date 2016-03-16 */ @@ -105,7 +105,7 @@ int main(int argc, char** argv) { void builder(BOOL** matrix, Uint viecher, Uint ccs, char*** pnamen) { /* affinity matrix */ int** aff = (int**)malloc(sizeof(int*)*viecher); - int i; + unsigned int i; /* allocate further memory for affinity matrix */ for (i = 0; i<viecher; i++) { aff[i] = (int*)malloc(sizeof(int)*viecher); @@ -146,7 +146,7 @@ void builder(BOOL** matrix, Uint viecher, Uint ccs, char*** pnamen) { * returns: nothing, but all involved data-structures represent the merge afterwards correctly */ void merge (Uint* max, Uint ccs, Uint viecher, BOOL* away, BOOL** matrix, int** aff, char*** pnamen) { - int i, counter = 0; + unsigned int i, counter = 0; /* only keep components that are present in both species */ /* foreach component */ @@ -199,7 +199,7 @@ void merge (Uint* max, Uint ccs, Uint viecher, BOOL* away, BOOL** matrix, int** baum = strcat(baum,(*pnamen)[max[1]]); baum = strcat(baum,":"); baum = strcat(baum,length_1); -/ baum = strcat(baum,")"); + baum = strcat(baum,")"); // baum = strcat(baum,")["); // baum = strcat(baum,anz); // baum = strcat(baum,"]"); diff --git a/proteinortho5.pl b/proteinortho5.pl index 719e64f..22bbe97 100755 --- a/proteinortho5.pl +++ b/proteinortho5.pl @@ -30,7 +30,7 @@ # @author Marcus Lechner # @email [email protected] # @company University of Maruburg -# @date 2016-02-22 +# @date 2016-04-22 # ########################################################################################## @@ -47,7 +47,7 @@ use Thread::Queue; ########################################################################################## # Variables ########################################################################################## -our $version = "5.12"; +our $version = "5.13"; our $step = 0; # 0/1/2/3 -> do all / only apply step 1 / only apply step 2 / only apply step 3 our $verbose = 1; # 0/1 -> don't / be verbose our $debug = 0; # 0/1 -> don't / show debug data diff --git a/proteinortho5_clean_edges.cpp b/proteinortho5_clean_edges.cpp deleted file mode 100644 index 0f81b3a..0000000 --- a/proteinortho5_clean_edges.cpp +++ /dev/null @@ -1,136 +0,0 @@ -#include <iostream> -#include <sstream> -#include <fstream> -#include <string> -#include <map> -#include <vector> - -using namespace std; - -int parse_excluded_edges(string filename, map<string, int> &excluded_edges); -void tokenize(const string& str, vector<string>& tokens, const string& delimiters); -int rewrite_graph(const string in, map<string, int> exclude); -string chomp(string &str); - - -int main(const int argc, char *argv[]) { - - string usage = "USAGE: "+string(argv[0])+" -e <EXCLUSION FILE> GRAPH1 GRAPH2 ..."; - - vector<string> graphs; - map<string, int> exclude; - for(int paras = 1; paras < argc; paras++) { - string arg = string(argv[paras]); - if( argc < 4 || arg == "-h" || arg == "--help") { - cout << usage; - return 0; - } else if(arg == "-e") { - parse_excluded_edges(string(argv[++paras]), exclude); - } else { - graphs.push_back(arg); - } - } - - for(vector<string>::iterator it = graphs.begin(); it != graphs.end(); it++) { - rewrite_graph(*it, exclude); - } - - for(map<string, int>::iterator it = exclude.begin(); it != exclude.end(); it++) { - //cout << it->first << "\n"; - } - -} - -int rewrite_graph(const string in, map<string, int> exclude) { - ifstream graph_file(in.c_str()); - if(!graph_file.is_open()) { - cerr << "could not read input file "+in; - return -1; - } - - int count = 0; - string line; - while (getline(graph_file, line)) { - chomp(line); - vector<string> fields; - tokenize(line, fields, "\t"); - if(fields.size() > 1 && fields[0].substr(0, 1) != "#") { - string a = fields[0]; - string b = fields[1]; - string c = b+" "+a; - if(a.compare(b) < 0 ) { - c = a+" "+b; - } - //cout << c << endl; - if(exclude.count(c) == 1) { - count++; - continue; - } - } - - cout << line << endl; - } - - graph_file.close(); - - cerr << "# excluded " << count << " edges from the graph." << endl; - return 0; -} - -string chomp(string &str) { - string::size_type pos = str.find_last_not_of("\n\r \t"); - if(pos != string::npos) { - str = str.substr(0, pos+1); - } - return str; -} - -int parse_excluded_edges(string filename, map<string, int> &excluded_edges) { - ifstream data(filename.c_str()); - if(!data.is_open()) { - cerr << "could not open file"+filename; - } - - string line; - while(getline(data,line)) { - - chomp(line); - vector<string> v; - tokenize(line, v, "\t"); - - if(v.size() < 2) { - continue; - } - string a = v[0]; - string b = v[1]; - string c = b + " " + a; - - if(a.compare(b) < 0) { - c = a + " " + b; - } - //cout << c << endl; - excluded_edges[c] = 1; - } - - data.close(); - - return 0; -} - -// Split a string at a certain delim -void tokenize(const string& str, vector<string>& tokens, const string& delimiters = "\t") { - // Skip delimiters at beginning. - string::size_type lastPos = str.find_first_not_of(delimiters, 0); - // Find first "non-delimiter". - string::size_type pos = str.find_first_of(delimiters, lastPos); - - while (string::npos != pos || string::npos != lastPos) { - // Found a token, add it to the vector. - tokens.push_back(str.substr(lastPos, pos - lastPos)); - // Skip delimiters. Note the "not_of" - lastPos = str.find_first_not_of(delimiters, pos); - // Find next "non-delimiter" - pos = str.find_first_of(delimiters, lastPos); - } -} - diff --git a/proteinortho5_clustering.cpp b/proteinortho5_clustering.cpp index a7bbba1..6e07ae5 100755 --- a/proteinortho5_clustering.cpp +++ b/proteinortho5_clustering.cpp @@ -3,7 +3,7 @@ * Reads edge list and splits connected components * according to algebraic connectivity threshold * - * Last updated: 2014/07/07 + * Last updated: 2016/04/22 * Author: Marcus Lechner */ @@ -17,6 +17,7 @@ #include <vector> #include <stack> #include <iomanip> +#include <cstdlib> using namespace std; // Structs @@ -252,7 +253,7 @@ void partition_graph() { // Connectivity analysis float connectivity = getConnectivity(current_group); - if (connectivity < param_con_threshold) { + if (connectivity < param_con_threshold && current_group.size() > 3) { // Split groups is done in getConnectivity function // Reset flags and repeat without incrementing protein counter for (unsigned int i = 0; i < current_group.size(); i++) done[current_group[i]] = false; @@ -260,7 +261,7 @@ void partition_graph() { } // Output - print_group(current_group,connectivity); + if (connectivity >= param_con_threshold) {print_group(current_group,connectivity);} // Print stats stats(protein_id,protein_counter); @@ -300,6 +301,20 @@ vector<unsigned int> get_deg_one (vector<unsigned int>& nodes) { vector<float> generate_random_vector(const unsigned int size) { vector<float> x(size); for (unsigned int i = 0; i < size; i++) { + x[i] = (float)(rand() % 999+1)/1000; // 1 bis 99 + // at least one value must be different from the others but still within 0 and 1 + if (i > 0 && x[i] == x[i-1]) { + x[i] /= 3; + } +// cerr << x[i] << endl; + } + return x; +} + +// Generate random vector x of size size +vector<float> generate_random_vector_old(const unsigned int size) { + vector<float> x(size); + for (unsigned int i = 0; i < size; i++) { x[i] = (float)rand()/RAND_MAX; // at least one value must be different from the others but still within 0 and 1 if (i > 0 && x[i] == x[i-1]) { @@ -361,8 +376,43 @@ vector<float> getY(float max_degree, vector<float> x_hat, vector<float> x_new, v return x_hat; } + // Remove edges connectiong two groups a and b void removeExternalEdges(map<unsigned int,bool>& a) { +// cerr << "+#" << endl; +// for (map<unsigned int,bool>::iterator it = a.begin(); it != a.end(); it++) { +// unsigned int protein = it->first; +// cerr << protein << endl; +// } +// cerr << "#-" << endl; + + // For each protein in a + for (map<unsigned int,bool>::iterator it = a.begin(); it != a.end(); it++) { + unsigned int protein = it->first; + // For each target + vector<unsigned int> cleaned_edges; + bool swap = false; + for (vector<unsigned int>::iterator ita = graph[protein].edges.begin(); ita != graph[protein].edges.end(); ita++) { + // If it is not present the own group, set flag + if (a.find(*ita) == a.end()) { + graph_clean << graph[protein].full_name << "\t" << graph[*ita ].full_name << endl; // Improved graph cleaning + swap = true; + } + // Otherwise, add it to the new edge list + else { + cleaned_edges.push_back(*ita); + } + } + // If changes were made, swap edge list with new one + if (swap) { + cleaned_edges.swap(graph[protein].edges); + } + } +} + + +// Remove edges connectiong two groups a and b +void removeExternalEdges_old(map<unsigned int,bool>& a) { // For each protein in a for (map<unsigned int,bool>::iterator it = a.begin(); it != a.end(); it++) { unsigned int protein = it->first; @@ -388,7 +438,7 @@ void removeExternalEdges(map<unsigned int,bool>& a) { } // Split connected component according to eigenvector -void splitGroups(vector<float>& y, vector<unsigned int>& nodes){ +void splitGroups(vector<float>& y, vector<unsigned int>& nodes, map<unsigned int,unsigned int> mapping){ // Remove tree like structures in the beginning vector<unsigned int> one = get_deg_one(nodes); if (one.size() > 0) { @@ -408,6 +458,8 @@ void splitGroups(vector<float>& y, vector<unsigned int>& nodes){ else {groupB[nodes[i]] = true;} // cerr << graph[nodes[i]].full_name << " {color:#b01700}" << endl; } } +// cerr << groupA.size() << " -- " << groupB.size() << endl; + // Catch error in laplacien calcs if (groupA.size() == 0 || groupB.size() == 0) { throw "Failed to partition subgraph! This might lead to an infinit loop. Please submit the .blastgraph file to [email protected] to help fixing this issue."; @@ -418,6 +470,18 @@ void splitGroups(vector<float>& y, vector<unsigned int>& nodes){ } float getConnectivity(vector<unsigned int>& nodes) { + // Special cases hotfix + if (nodes.size() == 2) {return 1;} + + if (nodes.size() == 3) { + vector<unsigned int> min = get_deg_one(nodes); + // fully connected + if (min.size() == 0) {return 1;} + // not + else {return 0.667;} + } + // Hotfix end + // Get max degree of nodes unsigned int max_degree = max_deg(nodes); @@ -445,17 +509,17 @@ float getConnectivity(vector<unsigned int>& nodes) { x_hat = makeOrthogonal(y); // Get lenght (lambda) & normalize vector norm = nomalize(x_hat, ¤t_length); -// cerr << "IT: " << current_length << endl; - if (abs(current_length-last_length) < 0.000333 && iter >= 10) break; // min 10 iterations (prevent convergence by chance) + if (abs(current_length-last_length) < 0.0001 && iter >= 20) break; // min 20 iterations (prevent convergence by chance) } - // cerr << nodes.size() << " nodes done after " << iter << " iterations" << endl; float connectivity = (-current_length+2*max_degree)/(nodes.size()); +// cerr << nodes.size() << " " << connectivity << endl; + // Split groups if connectivity is too low, remove tree like structures that might have arosen if (connectivity < param_con_threshold) { - splitGroups(x_hat, nodes); + splitGroups(x_hat, nodes, mapping); } return connectivity; -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/proteinortho.git _______________________________________________ debian-med-commit mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit
