This is an automated email from the git hooks/post-receive script. tille pushed a commit to annotated tag upstream/0.3.1 in repository libpll.
commit 89cb22dd7a401e45463439ca9156089ecb981095 Author: Andreas Tille <[email protected]> Date: Wed May 17 16:35:34 2017 +0200 New upstream version 0.3.1 --- .travis.yml | 42 ++++++++---- ChangeLog.md | 11 ++++ configure.ac | 2 +- man/libpll.3 | 7 +- src/Makefile.am | 3 +- src/core_derivatives.c | 11 ++-- src/core_derivatives_avx.c | 10 +-- src/core_derivatives_avx2.c | 10 +-- src/core_derivatives_sse.c | 8 +-- src/core_pmatrix.c | 4 +- src/hardware.c | 152 ++++++++++++++++++++++++++++++-------------- src/init.c | 34 ---------- src/models.c | 16 +++-- src/pll.c | 5 +- src/pll.h | 34 +++++++--- 15 files changed, 219 insertions(+), 130 deletions(-) diff --git a/.travis.yml b/.travis.yml index d205bfa..5f71157 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,9 +7,27 @@ matrix: addons: apt: sources: ['ubuntu-toolchain-r-test'] + packages: ['gcc-4.6'] + env: + - COMPILER=gcc-4.6 CONFPARAMS="--disable-avx2" + + - os: linux + compiler: gcc + addons: + apt: + sources: ['ubuntu-toolchain-r-test'] + packages: ['gcc-4.7'] + env: + - COMPILER=gcc-4.7 CONFPARAMS="" + + - os: linux + compiler: gcc + addons: + apt: + sources: ['ubuntu-toolchain-r-test'] packages: ['gcc-4.8'] env: - - COMPILER=gcc-4.8 + - COMPILER=gcc-4.8 CONFPARAMS="" - os: linux compiler: gcc @@ -18,16 +36,16 @@ matrix: sources: ['ubuntu-toolchain-r-test'] packages: ['gcc-4.9'] env: - - COMPILER=gcc-4.9 + - COMPILER=gcc-4.9 CONFPARAMS="" - os: linux compiler: gcc addons: apt: sources: ['ubuntu-toolchain-r-test'] - packages: ['g++-5'] + packages: ['gcc-5'] env: - - COMPILER=gcc-5 + - COMPILER=gcc-5 CONFPARAMS="" - os: linux compiler: gcc @@ -36,7 +54,7 @@ matrix: sources: ['ubuntu-toolchain-r-test'] packages: ['gcc-6'] env: - - COMPILER=gcc-6 + - COMPILER=gcc-6 CONFPARAMS="" - os: linux compiler: clang @@ -45,7 +63,7 @@ matrix: sources: ['ubuntu-toolchain-r-test', 'llvm-toolchain-precise-3.5'] packages: ['clang-3.5'] env: - - COMPILER=clang-3.5 + - COMPILER=clang-3.5 CONFPARAMS="" - os: linux compiler: clang @@ -54,7 +72,7 @@ matrix: sources: ['ubuntu-toolchain-r-test', 'llvm-toolchain-precise-3.6'] packages: ['clang-3.6'] env: - - COMPILER=clang-3.6 + - COMPILER=clang-3.6 CONFPARAMS="" - os: linux compiler: clang @@ -63,7 +81,7 @@ matrix: sources: ['ubuntu-toolchain-r-test', 'llvm-toolchain-precise-3.7'] packages: ['clang-3.7'] env: - - COMPILER=clang-3.7 + - COMPILER=clang-3.7 CONFPARAMS="" - os: linux compiler: clang @@ -72,7 +90,7 @@ matrix: sources: ['ubuntu-toolchain-r-test', 'llvm-toolchain-precise-3.8'] packages: ['clang-3.8'] env: - - COMPILER=clang-3.8 + - COMPILER=clang-3.8 CONFPARAMS="" - os: linux dist: trusty @@ -82,7 +100,7 @@ matrix: sources: ['ubuntu-toolchain-r-test', 'llvm-toolchain-trusty-3.9'] packages: ['clang-3.9'] env: - - COMPILER=clang-3.9 + - COMPILER=clang-3.9 CONFPARAMS="" - os: linux dist: trusty @@ -92,6 +110,6 @@ matrix: sources: ['ubuntu-toolchain-r-test', 'llvm-toolchain-trusty-4.0'] packages: ['clang-4.0'] env: - - COMPILER=clang-4.0 + - COMPILER=clang-4.0 CONFPARAMS="" -script: ./autogen.sh && CC=$COMPILER ./configure && make && make check +script: ./autogen.sh && CC=$COMPILER ./configure $CONFPARAMS && make && make check diff --git a/ChangeLog.md b/ChangeLog.md index ed3e43e..e1b325c 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -2,6 +2,17 @@ All notable changes to `libpll` will be documented in this file. This project adheres to [Semantic Versioning](http://semver.org/). +## [0.3.1] - 2017-05-17 +### Added + - Checks for older versions of clang and gcc to use assembly instructions + for cpu features detection + - Include guards for pll.h +### Fixed + - Correct updating of padded eigen-decomposition arrays for models with a + number of states not being a power of two + - Changed to the usage of builtin functions for cpu features detection + - Check for x86intrin.h + ## [0.3.0] - 2017-05-15 ### Added - Run-time detection of cpu features diff --git a/configure.ac b/configure.ac index 62ba2db..377b391 100644 --- a/configure.ac +++ b/configure.ac @@ -2,7 +2,7 @@ # Process this file with autoconf to produce a configure script. AC_PREREQ([2.63]) -AC_INIT([libpll], [0.3.0], [[email protected]]) +AC_INIT([libpll], [0.3.1], [[email protected]]) AM_INIT_AUTOMAKE([subdir-objects]) AC_LANG([C]) AC_CONFIG_SRCDIR([src/pll.c]) diff --git a/man/libpll.3 b/man/libpll.3 index 2ff6344..e2f1dc1 100644 --- a/man/libpll.3 +++ b/man/libpll.3 @@ -1,6 +1,6 @@ .\" -*- coding: utf-8 -*- .\" ============================================================================ -.TH libpll 3 "September 9, 2016" "libpll 0.1.0" "Library Functions Manual" +.TH libpll 3 "May 17, 2017" "libpll 0.3.1" "Library Functions Manual" .\" ============================================================================ .SH NAME libpll \(em Phylogenetic Likelihood Library @@ -586,5 +586,10 @@ for custom printing. Fixed derivatives computation, parsing of branch lengths, invariant sites computation, log-likelihood computation for cases where we have scaling and patterns, ascertainment bias computation, per-site log-likelihood computation, memory leaks. Added run-time detection of hardware. +.TP +.BR v0.3.1\~ "released May 17th, 2017" +Correct updating of paddded eigen-decomposition arrays for models with a number +of states not being a power of two. Added portable hardware detection for clang +and GCC. .RE .LP diff --git a/src/Makefile.am b/src/Makefile.am index 23c16ed..977595b 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -37,8 +37,7 @@ fast_parsimony.c \ stepwise.c \ random.c \ phylip.c \ -hardware.c \ -init.c +hardware.c libpll_la_CFLAGS = $(AM_CFLAGS) diff --git a/src/core_derivatives.c b/src/core_derivatives.c index a5fea48..2b6e205 100644 --- a/src/core_derivatives.c +++ b/src/core_derivatives.c @@ -146,6 +146,8 @@ PLL_EXPORT int pll_core_update_sumtable_ii(unsigned int states, const double * t_inv_eigenvecs; const double * t_freqs; + unsigned int states_padded = states; + #ifdef HAVE_SSE3 if (attrib & PLL_ATTRIB_ARCH_SSE && PLL_STAT(sse3_present)) { @@ -251,8 +253,9 @@ PLL_EXPORT int pll_core_update_sumtable_ii(unsigned int states, righterm = 0; for (k = 0; k < states; ++k) { - lefterm += t_clvp[k] * t_freqs[k] * t_inv_eigenvecs[k * states + j]; - righterm += t_eigenvecs[j * states + k] * t_clvc[k]; + lefterm += t_clvp[k] * t_freqs[k] * + t_inv_eigenvecs[k * states_padded + j]; + righterm += t_eigenvecs[j * states_padded + k] * t_clvc[k]; } sum[j] = lefterm * righterm; @@ -385,8 +388,8 @@ PLL_EXPORT int pll_core_update_sumtable_ti(unsigned int states, for (k = 0; k < states; ++k) { lefterm += (tipstate & 1) * t_freqs[k] - * t_inv_eigenvecs[k * states + j]; - righterm += t_eigenvecs[j * states + k] * t_clvc[k]; + * t_inv_eigenvecs[k * states_padded + j]; + righterm += t_eigenvecs[j * states_padded + k] * t_clvc[k]; tipstate >>= 1; } sum[j] = lefterm * righterm; diff --git a/src/core_derivatives_avx.c b/src/core_derivatives_avx.c index 54f03ff..992e41b 100644 --- a/src/core_derivatives_avx.c +++ b/src/core_derivatives_avx.c @@ -273,9 +273,9 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx(unsigned int states, for (k = 0; k < states; ++k) { tt_inv_eigenvecs[i * states_padded * states_padded + j * states_padded - + k] = inv_eigenvecs[i][k * states + j] * t_freqs[k]; + + k] = inv_eigenvecs[i][k * states_padded + j] * t_freqs[k]; tt_eigenvecs[i * states_padded * states_padded + j * states_padded - + k] = eigenvecs[i][j * states + k]; + + k] = eigenvecs[i][j * states_padded + k]; } } @@ -636,7 +636,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx(unsigned int states, for (k = 0; k < states_padded; ++k) { eigenvecs_padded[i*states_padded*states_padded + j*states_padded + k] = - (j < states && k < states) ? eigenvecs[i][j*states + k] : 0.; + (j < states && k < states) ? eigenvecs[i][j*states_padded + k] : 0.; } } @@ -659,7 +659,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx(unsigned int states, /* special case for non-ambiguous state */ __m256d v_freqs = _mm256_set1_pd(freqs[i][ss]); __m256d v_eigen = _mm256_load_pd(inv_eigenvecs[i] + - ss*states + j); + ss*states_padded + j); v_lefterm = _mm256_mul_pd(v_eigen, v_freqs); } else @@ -671,7 +671,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx(unsigned int states, { __m256d v_freqs = _mm256_set1_pd(freqs[i][k]); __m256d v_eigen = _mm256_load_pd(inv_eigenvecs[i] + - k*states + j); + k*states_padded + j); v_lefterm = _mm256_add_pd(v_lefterm, _mm256_mul_pd(v_eigen, v_freqs)); diff --git a/src/core_derivatives_avx2.c b/src/core_derivatives_avx2.c index 20216d5..bd345ec 100644 --- a/src/core_derivatives_avx2.c +++ b/src/core_derivatives_avx2.c @@ -98,9 +98,9 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx2(unsigned int states, for (k = 0; k < states; ++k) { tt_inv_eigenvecs[i * states_padded * states_padded + j * states_padded - + k] = inv_eigenvecs[i][k * states + j] * t_freqs[k]; + + k] = inv_eigenvecs[i][k * states_padded + j] * t_freqs[k]; tt_eigenvecs[i * states_padded * states_padded + j * states_padded - + k] = eigenvecs[i][j * states + k]; + + k] = eigenvecs[i][j * states_padded + k]; } } @@ -281,7 +281,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx2(unsigned int states, for (k = 0; k < states_padded; ++k) { eigenvecs_padded[i*states_padded*states_padded + j*states_padded + k] = - (j < states && k < states) ? eigenvecs[i][j*states + k] : 0.; + (j < states && k < states) ? eigenvecs[i][j*states_padded + k] : 0.; } } @@ -304,7 +304,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx2(unsigned int states, /* special case for non-ambiguous state */ __m256d v_freqs = _mm256_set1_pd(freqs[i][ss]); __m256d v_eigen = _mm256_load_pd(inv_eigenvecs[i] + - ss*states + j); + ss*states_padded + j); v_lefterm = _mm256_mul_pd(v_eigen, v_freqs); } else @@ -316,7 +316,7 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx2(unsigned int states, { __m256d v_freqs = _mm256_set1_pd(freqs[i][k]); __m256d v_eigen = _mm256_load_pd(inv_eigenvecs[i] + - k*states + j); + k*states_padded + j); v_lefterm = _mm256_fmadd_pd(v_eigen, v_freqs, v_lefterm); } diff --git a/src/core_derivatives_sse.c b/src/core_derivatives_sse.c index f9849dd..cf77d84 100644 --- a/src/core_derivatives_sse.c +++ b/src/core_derivatives_sse.c @@ -208,8 +208,8 @@ PLL_EXPORT int pll_core_update_sumtable_ii_sse(unsigned int states, for (k = 0; k < states; ++k) { - lterm += clvp[k] * freqs[k] * invev[k*states+j]; - rterm += ev[j*states+k] * clvc[k]; + lterm += clvp[k] * freqs[k] * invev[k*states_padded+j]; + rterm += ev[j*states_padded+k] * clvc[k]; } sum[j] = lterm*rterm; @@ -288,8 +288,8 @@ PLL_EXPORT int pll_core_update_sumtable_ti_sse(unsigned int states, for (k = 0; k < states; ++k) { - lterm += (tipstate & 1) * freqs[k] * invev[k*states+j]; - rterm += ev[j*states+k] * clvc[k]; + lterm += (tipstate & 1) * freqs[k] * invev[k*states_padded+j]; + rterm += ev[j*states_padded+k] * clvc[k]; tipstate >>= 1; } sum[j] = lterm*rterm; diff --git a/src/core_pmatrix.c b/src/core_pmatrix.c index 4fb1e84..77aedb5 100644 --- a/src/core_pmatrix.c +++ b/src/core_pmatrix.c @@ -214,7 +214,7 @@ PLL_EXPORT int pll_core_update_pmatrix(double ** pmatrix, for (j = 0; j < states; ++j) for (k = 0; k < states; ++k) - temp[j*states+k] = inv_evecs[j*states+k] * expd[k]; + temp[j*states+k] = inv_evecs[j*states_padded+k] * expd[k]; for (j = 0; j < states; ++j) { @@ -224,7 +224,7 @@ PLL_EXPORT int pll_core_update_pmatrix(double ** pmatrix, for (m = 0; m < states; ++m) { pmat[j*states_padded+k] += - temp[j*states+m] * evecs[m*states+k]; + temp[j*states+m] * evecs[m*states_padded+k]; } } } diff --git a/src/hardware.c b/src/hardware.c index 49d414a..8bbd151 100644 --- a/src/hardware.c +++ b/src/hardware.c @@ -21,21 +21,52 @@ #include "pll.h" -#ifndef __PPC__ -#define cpuid(f1, f2, a, b, c, d) \ - __asm__ __volatile__ ("cpuid" \ - : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \ - : "a" (f1), "c" (f2)); -#endif +#if (!defined(__clang__) && defined(__GNUC__) && (__GNUC__ < 4 || \ + (__GNUC__ == 4 && __GNUC_MINOR__ < 8))) || \ + (defined(__clang__) && (__clang_major__ < 3 || \ + (__clang_major__ == 3 && __clang_minor__ < 9))) + + #if defined(__i386__) && defined(__PIC__) + #if (defined(__GNUC__) && __GNUC__ < 3) +#define cpuid(level, count, a, b, c, d) \ + __asm__ ("xchgl\t%%ebx, %k1\n\t" \ + "cpuid\n\t" \ + "xchgl\t%%ebx, %k1\n\t" \ + : "=a" (a), "=&r" (b), "=c" (c), "=d" (d) \ + : "0" (level), "2" (count)) + #else +#define cpuid(level, count, a, b, c, d) \ + __asm__ ("xchg{l}\t{%%}ebx, %k1\n\t" \ + "cpuid\n\t" \ + "xchg{l}\t{%%}ebx, %k1\n\t" \ + : "=a" (a), "=&r" (b), "=c" (c), "=d" (d) \ + : "0" (level), "2" (count)) + #endif + #elif defined(__x86_64__) && (defined(__code_model_medium__) || \ + defined(__code_model_large__)) && defined(__PIC__) +#define cpuid(level, count, a, b, c, d) \ + __asm__ ("xchg{q}\t{%%}rbx, %q1\n\t" \ + "cpuid\n\t" \ + "xchg{q}\t{%%}rbx, %q1\n\t" \ + : "=a" (a), "=&r" (b), "=c" (c), "=d" (d) \ + : "0" (level), "2" (count)) + #else +#define cpuid(level, count, a, b, c, d) \ + __asm__ ("cpuid\n\t" \ + : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \ + : "0" (level), "2" (count)) + #endif static void cpu_features_detect() { unsigned int a,b,c,d; - memset(pll_hardware,0,sizeof(pll_hardware_t)); + memset(&pll_hardware,0,sizeof(pll_hardware_t)); + + pll_hardware.init = 1; -#ifdef __PPC__ - pll_hardware->altivec_present = 1; +#if defined(__PPC__) + pll_hardware.altivec_present = 1; #else cpuid(0,0,a,b,c,d); @@ -44,56 +75,75 @@ static void cpu_features_detect() if (maxlevel >= 1) { cpuid(1,0,a,b,c,d); - pll_hardware->mmx_present = (d >> 23) & 1; - pll_hardware->sse_present = (d >> 25) & 1; - pll_hardware->sse2_present = (d >> 26) & 1; - pll_hardware->sse3_present = (c >> 0) & 1; - pll_hardware->ssse3_present = (c >> 9) & 1; - pll_hardware->sse41_present = (c >> 19) & 1; - pll_hardware->sse42_present = (c >> 20) & 1; - pll_hardware->popcnt_present = (c >> 23) & 1; - pll_hardware->avx_present = (c >> 28) & 1; + pll_hardware.mmx_present = (d >> 23) & 1; + pll_hardware.sse_present = (d >> 25) & 1; + pll_hardware.sse2_present = (d >> 26) & 1; + pll_hardware.sse3_present = (c >> 0) & 1; + pll_hardware.ssse3_present = (c >> 9) & 1; + pll_hardware.sse41_present = (c >> 19) & 1; + pll_hardware.sse42_present = (c >> 20) & 1; + pll_hardware.popcnt_present = (c >> 23) & 1; + pll_hardware.avx_present = (c >> 28) & 1; if (maxlevel >= 7) { cpuid(7,0,a,b,c,d); - pll_hardware->avx2_present = (b >> 5) & 1; + pll_hardware.avx2_present = (b >> 5) & 1; } } #endif } +#else + + +static void cpu_features_detect() +{ + memset(&pll_hardware,0,sizeof(pll_hardware_t)); + + pll_hardware.init = 1; +#if defined(__PPC__) + pll_hardware.altivec_present = __builtin_cpu_supports("altivec"); +#elif defined(__x86_64__) || defined(__i386__) + pll_hardware.mmx_present = __builtin_cpu_supports("mmx"); + pll_hardware.sse_present = __builtin_cpu_supports("sse"); + pll_hardware.sse2_present = __builtin_cpu_supports("sse2"); + pll_hardware.sse3_present = __builtin_cpu_supports("sse3"); + pll_hardware.ssse3_present = __builtin_cpu_supports("ssse3"); + pll_hardware.sse41_present = __builtin_cpu_supports("sse4.1"); + pll_hardware.sse42_present = __builtin_cpu_supports("sse4.2"); + pll_hardware.popcnt_present = __builtin_cpu_supports("popcnt"); + pll_hardware.avx_present = __builtin_cpu_supports("avx"); + pll_hardware.avx2_present = __builtin_cpu_supports("avx2"); +#endif +} + +#endif + static void cpu_features_show() { - if (!pll_hardware) - { - /* TODO: Add proper error control after we figure out - cross-platform compatibility */ - return; - } - fprintf(stderr, "CPU features:"); - if (pll_hardware->altivec_present) + if (pll_hardware.altivec_present) fprintf(stderr, " altivec"); - if (pll_hardware->mmx_present) + if (pll_hardware.mmx_present) fprintf(stderr, " mmx"); - if (pll_hardware->sse_present) + if (pll_hardware.sse_present) fprintf(stderr, " sse"); - if (pll_hardware->sse2_present) + if (pll_hardware.sse2_present) fprintf(stderr, " sse2"); - if (pll_hardware->sse3_present) + if (pll_hardware.sse3_present) fprintf(stderr, " sse3"); - if (pll_hardware->ssse3_present) + if (pll_hardware.ssse3_present) fprintf(stderr, " ssse3"); - if (pll_hardware->sse41_present) + if (pll_hardware.sse41_present) fprintf(stderr, " sse4.1"); - if (pll_hardware->sse42_present) + if (pll_hardware.sse42_present) fprintf(stderr, " sse4.2"); - if (pll_hardware->popcnt_present) + if (pll_hardware.popcnt_present) fprintf(stderr, " popcnt"); - if (pll_hardware->avx_present) + if (pll_hardware.avx_present) fprintf(stderr, " avx"); - if (pll_hardware->avx2_present) + if (pll_hardware.avx2_present) fprintf(stderr, " avx2"); fprintf(stderr, "\n"); } @@ -101,15 +151,6 @@ static void cpu_features_show() PLL_EXPORT int pll_hardware_probe() { /* probe cpu features */ - if (!pll_hardware) - { - if (!(pll_hardware = (pll_hardware_t *)calloc(1,sizeof(pll_hardware_t)))) - { - pll_errno = PLL_ERROR_MEM_ALLOC; - snprintf(pll_errmsg, 200, "Unable to allocate enough memory."); - return PLL_FAILURE; - } - } cpu_features_detect(); return PLL_SUCCESS; @@ -117,5 +158,24 @@ PLL_EXPORT int pll_hardware_probe() PLL_EXPORT void pll_hardware_dump() { + if (!pll_hardware.init) + pll_hardware_probe(); + cpu_features_show(); } + +PLL_EXPORT void pll_hardware_ignore() +{ + pll_hardware.init = 1; + pll_hardware.altivec_present = 1; + pll_hardware.mmx_present = 1; + pll_hardware.sse_present = 1; + pll_hardware.sse2_present = 1; + pll_hardware.sse3_present = 1; + pll_hardware.ssse3_present = 1; + pll_hardware.sse41_present = 1; + pll_hardware.sse42_present = 1; + pll_hardware.popcnt_present = 1; + pll_hardware.avx_present = 1; + pll_hardware.avx2_present = 1; +} diff --git a/src/init.c b/src/init.c deleted file mode 100644 index 90d5bd3..0000000 --- a/src/init.c +++ /dev/null @@ -1,34 +0,0 @@ -/* - Copyright (C) 2017 Tomas Flouri - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as - published by the Free Software Foundation, either version 3 of the - License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. - - Contact: Tomas Flouri <[email protected]>, - Exelixis Lab, Heidelberg Instutute for Theoretical Studies - Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany -*/ - -#include "pll.h" - -PLL_EXPORT void pll_init() -{ - pll_hardware_probe(); -} - -PLL_EXPORT void pll_fini() -{ - if (pll_hardware) - free(pll_hardware); - pll_hardware = NULL; -} diff --git a/src/models.c b/src/models.c index 6081283..0148f92 100644 --- a/src/models.c +++ b/src/models.c @@ -262,6 +262,7 @@ PLL_EXPORT int pll_update_eigen(pll_partition_t * partition, double * subst_params = partition->subst_params[params_index]; unsigned int states = partition->states; + unsigned int states_padded = partition->states_padded; a = create_ratematrix(subst_params, freqs, @@ -291,25 +292,32 @@ PLL_EXPORT int pll_update_eigen(pll_partition_t * partition, /* store eigen vectors */ for (i = 0; i < states; ++i) - memcpy(eigenvecs + i*states, a[i], states*sizeof(double)); + memcpy(eigenvecs + i*states_padded, a[i], states*sizeof(double)); /* store eigen values */ memcpy(eigenvals, d, states*sizeof(double)); /* store inverse eigen vectors */ for (k = 0, i = 0; i < states; ++i) - for (j = i; j < states*states; j += states) + { + for (j = i; j < states_padded*states; j += states_padded) inv_eigenvecs[k++] = eigenvecs[j]; + /* account for padding */ + k += states_padded - states; + } + + assert(k == states_padded*states); + /* multiply the inverse eigen vectors from the left with sqrt(pi)^-1 */ for (i = 0; i < states; ++i) for (j = 0; j < states; ++j) - inv_eigenvecs[i*states+ j] /= sqrt(freqs[i]); + inv_eigenvecs[i*states_padded+ j] /= sqrt(freqs[i]); /* multiply the eigen vectors from the right with sqrt(pi) */ for (i = 0; i < states; ++i) for (j = 0; j < states; ++j) - eigenvecs[i*states+j] *= sqrt(freqs[j]); + eigenvecs[i*states_padded+j] *= sqrt(freqs[j]); partition->eigen_decomp_valid[params_index] = 1; diff --git a/src/pll.c b/src/pll.c index 50254aa..299d0cd 100644 --- a/src/pll.c +++ b/src/pll.c @@ -24,7 +24,7 @@ __thread int pll_errno; __thread char pll_errmsg[200] = {0}; -pll_hardware_t * pll_hardware = NULL; +pll_hardware_t pll_hardware = {0,0,0,0,0,0,0,0,0,0,0,0}; static void dealloc_partition_data(pll_partition_t * partition); @@ -604,6 +604,7 @@ PLL_EXPORT pll_partition_t * pll_partition_create(unsigned int tips, "Unable to allocate enough memory for eigenvectors."); return PLL_FAILURE; } + memset(partition->eigenvecs[i], 0, states * states_padded * sizeof(double)); /* TODO: don't forget to add code for SSE/AVX */ } @@ -632,6 +633,7 @@ PLL_EXPORT pll_partition_t * pll_partition_create(unsigned int tips, "Unable to allocate enough memory for inverse eigenvectors."); return PLL_FAILURE; } + memset(partition->inv_eigenvecs[i], 0, states * states_padded * sizeof(double)); /* TODO: don't forget to add code for SSE/AVX */ } @@ -660,6 +662,7 @@ PLL_EXPORT pll_partition_t * pll_partition_create(unsigned int tips, "Unable to allocate enough memory for eigenvalues."); return PLL_FAILURE; } + memset(partition->eigenvals[i], 0, states_padded * sizeof(double)); /* TODO: don't forget to add code for SSE/AVX */ } diff --git a/src/pll.h b/src/pll.h index fe38a76..83daa9c 100644 --- a/src/pll.h +++ b/src/pll.h @@ -19,6 +19,8 @@ Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany */ +#ifndef PLL_H +#define PLL_H #include <assert.h> #include <math.h> #include <stdio.h> @@ -26,12 +28,28 @@ #include <stdint.h> #include <string.h> #include <ctype.h> -#include <x86intrin.h> #ifdef HAVE_CONFIG_H #include "config.h" #endif +#if (!defined(__clang__) && defined(__GNUC__) && (__GNUC__ < 4 || \ + (__GNUC__ == 4 && __GNUC_MINOR__ < 7))) + #if ((__GNUC__ == 4) && (__GNUC_MINOR__ == 6)) + #if (defined(HAVE_AVX2)) + #error "GCC 4.6.x. Please run ./configure --disable-avx2" + #endif + #else + #if (defined(HAVE_AVX2) || defined(HAVE_AVX)) + #error "GCC < 4.6. Please run ./configure --disable-avx --disable-avx2" + #endif + #endif +#endif + +#ifdef HAVE_X86INTRIN_H +#include <x86intrin.h> +#endif + /* platform specific */ #if (!defined(__APPLE__) && !defined(__WIN32__) && !defined(__WIN64__)) @@ -49,7 +67,8 @@ #define PLL_MIN(a,b) ((a) < (b) ? (a) : (b)) #define PLL_MAX(a,b) ((a) > (b) ? (a) : (b)) #define PLL_SWAP(x,y) do { __typeof__ (x) _t = x; x = y; y = _t; } while(0) -#define PLL_STAT(x) (pll_hardware && pll_hardware->x) +#define PLL_STAT(x) ((pll_hardware.init || pll_hardware_probe()) \ + && pll_hardware.x) /* constants */ @@ -163,6 +182,7 @@ typedef struct pll_hardware_s { + int init; /* cpu features */ int altivec_present; int mmx_present; @@ -451,7 +471,7 @@ struct pll_random_data PLL_EXPORT extern __thread int pll_errno; PLL_EXPORT extern __thread char pll_errmsg[200]; -PLL_EXPORT extern pll_hardware_t * pll_hardware; +PLL_EXPORT extern pll_hardware_t pll_hardware; PLL_EXPORT extern const unsigned int pll_map_bin[256]; PLL_EXPORT extern const unsigned int pll_map_nt[256]; @@ -1875,13 +1895,9 @@ PLL_EXPORT int pll_hardware_probe(void); PLL_EXPORT void pll_hardware_dump(); -/* functions in init.c */ - -PLL_EXPORT void pll_init(void) __attribute__((constructor)); - -PLL_EXPORT void pll_fini(void) __attribute__((destructor)); - +PLL_EXPORT void pll_hardware_ignore(); #ifdef __cplusplus } /* extern "C" */ #endif +#endif -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/libpll.git _______________________________________________ debian-med-commit mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit
