Hello community,

here is the log from the commit of package tensorflow for openSUSE:Factory checked in at 2019-07-22 12:20:01
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/tensorflow (Old)
 and      /work/SRC/openSUSE:Factory/.tensorflow.new.4126 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "tensorflow" Mon Jul 22 12:20:01 2019 rev:5 rq:716088 version:1.13.1 Changes: -------- --- /work/SRC/openSUSE:Factory/tensorflow/tensorflow.changes 2019-01-21 10:51:01.095970939 +0100 +++ /work/SRC/openSUSE:Factory/.tensorflow.new.4126/tensorflow.changes 2019-07-22 12:20:09.555668006 +0200 @@ -1,0 +2,93 @@ +Wed Jul 17 08:18:34 UTC 2019 - Christian Goll <[email protected]> + +- fixed installation location of shared library + +------------------------------------------------------------------- +Mon Jul 8 14:04:17 UTC 2019 - Christian Goll <[email protected]> + +- removed bazel mirror from as much source links as possible +- added support-new-bazel.patch support newer upcoming bazel + versions + +------------------------------------------------------------------- +Tue Jun 4 14:16:10 UTC 2019 - Guillaume GARDET <[email protected]> + +- Fix build for lite flavor: + * tensorflow-fix_lite.patch + +------------------------------------------------------------------- +Wed May 29 16:11:36 UTC 2019 - Guillaume GARDET <[email protected]> + +- Call ldconfig for devel package in post/postun + +------------------------------------------------------------------- +Mon May 27 15:00:28 UTC 2019 - Guillaume GARDET <[email protected]> + +- Fix aarch64 build with upstream patch: + * tensorflow-make_aws_sdk_work_on_aarch64.patch + +------------------------------------------------------------------- +Mon May 27 04:08:54 UTC 2019 - Guillaume GARDET <[email protected]> + +- Add Lite flavor + +------------------------------------------------------------------- +Fri Apr 26 08:27:55 UTC 2019 - Christian Goll <[email protected]> + +- updated to 1.13.1 fixes boo#1133490 + +------------------------------------------------------------------- +Fri Mar 29 13:06:28 UTC 2019 - Guillaume GARDET <[email protected]> + +- Update _constraints to avoid OOM errors + +------------------------------------------------------------------- +Fri Mar 29 08:18:09 UTC 2019 - Guillaume GARDET <[email protected]> + +- Build and package libtensorflow_cc and libtensorflow_framework + +------------------------------------------------------------------- +Tue Mar 19 15:40:25 UTC 2019 - Christian Goll <[email protected]> + +- added fix_mvapich_mpi_bzl.patch which fixes detection of + mvapich2 mpi library +- fixed python3 build + +------------------------------------------------------------------- +Tue Mar 12 20:33:56 UTC 2019 - Adrian Schröter <[email protected]> + +- update to version 1.13.1 + * Major Features and Improvements + * TensorFlow Lite has moved from contrib to core. This means that Python modules are under tf.lite and source code is now under tensorflow/lite rather than tensorflow/contrib/lite. + * TensorFlow GPU binaries are now built against CUDA 10 and TensorRT 5.0. + * Support for Python3.7 on all operating systems. + * Moved NCCL to core. 
+- drop merged patch mpilibpath_configure_py.patch
+- drop obsolete python3.7 patches
+- disabled jemalloc for now
+
+-------------------------------------------------------------------
+Tue Feb 12 08:39:57 UTC 2019 - [email protected]
+
+- enabled aws and googlecloud support
+  * removed no_aws_and_googlecloud.patch
+
+-------------------------------------------------------------------
+Mon Feb 11 16:27:20 UTC 2019 - Christian Goll <[email protected]>
+
+- Fixed build issues with python 3.7, which introduced the patches
+  * python3_7_compatibility.patch backported from upstream
+  * python3.7_unicode.patch fixes a minor function call
+  * python3.7_async_keyword.patch avoids the new keyword async
+
+-------------------------------------------------------------------
+Thu Jan 31 11:44:21 UTC 2019 - Bernhard Wiedemann <[email protected]>
+
+- Fix build with python 3.7
+
+-------------------------------------------------------------------
+Fri Jan 18 16:45:48 UTC 2019 - Guillaume GARDET <[email protected]>
+
+- Build and package libtensorflow.so as some packages may link to it
+
+-------------------------------------------------------------------

Old:
----
  mpilibpath_configure_py.patch
  no_aws_and_googlecloud.patch
  protobuf_v3.6.0.tar.gz
  re2-2018-04-01.tar.gz
  tensorflow-1.10.0.tar.gz

New:
----
  816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz
  aws-sdk-cpp-1.3.15.tar.gz
  bazel-toolchains.tar.gz
  fft.tgz
  fix_mvapich_mpi_bzl.patch
  google-cloud-cpp.tar.gz
  google-flatbuffers-1.10.0~pre.tar.gz
  google-nsync-1.20.1.tar.gz
  grpc.tar.gz
  kafka-v0.11.5.tar.gz
  keras-applications-1.0.6.tar.gz
  keras-preprocessing-1.0.9.tar.gz
  license.rst.txt
  master.zip
  nanopb.tar.gz
  protobuf_v3.6.1.2.tar.gz
  re2-2018-10-01.tar.gz
  release-1.8.0.tar.gz
  rules_docker.tar.gz
  support-new-bazel.patch
  tensorflow-1.13.1.tar.gz
  tensorflow-fix_lite.patch
  tensorflow-make_aws_sdk_work_on_aarch64.patch
  unicode-org-icu.tar.gz

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ tensorflow.spec ++++++
--- /var/tmp/diff_new_pack.3GoBTM/_old 2019-07-22 12:20:14.283666766 +0200 +++ /var/tmp/diff_new_pack.3GoBTM/_new 2019-07-22 12:20:14.287666765 +0200 @@ -18,18 +18,29 @@ # %define pname tensorflow -%define vers 1.10.0 -%define _vers 1_10_10 -%define python_ver_hack python3.6 +%define vers 1.13.1 +%define _vers 1_13_1 +%define python_ver_hack python3.[0-9] %global flavor @BUILD_FLAVOR@%{nil} +# Build tensorflow, not Tensorflow-lite +%define is_lite 0 + %if "%{flavor}" == "standard" %bcond_with cuda %bcond_with mpi %bcond_with opencl %endif +%if "%{flavor}" == "lite" +%define is_lite 1 +%bcond_with cuda +%bcond_with mpi +%bcond_with opencl +%define package_suffix -lite +%endif + %if "%{flavor}" == "hpc" %bcond_with cuda %bcond_with mpi @@ -129,41 +140,48 @@ %{!?compiler_family:%global compiler_family gnu} %{hpc_init -c %compiler_family %{?with_mpi:-m %mpi_flavor} %{?c_f_ver:-v %{c_f_ver}} %{?mpi_ver:-V %{mpi_ver}} %{?ext:-e %{ext}}} %{?with_mpi:%global hpc_module_pname p%{pname}} +%define python_flavor python3 %define package_name %{hpc_package_name %_vers} %define libname(l:s:) lib%{pname}%{-l*}%{hpc_package_name_tail %{?_vers}} %define package_python_sitearch %hpc_python_sitearch %define package_python_sitelib %{hpc_prefix}/lib64/%{python_ver_hack}/site-packages/ %define package_prefix %hpc_prefix %define package_bindir %hpc_bindir +%define package_libdir %hpc_libdir %else %define package_name %pname%{?package_suffix} %define package_python_sitearch %{python3_sitearch} %define 
package_python_sitelib %{python3_sitelib} %define package_prefix %_prefix %define package_bindir %_bindir +%define package_libdir %_libdir %define libname(l:s:) lib%{pname}%{!-l:%{-s:-}}%{-l*}%{-s*}%{?package_suffix} %endif Name: %{package_name} Version: %vers Release: 0 -#Release: 1%{?config_dependant}%{?dist} Summary: A framework used for deep learning License: Apache-2.0 AND BSD-2-Clause AND BSD-3-Clause AND FSFUL AND MIT AND MPL-2.0 AND OpenSSL AND Python-2.0 Group: Development/Languages/Python Url: https://www.tensorflow.org/ Source0: https://github.com/tensorflow/tensorflow/archive/v%{version}.tar.gz#/tensorflow-%{version}.tar.gz Source1: tensorflow-rpmlintrc +# IMPORTANT +# although some of the following libraries are available in factory they could +# not be used as +# * explicit versions are needed which differ from the factory ones +# * bazel and the obs version have different symbols due to hidden compiler flags # License10: Apache-2.0 Source10: https://github.com/bazelbuild/rules_closure/archive/dbb96841cc0a5fb2664c37822803b06dab20c7d1.tar.gz#/rules_closure.tar.gz # License11: BSD-3-Clause -Source11: https://mirror.bazel.build/github.com/google/protobuf/archive/v3.6.0.tar.gz#/protobuf_v3.6.0.tar.gz +Source11: https://github.com/protocolbuffers/protobuf/archive/v3.6.1.2.tar.gz#/protobuf_v3.6.1.2.tar.gz # License12: Python-2.0 Source12: https://pypi.python.org/packages/bc/cc/3cdb0a02e7e96f6c70bd971bc8a90b8463fda83e264fa9c5c1c98ceabd81/backports.weakref-1.0rc1.tar.gz#/backports.weakref-1.0rc1.tar.gz # License13: BSD-3-Clause Source13: https://github.com/google/double-conversion/archive/3992066a95b823efc8ccc1baf82a1cfc73f6e9b8.zip#/double_conversion.zip # License14: BSD-3-Clause -Source14: https://mirror.bazel.build/pypi.python.org/packages/5c/78/ff794fcae2ce8aa6323e789d1f8b3b7765f601e7702726f430e814822b96/gast-0.2.0.tar.gz#/gast-0.2.0.tar.gz +Source14: https://pypi.python.org/packages/5c/78/ff794fcae2ce8aa6323e789d1f8b3b7765f601e7702726f430e814822b96/gast-0.2.0.tar.gz#/gast-0.2.0.tar.gz # License15: MIT Source15: https://github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz#/farmhash.tar.gz # License16: Apache-2.0 @@ -173,37 +191,73 @@ # License18: BSD-3-Clause Source18: https://github.com/hfp/libxsmm/archive/1.9.tar.gz#/libxsmm_1.9.tar.gz # License19: Apache-2.0 -Source19: https://github.com/abseil/abseil-cpp/archive/9613678332c976568272c8f4a78631a29159271d.tar.gz#/abseil-cpp.tar.gz -# License20: BSD-2-Clause -# License21: OpenSSL and ISC and Intel -Source20: https://github.com/google/boringssl/archive/a0fb951d2a26a8ee746b52f3ba81ab011a0af778.tar.gz#/boring_ssl.tar.gz -# License22: Apache-2.0 +Source19: https://github.com/abseil/abseil-cpp/archive/389ec3f906f018661a5308458d623d01f96d7b23.tar.gz#/abseil-cpp.tar.gz +# License20: OpenSSL and ISC and Intel +Source20: https://github.com/google/boringssl/archive/7f634429a04abc48e2eb041c81c5235816c96514.tar.gz#/boring_ssl.tar.gz +# License21: Apache-2.0 Source21: https://github.com/googleapis/googleapis/archive/f81082ea1e2f85c43649bee26e0d9871d4b41cdb.zip#/googleapis.zip # License23: Apache-2.0 -Source22: https://mirror.bazel.build/github.com/google/flatbuffers/archive/v1.9.0.tar.gz#/flatbuffers_v1.9.0.tar.gz -# License24: BSD-3-Clause +Source22: https://github.com/google/flatbuffers/archive/v1.9.0.tar.gz#/flatbuffers_v1.9.0.tar.gz +# License23: BSD-3-Clause Source23: https://github.com/NVlabs/cub/archive/1.8.0.zip#/cub_1.8.0.zip -# License25: Apache-2.0 +# License24: Apache-2.0 Source24: 
https://github.com/google/highwayhash/archive/fd3d9af80465e4383162e4a7c5e2f406e82dd968.tar.gz#/highwayhash.tar.gz -# License28: Apache-2.0 +# License25: Apache-2.0 Source25: https://github.com/abseil/abseil-py/archive/pypi-v0.2.2.tar.gz#/abseil-pypi-v0.2.2.tar.gz -# License29: MPL-2.0 +# License26: MPL-2.0 # NOTE: tensorflow only uses MPL-2.0 part of eigen -Source26: https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/fd6845384b86.tar.gz#/eigen.tar.gz -# License30: BSD-2-Clause -Source27: https://mirror.bazel.build/github.com/intel/ARM_NEON_2_x86_SSE/archive/0f77d9d182265259b135dad949230ecbf1a2633d.tar.gz#/arm_neon_2_x86_sse.tar.gz +Source26: https://bitbucket.org/eigen/eigen/get/9f48e814419e.tar.gz#/eigen.tar.gz +# License27: BSD-2-Clause +Source27: https://github.com/intel/ARM_NEON_2_x86_SSE/archive/1200fe90bb174a6224a525ee60148671a786a71f.tar.gz#/arm_neon_2_x86_sse.tar.gz Source28: https://mirror.bazel.build/docs.python.org/2.7/_sources/license.txt#/python-license.txt -# License32: MIT +# License29: MIT Source29: https://github.com/open-source-parsers/jsoncpp/archive/1.8.4.tar.gz#/json-cpp-1.8.4.tar.gz -# License33: FSFUL +# License30: FSFUL Source30: http://www.kurims.kyoto-u.ac.jp/~ooura/fft.tgz#/fft.tar.gz -# Source34: Apache-2.0 +# License31: Apache-2.0 Source31: https://github.com/grpc/grpc/archive/v1.13.0.tar.gz#/grpc-v1.13.0.gz -# Source35: BSD-3.0 -Source32: https://mirror.bazel.build/github.com/google/re2/archive/2018-04-01.tar.gz#/re2-2018-04-01.tar.gz -# patch the libray search path in configure.py -Patch0: mpilibpath_configure_py.patch -Patch1: no_aws_and_googlecloud.patch +# License32: BSD-3.0 +Source32: https://github.com/google/re2/archive/2018-10-01.tar.gz#/re2-2018-10-01.tar.gz +# License33: Apache-2.0 +Source33: https://github.com/aws/aws-sdk-cpp/archive/1.3.15.tar.gz#/aws-sdk-cpp-1.3.15.tar.gz +# License34: BSD-3-Clause and Intel +Source34: https://github.com/edenhill/librdkafka/archive/v0.11.5.tar.gz#/kafka-v0.11.5.tar.gz +# The factory protobuf library has other symbols due to hidden compiler flags +# License35: Apache-2.0 +Source35: https://github.com/GoogleCloudPlatform/google-cloud-cpp/archive/v0.4.0.tar.gz#/google-cloud-cpp.tar.gz +# License36: Apache-2.0 +Source36: https://github.com/nlopezgi/bazel-toolchains/archive/3f8c58fe530fedc446de04673bc1e32985887dea.tar.gz#/bazel-toolchains.tar.gz +# License37: Apache-2.0 +Source37: https://github.com/bazelbuild/rules_docker/archive/a9bb1dab84cdf46e34d1b34b53a17bda129b5eba.tar.gz#/rules_docker.tar.gz +# License38: MIT +Source38: https://github.com/keras-team/keras-preprocessing/archive/1.0.9.tar.gz#/keras-preprocessing-1.0.9.tar.gz +# License39: MIT +Source39: https://github.com/keras-team/keras-applications/archive/1.0.6.tar.gz#/keras-applications-1.0.6.tar.gz +# License40: MIT +Source40: https://github.com/google/nsync/archive/1.20.1.tar.gz#/google-nsync-1.20.1.tar.gz +# License41: Apache-2.0 +# something between 1.16.1 and 1.18~pre +Source41: https://github.com/grpc/grpc/archive/69b6c047bc767b4d80e7af4d00ccb7c45b683dae.tar.gz#/grpc.tar.gz +# License42: Apache-2.0 +Source42: https://github.com/google/flatbuffers/archive/1f5eae5d6a135ff6811724f6c57f911d1f46bb15.tar.gz#/google-flatbuffers-1.10.0~pre.tar.gz +# License43: BSD and ICU License +Source43: https://github.com/unicode-org/icu/archive/release-62-1.tar.gz#/unicode-org-icu.tar.gz +# License44: BSD like +Source44: https://github.com/nanopb/nanopb/archive/f8ac463766281625ad710900479130c7fcb4d63b.tar.gz#/nanopb.tar.gz +# License45: Python license itself, do need 
as sha256b have to match so could not use system one +Source45: https://mirror.bazel.build/docs.python.org/2.7/_sources/license.rst.txt +# Deps sources for Tensorflow-Lite (use same eigen, gemmlowp and abseil_cpp packages as non lite version) +Source100: https://github.com/google/googletest/archive/release-1.8.0.tar.gz +Source101: https://github.com/intel/ARM_NEON_2_x86_SSE/archive/master.zip +Source102: http://github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz +# Source103: http://mirror.tensorflow.org/github.com/google/flatbuffers/archive/v1.11.0.tar.gz +Source104: http://www.kurims.kyoto-u.ac.jp/~ooura/fft.tgz +Patch1: support-new-bazel.patch +Patch2: fix_mvapich_mpi_bzl.patch +# PATCH-FIX-UPSTREAM https://github.com/tensorflow/tensorflow/pull/22856 +Patch3: tensorflow-make_aws_sdk_work_on_aarch64.patch +# PATCH-FIX-OPENSUSE - Use installed flatbuffers lib for Tensorflow-Lite +Patch4: tensorflow-fix_lite.patch Requires: python3 Requires: python3-abseil @@ -219,11 +273,11 @@ %else Provides: python3-tensorflow %endif -BuildRequires: bazel +BuildRequires: bazel == 0.19.2 BuildRequires: curl %if %{with cuda} Requires: cuda-9.0 -BuildRequires cuda-9.0 +BuildRequires: cuda-9.0 %endif %if %{with opencl} Requires: Mesa-libOpenCL @@ -232,11 +286,14 @@ %endif BuildRequires: curl-devel BuildRequires: fdupes +%if %{is_lite} +BuildRequires: flatbuffers-devel +%endif BuildRequires: fftw3-devel BuildRequires: gcc-c++ BuildRequires: giflib-devel #BuildRequires: grpc-devel >= 1.12 -BuildRequires: jemalloc-devel +#BuildRequires: jemalloc-devel BuildRequires: libjpeg-turbo %if 0%{?suse_version} < 1550 BuildRequires: libjpeg62-turbo @@ -251,6 +308,9 @@ BuildRequires: pcre-devel BuildRequires: python3 BuildRequires: python3-Cython +BuildRequires: python3-Keras-Applications +BuildRequires: python3-Keras-Preprocessing +BuildRequires: python3-astor BuildRequires: python3-base BuildRequires: python3-devel BuildRequires: python3-mock @@ -266,20 +326,18 @@ BuildRequires: unzip BuildRequires: zlib-devel %if %{with hpc} +%hpc_requires BuildRequires: %{compiler_family}%{?c_f_ver}-compilers-hpc-macros-devel +BuildRequires: lua-lmod +BuildRequires: suse-hpc %if %{with mpi} BuildRequires: %{mpi_flavor}%{?mpi_vers}-%{compiler_family}%{?c_f_ver}-hpc-macros-devel %endif -BuildRequires: lua-lmod -BuildRequires: suse-hpc -%hpc_requires %endif # just use rpmlint -# there are some serious compiler warnings, regearding no-return-in-nonvoid-function -BuildRequires: -post-build-checks - -BuildRoot: %{_tmppath}/%{name}-%{version}-build +# there are some serious compiler warnings, regarding no-return-in-nonvoid-function +#!BuildRequires: -post-build-checks %if "%flavor" == "" ExclusiveArch: do_not_build @@ -363,40 +421,90 @@ %makebazelcache %{SOURCE30} %makebazelcache %{SOURCE31} %makebazelcache %{SOURCE32} +%makebazelcache %{SOURCE33} +%makebazelcache %{SOURCE34} +%makebazelcache %{SOURCE35} +%makebazelcache %{SOURCE36} +%makebazelcache %{SOURCE37} +%makebazelcache %{SOURCE38} +%makebazelcache %{SOURCE39} +%makebazelcache %{SOURCE40} +%makebazelcache %{SOURCE41} +%makebazelcache %{SOURCE42} +%makebazelcache %{SOURCE43} +%makebazelcache %{SOURCE44} +%makebazelcache %{SOURCE45} # unpack tensorflow %setup -q -c -n tensorflow-%{version} %sanitize_dir pwd -%patch0 -p 1 %patch1 -p 1 +%patch2 -p 1 +%patch3 -p 1 +%patch4 -p 1 echo $MPI_DIR +%if %{is_lite} +mkdir tensorflow/lite/tools/make/downloads/ +pushd tensorflow/lite/tools/make/downloads/ +# eigen, gemmlowp and abseil_cpp +cp %{SOURCE26} %{SOURCE17} 
%{SOURCE19} . +mkdir tmp +tar xzf eigen.tar.gz -C tmp && mv tmp/* eigen +unzip gemmlowp.zip -d tmp && mv tmp/* gemmlowp +tar xzf %{SOURCE100} -C tmp && mv tmp/* fgoogletest +tar xzf abseil-cpp.tar.gz -C tmp && mv tmp/* absl +unzip %{SOURCE101} -d neon_2_sse +tar xzf %{SOURCE102} -C tmp && mv tmp/* farmhash +# We use installed flatbuffers +# tar xzf %{SOURCE103} -C tmp && mv tmp/* flatbuffers +tar xzf %{SOURCE104} -C tmp && mv tmp/* fft2d +# sed fixes from tensorflow/lite/tools/make/download_dependencies.sh +sed -i -e 's#static uint32x4_t p4ui_CONJ_XOR = vld1q_u32( conj_XOR_DATA );#static uint32x4_t p4ui_CONJ_XOR; // = vld1q_u32( conj_XOR_DATA ); - Removed by script#' \ + "./eigen/Eigen/src/Core/arch/NEON/Complex.h" +sed -i -e 's#static uint32x2_t p2ui_CONJ_XOR = vld1_u32( conj_XOR_DATA );#static uint32x2_t p2ui_CONJ_XOR;// = vld1_u32( conj_XOR_DATA ); - Removed by scripts#' \ + "./eigen/Eigen/src/Core/arch/NEON/Complex.h" +sed -i -e 's#static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA );#static uint64x2_t p2ul_CONJ_XOR;// = vld1q_u64( p2ul_conj_XOR_DATA ); - Removed by script#' \ + "./eigen/Eigen/src/Core/arch/NEON/Complex.h" +find -name fixedpoint.h +popd +%endif + %build -%limit_build -m 1600 +%limit_build -m 4000 + +%if %{is_lite} +make %{?_smp_mflags} -f tensorflow/lite/tools/make/Makefile \ + $(pwd)/tensorflow/lite/tools/make/gen/linux_$(uname -m)/lib/libtensorflow-lite.a \ + $(pwd)/tensorflow/lite/tools/make/gen/linux_$(uname -m)/bin/minimal +# Build of benchmark-lib.a is broken +%else %if %{with hpc} %hpc_setup module load gnu %if %{with mpi} module load %mpi_flavor +export MPI_HOME=${MPI_HOME:-$MPI_DIR} %endif #mpi %endif #hpc export TEST_TMPDIR=%{bazeldir} -export PYTHON_LIB_PATH=/usr/lib64/python3.6/site-packages +export PYTHON_LIB_PATH=%{python3_sitearch} export PYTHON_BIN_PATH=/usr/bin/python3 export CC_OPT_FLAGS=-O2 export TF_NEED_JEMALLOC=0 export TF_NEED_GCP=0 -export TF_NEED_HDFS=0 -export TF_NEED_S3=0 +export TF_NEED_HDFS=1 +export TF_NEED_S3=1 export TF_ENABLE_XLA=0 export TF_NEED_VERBS=0 export TF_NEED_OPENCL=0 -export TF_SYSTEM_LIBS="nasm,jpeg,png_archive,org_sqlite,gif_archive,six_archive,astor_archive,termcolor_archive,pcre,swig,curl,lmdb,zlib_archive,snappy,cython,jemalloc" +export TF_NEED_ROCM=0 +export TF_SYSTEM_LIBS="nasm,jpeg,png_archive,org_sqlite,gif_archive,six_archive,astor_archive,termcolor_archive,pcre,swig,curl,lmdb,zlib_archive,snappy,cython" #export TF_SYSTEM_LIBS="com_googlesource_code_re2,nasm,jpeg,png_archive,org_sqlite,gif_archive,six_archive,astor_archive,termcolor_archive,pcre,swig,curl,grpc,lmdb,zlib_archive,snappy,cython,jemalloc" %if %{with cuda} export TF_NEED_CUDA=1 @@ -430,8 +538,21 @@ %{?copts} --jobs %{?jobs} \ //tensorflow/tools/pip_package:build_pip_package bazel-bin/tensorflow/tools/pip_package/build_pip_package %{_topdir}/%{name}-%{version} +bazel build -c opt //tensorflow:libtensorflow.so +bazel build -c opt //tensorflow:libtensorflow_cc.so +%endif %install + +%if %{is_lite} +pushd tensorflow/lite/tools/make/gen/linux_*/ +install -D bin/minimal %{buildroot}%{_bindir}/tflite_minimal +install -D lib/libtensorflow-lite.a %{buildroot}%{_libdir}/libtensorflow-lite.a +popd +install -D tensorflow/lite/schema/schema_generated.h %{buildroot}%{_includedir}/tensorflow/lite/schema/schema_generated.h +install -D tensorflow/lite/schema/schema.fbs %{buildroot}%{_includedir}/tensorflow/lite/schema/schema.fbs +%else + pip install %{_topdir}/%{name}-%{version}/*whl --root=%{buildroot}%{?hpc_prefix} \ --no-warn-script-location --no-index 
--no-deps # remove spurious executeable bits @@ -444,6 +565,10 @@ rm -r lib cd - %endif +# install libtensorflow*.so +install -D bazel-bin/tensorflow/libtensorflow.so %{buildroot}%{package_libdir}/libtensorflow.so +install -D bazel-bin/tensorflow/libtensorflow_cc.so %{buildroot}%{package_libdir}/libtensorflow_cc.so +install -D bazel-bin/tensorflow/libtensorflow_framework.so %{buildroot}%{package_libdir}/libtensorflow_framework.so # remove external libs %fdupes -s %{buildroot}%{?hpc_prefix} find %{buildroot} -name \*.h -type f -exec chmod 644 {} + @@ -492,6 +617,21 @@ EOF %endif +# %%{is_lite} +%endif + +%post -n %{package_name}-devel -p /sbin/ldconfig +%postun -n %{package_name}-devel -p /sbin/ldconfig + +# Lite version is very different so package it separetly +%if %{is_lite} +%files +%{package_bindir}/* +%files -n %{package_name}-devel +%{package_libdir}/libtensorflow-lite.a +%dir %{_includedir}/tensorflow/lite/schema/ +%{_includedir}/tensorflow/lite/schema/* +%else # not lite build %files %defattr(-,root,root,-) %{package_python_sitearch}/* @@ -504,7 +644,10 @@ %endif %files -n %{package_name}-devel %{package_python_sitelib}/tensorflow/include +%{package_libdir}/libtensorflow*.so %files -n %{package_name}-doc %{package_python_sitelib}/tensorflow/examples +%endif + %changelog ++++++ protobuf_v3.6.0.tar.gz -> 816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz ++++++ ++++ 821455 lines of diff (skipped) ++++++ _constraints ++++++ --- /var/tmp/diff_new_pack.3GoBTM/_old 2019-07-22 12:20:15.227666518 +0200 +++ /var/tmp/diff_new_pack.3GoBTM/_new 2019-07-22 12:20:15.231666517 +0200 @@ -1,7 +1,7 @@ <constraints> <hardware> <memory> - <size unit="M">8192</size> + <size unit="G">10</size> </memory> <disk> <size unit="G">10</size> ++++++ _multibuild ++++++ --- /var/tmp/diff_new_pack.3GoBTM/_old 2019-07-22 12:20:15.247666513 +0200 +++ /var/tmp/diff_new_pack.3GoBTM/_new 2019-07-22 12:20:15.255666511 +0200 @@ -1,5 +1,6 @@ <multibuild> <package>standard</package> + <package>lite</package> <package>hpc</package> <package>hpc-openmpi2</package> <package>hpc-mvapich2</package> ++++++ abseil-cpp.tar.gz ++++++ ++++ 65018 lines of diff (skipped) ++++++ arm_neon_2_x86_sse.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/ARM_NEON_2_x86_SSE-0f77d9d182265259b135dad949230ecbf1a2633d/NEON_2_SSE.h new/ARM_NEON_2_x86_SSE-1200fe90bb174a6224a525ee60148671a786a71f/NEON_2_SSE.h --- old/ARM_NEON_2_x86_SSE-0f77d9d182265259b135dad949230ecbf1a2633d/NEON_2_SSE.h 2017-05-30 09:44:55.000000000 +0200 +++ new/ARM_NEON_2_x86_SSE-1200fe90bb174a6224a525ee60148671a786a71f/NEON_2_SSE.h 2018-04-04 09:24:16.000000000 +0200 @@ -1,6 +1,6 @@ //created by Victoria Zhislina, the Senior Application Engineer, Intel Corporation, [email protected] -//*** Copyright (C) 2012-2016 Intel Corporation. All rights reserved. +//*** Copyright (C) 2012-2017 Intel Corporation. All rights reserved. //IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. @@ -36,21 +36,21 @@ //performance overhead and the necessity to use the EMMS instruction (_mm_empty())for mmx-x87 floating point switching //***************************************************************************************** -//!!!!!!!!!!!!!! To use this file just include it in your project that uses ARM NEON intinsics instead of "arm_neon.h" and complile it as usual -//!!!!!!!!!!!!!! but please pay attention at #define USE_SSE4 below - you might need to define it manualy for newest Intel Atom platforms for greater performance. 
+//!!!!!!!!!!!!!! To use this file just include it in your project that uses ARM NEON intinsics instead of "arm_neon.h" and compile it as usual +//!!!!!!!!!!!!!! but please pay attention at #define USE_SSE4 below - you might need to define it manualy for newest Intel Atom or any Intel Core platforms for greater performance. #ifndef NEON2SSE_H #define NEON2SSE_H /*********************************************************************************************************************/ //!!!!!!!!!!!!!! +//if USE_SSE4 is defined, some functions use SSE4 instructions instead of earlier SSE versions, when undefined - SIMD up to SSSE3 are used +//For older devices without SSE4 support it should be undefined, for newer devices - defined, probably manualy if your compiler doesn't set __SSE4_2__ predefine #ifndef USE_SSE4 #if defined(__SSE4_2__) #define USE_SSE4 #endif #endif -//if USE_SSE4 is defined, some functions use SSE4 instructions instead of earlier SSE versions, when undefined - SIMD up to SSSE3 are used -//For older devices without SSE4 support it should be undefined, for newer devices - defined, probably manualy if your compiler doesn't set __SSE4_2__ predefine /*********************************************************************************************************************/ #include <xmmintrin.h> //SSE @@ -62,6 +62,7 @@ #include <nmmintrin.h> //SSE4.2 #endif +#include <math.h> //*************** functions and data attributes, compiler dependent ********************************* //*********************************************************************************** @@ -150,6 +151,9 @@ typedef __m128 float16x4_t; //not supported by IA, for compartibility typedef __m128 float16x8_t; //not supported by IA, for compartibility +typedef __m64_128 float64x1_t; +typedef __m128d float64x2_t; + typedef __m128i int8x16_t; typedef __m128i int16x8_t; typedef __m128i int32x4_t; @@ -174,6 +178,9 @@ typedef float __fp16; #endif +typedef double float64_t; + + typedef uint8_t poly8_t; typedef uint16_t poly16_t; @@ -861,6 +868,9 @@ uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.U16 q0,q0,q0 uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0 float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0 + +float64x2_t vmaxq_f64(float64x2_t a, float64x2_t b); // VMAX.F64 q0,q0,q0 + //vmin -> Vr[i] := (Va[i] >= Vb[i]) ? 
Vb[i] : Va[i] int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0 int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0 @@ -876,6 +886,9 @@ uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.U16 q0,q0,q0 uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0 float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0 + +float64x2_t vminq_f64(float64x2_t a, float64x2_t b); // VMIN.F64 q0,q0,q0 + //Pairwise addition //Pairwise add int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0 @@ -1225,6 +1238,9 @@ float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0] poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0] poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0] + +float64x2_t vld1q_f64(__transfersize(4) float64_t const * ptr); // VLD1.64 {d0, d1}, [r0] + //Load a single lane from memory uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0] uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] @@ -1755,6 +1771,7 @@ uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32 int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32 uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32 +int32x4_t vcvtnq_s32_f32(float32x4_t a); // VCVTN.S32.F32 q0, q0 //Convert to float float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0 float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0 @@ -2003,6 +2020,10 @@ int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0 int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0 float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0 + +int64x2_t vabsq_s64(int64x2_t a); // VABS.S64 q0,q0 +float64x2_t vabsq_f64(float64x2_t a); // VABS.F64 q0,q0 + //Saturating absolute: Vd[i] = sat(|Va[i]|) int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0 int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0 @@ -2246,16 +2267,26 @@ poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0 poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0 +float32x4_t vrndnq_f32(float32x4_t a); // VRND.F32 q0,q0 + +float64x2_t vrndnq_f64(float64x2_t a); // VRND.F64 q0,q0 + +//Sqrt +float32x4_t vsqrtq_f32(float32x4_t a); // VSQRT.F32 q0,q0 + +float64x2_t vsqrtq_f64(float64x2_t a); // VSQRT.F64 q0,q0 + + //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ // the following macros solve the problem of the "immediate parameters requirement" for some x86 intrinsics. 
// we need it to compile the code unless the "Intrinsic parameter must be an immediate value" error is our goal // -#if ( ((defined(_MSC_VER)|| defined (__INTEL_COMPILER)) && defined DEBUG ) || defined(__GNUC__) && !defined(__llvm__) ) +#if ( defined (__INTEL_COMPILER) || defined (__GNUC__) && !defined(__llvm__) ) #define _MM_ALIGNR_EPI8 _mm_alignr_epi8 - #define _MM_EXTRACT_EPI16 _mm_extract_epi16 +#define _MM_EXTRACT_EPI16 (int16_t) _mm_extract_epi16 #define _MM_INSERT_EPI16 _mm_insert_epi16 #ifdef USE_SSE4 #define _MM_EXTRACT_EPI8 _mm_extract_epi8 @@ -2328,7 +2359,7 @@ _NEON2SSE_SWITCH8(_mm_insert_epi16, vec, LANE, _NEON2SSE_COMMA p) } - _NEON2SSE_INLINE int _MM_EXTRACT_EPI16(__m128i vec, const int LANE) + _NEON2SSE_INLINE int16_t _MM_EXTRACT_EPI16(__m128i vec, const int LANE) { _NEON2SSE_SWITCH8(_mm_extract_epi16, vec, LANE,) } @@ -3117,7 +3148,7 @@ { //no signed average in x86 SIMD, go to unsigned __m128i c128, au, bu, sum; - c128 = _mm_set1_epi8(0x80); //-128 + c128 = _mm_set1_epi8((int8_t)0x80); //-128 au = _mm_sub_epi8(a, c128); //add 128 bu = _mm_sub_epi8(b, c128); //add 128 sum = _mm_avg_epu8(au, bu); @@ -3129,7 +3160,7 @@ { //no signed average in x86 SIMD, go to unsigned __m128i cx8000, au, bu, sum; - cx8000 = _mm_set1_epi16(0x8000); // - 32768 + cx8000 = _mm_set1_epi16((int16_t)0x8000); // - 32768 au = _mm_sub_epi16(a, cx8000); //add 32768 bu = _mm_sub_epi16(b, cx8000); //add 32768 sum = _mm_avg_epu16(au, bu); @@ -4747,7 +4778,7 @@ { // //need to deal with the possibility of internal overflow __m128i c128, au,bu; - c128 = _mm_set1_epi8 (128); + c128 = _mm_set1_epi8((int8_t)128); au = _mm_add_epi8( a, c128); bu = _mm_add_epi8( b, c128); return vhsubq_u8(au,bu); @@ -4758,7 +4789,7 @@ { //need to deal with the possibility of internal overflow __m128i c8000, au,bu; - c8000 = _mm_set1_epi16(0x8000); + c8000 = _mm_set1_epi16((int16_t)0x8000); au = _mm_add_epi16( a, c8000); bu = _mm_add_epi16( b, c8000); return vhsubq_u16(au,bu); @@ -5192,7 +5223,7 @@ return _mm_cmpeq_epi16(cmp, a); //a>=b #else __m128i c8000, as, bs, m1, m2; - c8000 = _mm_set1_epi16 (0x8000); + c8000 = _mm_set1_epi16 ((int16_t)0x8000); as = _mm_sub_epi16(a,c8000); bs = _mm_sub_epi16(b,c8000); m1 = _mm_cmpgt_epi16(as, bs); @@ -5428,7 +5459,7 @@ { //no unsigned chars comparison, only signed available,so need the trick __m128i c128, as, bs; - c128 = _mm_set1_epi8 (128); + c128 = _mm_set1_epi8 ((int8_t)128); as = _mm_sub_epi8(a,c128); bs = _mm_sub_epi8(b,c128); return _mm_cmpgt_epi8 (as, bs); @@ -5439,7 +5470,7 @@ { //no unsigned short comparison, only signed available,so need the trick __m128i c8000, as, bs; - c8000 = _mm_set1_epi16 (0x8000); + c8000 = _mm_set1_epi16 ((int16_t)0x8000); as = _mm_sub_epi16(a,c8000); bs = _mm_sub_epi16(b,c8000); return _mm_cmpgt_epi16 ( as, bs); @@ -6137,6 +6168,11 @@ float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0 #define vmaxq_f32 _mm_max_ps + +float64x2_t vmaxq_f64(float64x2_t a, float64x2_t b); // VMAX.F64 q0,q0,q0 +#define vmaxq_f64 _mm_max_pd + + //*************** Minimum: vmin -> Vr[i] := (Va[i] >= Vb[i]) ? 
Vb[i] : Va[i] ******************************** //*********************************************************************************************************** int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0 @@ -6221,6 +6257,11 @@ float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0 #define vminq_f32 _mm_min_ps + +float64x2_t vminq_f64(float64x2_t a, float64x2_t b); // VMIN.F64 q0,q0,q0 +#define vminq_f64 _mm_min_pd + + //************* Pairwise addition operations. ************************************** //************************************************************************************ //Pairwise add - adds adjacent pairs of elements of two vectors, and places the results in the destination vector @@ -6283,7 +6324,7 @@ uint16x4_t res64; __m128i c32767, cfffe, as, bs, res; c32767 = _mm_set1_epi16 (32767); - cfffe = _mm_set1_epi16 (0xfffe); + cfffe = _mm_set1_epi16 ((int16_t)0xfffe); as = _mm_sub_epi16 (_pM128i(a), c32767); bs = _mm_sub_epi16 (_pM128i(b), c32767); res = _mm_hadd_epi16 (as, bs); @@ -8355,7 +8396,7 @@ // manual saturation solution looks more optimal than 32 bits conversion one __m128i cb, c8000, a_signed, saturation_mask, shift_res; cb = _mm_set1_epi16((1 << (16 - b)) - 1 - 0x8000 ); - c8000 = _mm_set1_epi16 (0x8000); + c8000 = _mm_set1_epi16 ((int16_t)0x8000); //no unsigned shorts comparison in SSE, only signed available, so need the trick a_signed = _mm_sub_epi16(a, c8000); //go to signed saturation_mask = _mm_cmpgt_epi16 (a_signed, cb); @@ -9196,7 +9237,7 @@ // it loads a 32-byte block aligned on a 16-byte boundary and extracts the 16 bytes corresponding to the unaligned access //If the ptr is aligned then could use __m128i _mm_load_si128 ((__m128i*) ptr) instead; #define LOAD_SI128(ptr) \ - ( ((unsigned long)(ptr) & 15) == 0 ) ? _mm_load_si128((__m128i*)(ptr)) : _mm_loadu_si128((__m128i*)(ptr)) + ( ((uintptr_t)(ptr) & 15) == 0 ) ? _mm_load_si128((__m128i*)(ptr)) : _mm_loadu_si128((__m128i*)(ptr)) uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0] #define vld1q_u8 LOAD_SI128 @@ -9233,7 +9274,7 @@ float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0] _NEON2SSE_INLINE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr) { - if( (((unsigned long)(ptr)) & 15 ) == 0 ) //16 bits aligned + if( (((uintptr_t)(ptr)) & 15 ) == 0 ) //16 bits aligned return _mm_load_ps(ptr); else return _mm_loadu_ps(ptr); @@ -9288,6 +9329,17 @@ poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0] #define vld1_p16 vld1_u16 + +float64x2_t vld1q_f64(__transfersize(4) float64_t const * ptr); // VLD1.64 {d0, d1}, [r0] +_NEON2SSE_INLINE float64x2_t vld1q_f64(__transfersize(4) float64_t const * ptr) +{ + if ((((uintptr_t)(ptr)) & 15) == 0) //16 bits aligned + return _mm_load_pd(ptr); + else + return _mm_loadu_pd(ptr); +} + + //*********************************************************************************************************** //******* Lane load functions - insert the data at vector's given position (lane) ************************* //*********************************************************************************************************** @@ -9522,7 +9574,7 @@ // If ptr is 16bit aligned and you need to store data without cache pollution then use void _mm_stream_si128 ((__m128i*)ptr, val); //here we assume the case of NOT 16bit aligned ptr possible. 
If it is aligned we could to use _mm_store_si128 like shown in the following macro #define STORE_SI128(ptr, val) \ - (((unsigned long)(ptr) & 15) == 0 ) ? _mm_store_si128 ((__m128i*)(ptr), val) : _mm_storeu_si128 ((__m128i*)(ptr), val); + (((uintptr_t)(ptr) & 15) == 0 ) ? _mm_store_si128 ((__m128i*)(ptr), val) : _mm_storeu_si128 ((__m128i*)(ptr), val); void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0] #define vst1q_u8 STORE_SI128 @@ -9554,7 +9606,7 @@ void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0] _NEON2SSE_INLINE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val) { - if( ((unsigned long)(ptr) & 15) == 0 ) //16 bits aligned + if( ((uintptr_t)(ptr) & 15) == 0 ) //16 bits aligned _mm_store_ps (ptr, val); else _mm_storeu_ps (ptr, val); @@ -9639,22 +9691,22 @@ //***********Store a lane of a vector into memory (extract given lane) ********************* //****************************************************************************************** void vst1q_lane_u8(__transfersize(1) uint8_t * ptr, uint8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0] -#define vst1q_lane_u8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane) +#define vst1q_lane_u8(ptr, val, lane) *(ptr) = (uint8_t) _MM_EXTRACT_EPI8 (val, lane) void vst1q_lane_u16(__transfersize(1) uint16_t * ptr, uint16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0] -#define vst1q_lane_u16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane) +#define vst1q_lane_u16(ptr, val, lane) *(ptr) = (uint16_t) _MM_EXTRACT_EPI16 (val, lane) void vst1q_lane_u32(__transfersize(1) uint32_t * ptr, uint32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0] -#define vst1q_lane_u32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane) +#define vst1q_lane_u32(ptr, val, lane) *(ptr) = (uint32_t) _MM_EXTRACT_EPI32 (val, lane) void vst1q_lane_u64(__transfersize(1) uint64_t * ptr, uint64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0] -#define vst1q_lane_u64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane) +#define vst1q_lane_u64(ptr, val, lane) *(ptr) = (uint64_t) _MM_EXTRACT_EPI64 (val, lane) void vst1q_lane_s8(__transfersize(1) int8_t * ptr, int8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0] -#define vst1q_lane_s8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane) +#define vst1q_lane_s8(ptr, val, lane) *(ptr) = (int8_t) _MM_EXTRACT_EPI8 (val, lane) void vst1q_lane_s16(__transfersize(1) int16_t * ptr, int16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0] -#define vst1q_lane_s16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane) +#define vst1q_lane_s16(ptr, val, lane) *(ptr) = (int16_t) _MM_EXTRACT_EPI16 (val, lane) void vst1q_lane_s32(__transfersize(1) int32_t * ptr, int32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0] #define vst1q_lane_s32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane) @@ -11881,22 +11933,22 @@ #define vget_lane_f32(vec, lane) vec.m64_f32[lane] uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0] -#define vgetq_lane_u8 _MM_EXTRACT_EPI8 +#define vgetq_lane_u8 (uint8_t) _MM_EXTRACT_EPI8 uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0] -#define vgetq_lane_u16 _MM_EXTRACT_EPI16 +#define vgetq_lane_u16 (uint16_t) _MM_EXTRACT_EPI16 uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] -#define vgetq_lane_u32 
_MM_EXTRACT_EPI32 +#define vgetq_lane_u32 (uint32_t) _MM_EXTRACT_EPI32 int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0] -#define vgetq_lane_s8 vgetq_lane_u8 +#define vgetq_lane_s8 _MM_EXTRACT_EPI8 int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0] -#define vgetq_lane_s16 vgetq_lane_u16 +#define vgetq_lane_s16 _MM_EXTRACT_EPI16 int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] -#define vgetq_lane_s32 vgetq_lane_u32 +#define vgetq_lane_s32 _MM_EXTRACT_EPI32 poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0] #define vgetq_lane_p8 vgetq_lane_u8 @@ -11920,10 +11972,10 @@ int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0 -#define vgetq_lane_s64 (int64_t) vgetq_lane_u64 +#define vgetq_lane_s64 _MM_EXTRACT_EPI64 uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0 -#define vgetq_lane_u64 _MM_EXTRACT_EPI64 +#define vgetq_lane_u64 (uint64_t) _MM_EXTRACT_EPI64 // ***************** Set lanes within a vector ******************************************** // ************************************************************************************** @@ -12725,6 +12777,13 @@ return vcvtq_u32_f32(_mm_mul_ps(a,cconst128)); } + +int32x4_t vcvtnq_s32_f32(float32x4_t a); // VCVTN.S32.F32 q0, q0 +_NEON2SSE_INLINE int32x4_t vcvtnq_s32_f32(float32x4_t a) +{ + return _mm_cvtps_epi32(a); +} + //***************** Convert to float ************************* //************************************************************* float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0 @@ -14562,6 +14621,22 @@ return _mm_and_ps (a, *(__m128*)c7fffffff); } +#ifdef _NEON2SSE_64BIT +int64x2_t vabsq_s64(int64x2_t a); // VABS.S64 q0,q0 +_NEON2SSE_INLINE int64x2_t vabsq_s64(int64x2_t a) // VABS.S64 q0,q0 +{ + __m128i sign = _mm_srai_epi32 (_mm_shuffle_epi32 (a, 0xf5), 31); + return _mm_sub_epi64 (_mm_xor_si128 (a, sign), sign); +} + +float64x2_t vabsq_f64(float64x2_t a); // VABS.F64 q0,q0 +_NEON2SSE_INLINE float64x2_t vabsq_f64(float64x2_t a) // VABS.F64 q0,q0 +{ + _NEON2SSE_ALIGN_16 int64_t mask[2] = {0x7fffffffffffffffLL, 0x7fffffffffffffffLL}; + return _mm_and_pd (a, *(__m128d*)mask); +} +#endif + //****** Saturating absolute: Vd[i] = sat(|Va[i]|) ********************* //********************************************************************** //For signed-integer data types, the absolute value of the most negative value is not representable by the data type, saturation takes place @@ -14596,7 +14671,7 @@ _NEON2SSE_INLINE int8x16_t vqabsq_s8(int8x16_t a) // VQABS.S8 q0,q0 { __m128i c_128, abs, abs_cmp; - c_128 = _mm_set1_epi8 (0x80); //-128 + c_128 = _mm_set1_epi8 ((int8_t)0x80); //-128 abs = _mm_abs_epi8 (a); abs_cmp = _mm_cmpeq_epi8 (abs, c_128); return _mm_xor_si128 (abs, abs_cmp); @@ -14606,7 +14681,7 @@ _NEON2SSE_INLINE int16x8_t vqabsq_s16(int16x8_t a) // VQABS.S16 q0,q0 { __m128i c_32768, abs, abs_cmp; - c_32768 = _mm_set1_epi16 (0x8000); //-32768 + c_32768 = _mm_set1_epi16 ((int16_t)0x8000); //-32768 abs = _mm_abs_epi16 (a); abs_cmp = _mm_cmpeq_epi16 (abs, c_32768); return _mm_xor_si128 (abs, abs_cmp); @@ -14919,7 +14994,7 @@ { __m128i cff, c80, c1, a_mask, a_neg, a_pos, a_comb; cff = _mm_cmpeq_epi8 (a,a); //0xff - c80 = _mm_set1_epi8(0x80); + c80 = _mm_set1_epi8((int8_t)0x80); c1 = _mm_set1_epi8(1); a_mask = _mm_and_si128(a, c80); a_mask = _mm_cmpeq_epi8(a_mask, c80); //0xff if negative input and 0 if positive 
@@ -16589,4 +16664,46 @@ uint32x4_t vreinterpretq_u32_p8 (poly8x16_t t); #define vreinterpretq_u32_p8 +//************* Round ****************** +float32x4_t vrndnq_f32(float32x4_t a); +#ifdef USE_SSE4 +#define vrndnq_f32(a) _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) +#else +_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( float32x4_t vrndnq_f32(float32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) +{ + int i; + _NEON2SSE_ALIGN_16 float32_t res[4]; + _mm_store_ps(res, a); + for(i = 0; i<4; i++) { + res[i] = nearbyintf(res[i]); + } + return _mm_load_ps(res); +} +#endif + + +float64x2_t vrndnq_f64(float64x2_t a); +#ifdef USE_SSE4 +#define vrndnq_f64(a) _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) +#else +_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float64x2_t vrndnq_f64(float64x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) +{ + _NEON2SSE_ALIGN_16 float64_t res[2]; + _mm_store_pd(res, a); + res[0] = nearbyintf(res[0]); + res[1] = nearbyintf(res[1]); + return _mm_load_pd(res); +} +#endif + + + +//************* Sqrt ****************** +float32x4_t vsqrtq_f32(float32x4_t a); +#define vsqrtq_f32 _mm_sqrt_ps + +float64x2_t vsqrtq_f64(float64x2_t a); +#define vsqrtq_f64 _mm_sqrt_pd + + #endif /* NEON2SSE_H */ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/ARM_NEON_2_x86_SSE-0f77d9d182265259b135dad949230ecbf1a2633d/ReadMe.md new/ARM_NEON_2_x86_SSE-1200fe90bb174a6224a525ee60148671a786a71f/ReadMe.md --- old/ARM_NEON_2_x86_SSE-0f77d9d182265259b135dad949230ecbf1a2633d/ReadMe.md 2017-05-30 09:44:55.000000000 +0200 +++ new/ARM_NEON_2_x86_SSE-1200fe90bb174a6224a525ee60148671a786a71f/ReadMe.md 2018-04-04 09:24:16.000000000 +0200 @@ -6,4 +6,6 @@ To take advantage of this file just include it in your project that uses ARM NEON intinsics instead of "arm_neon.h", compile it as usual and enjoy the result. +For significant performance improvement in some cases you might need to define USE_SSE4 in your project settings. Otherwise SIMD up to SSSE3 to be used. + For more information and license please read the NEON_2_SSE.h content. ++++++ boring_ssl.tar.gz ++++++ /work/SRC/openSUSE:Factory/tensorflow/boring_ssl.tar.gz /work/SRC/openSUSE:Factory/.tensorflow.new.4126/boring_ssl.tar.gz differ: char 13, line 1 ++++++ eigen.tar.gz ++++++ ++++ 72880 lines of diff (skipped) ++++++ fix_mvapich_mpi_bzl.patch ++++++ diff --git a/third_party/mpi/mpi.bzl b/third_party/mpi/mpi.bzl index 3a48335..1cd43f8 100644 --- a/third_party/mpi/mpi.bzl +++ b/third_party/mpi/mpi.bzl @@ -2,7 +2,7 @@ #based on the configuration options return one or the other def mpi_hdr(): - MPI_LIB_IS_OPENMPI = True + MPI_LIB_IS_OPENMPI=True hdrs = [] if MPI_LIB_IS_OPENMPI: hdrs = ["mpi.h", "mpi_portable_platform.h"] #When using OpenMPI ++++++ license.rst.txt ++++++ ++++ 903 lines (skipped) ++++++ protobuf_v3.6.0.tar.gz -> protobuf_v3.6.1.2.tar.gz ++++++ ++++ 4997 lines of diff (skipped) ++++++ re2-2018-04-01.tar.gz -> re2-2018-10-01.tar.gz ++++++ ++++ 2612 lines of diff (skipped) ++++++ support-new-bazel.patch ++++++ --- a/configure.py.orig 2019-03-12 21:43:27.333211414 +0100 +++ a/configure.py 2019-03-12 21:43:50.225119652 +0100 @@ -1554,7 +1554,7 @@ # environment variables. 
environ_cp = dict(os.environ) - check_bazel_version('0.19.0', '0.21.0') + check_bazel_version('0.19.0', '0.22.0') reset_tf_configure_bazelrc() ++++++ tensorflow-1.10.0.tar.gz -> tensorflow-1.13.1.tar.gz ++++++ /work/SRC/openSUSE:Factory/tensorflow/tensorflow-1.10.0.tar.gz /work/SRC/openSUSE:Factory/.tensorflow.new.4126/tensorflow-1.13.1.tar.gz differ: char 12, line 1 ++++++ tensorflow-fix_lite.patch ++++++ --- tensorflow-1.13.1/tensorflow/lite/tools/make/Makefile.orig 2019-06-04 13:13:08.329080620 +0200 +++ tensorflow-1.13.1/tensorflow/lite/tools/make/Makefile 2019-06-04 16:05:13.325963284 +0200 @@ -38,11 +38,12 @@ INCLUDES := \ -I$(OBJDIR) # This is at the end so any globally-installed frameworks like protobuf don't # override local versions in the source tree. -INCLUDES += -I/usr/local/include +INCLUDES += -I/usr/include # These are the default libraries needed, but they can be added to or # overridden by the platform-specific settings in target makefiles. LIBS := \ +-lflatbuffers \ -lstdc++ \ -lpthread \ -lm \ ++++++ tensorflow-make_aws_sdk_work_on_aarch64.patch ++++++ >From 3f88ddb71ba49d343a5db1304c296e78ddeb2575 Mon Sep 17 00:00:00 2001 From: Koan-Sin Tan <[email protected]> Date: Wed, 10 Oct 2018 02:34:02 +0000 Subject: [PATCH] [aarch64] make aws sdk work on aarch64 `bazel build //tensorflow/tools/pip_package:build_pip_package' requires AWS SDK by default. but platform part was not built on aarch64 --- tensorflow/BUILD | 6 ++++++ third_party/aws/BUILD.bazel | 3 +++ 2 files changed, 9 insertions(+) diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 9b62a504525d..8486922e00b0 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -163,6 +163,12 @@ config_setting( visibility = ["//visibility:public"], ) +config_setting( + name = "linux_aarch64", + values = {"cpu": "aarch64"}, + visibility = ["//visibility:public"], +) + config_setting( name = "linux_x86_64", values = {"cpu": "k8"}, diff --git a/third_party/aws/BUILD.bazel b/third_party/aws/BUILD.bazel index 5426f79e4650..66baa8fdf3b7 100644 --- a/third_party/aws/BUILD.bazel +++ b/third_party/aws/BUILD.bazel @@ -12,6 +12,9 @@ load("@org_tensorflow//third_party:common.bzl", "template_rule") cc_library( name = "aws", srcs = select({ + "@org_tensorflow//tensorflow:linux_aarch64": glob([ + "aws-cpp-sdk-core/source/platform/linux-shared/*.cpp", + ]), "@org_tensorflow//tensorflow:linux_x86_64": glob([ "aws-cpp-sdk-core/source/platform/linux-shared/*.cpp", ]),
