This is an automated email from the ASF dual-hosted git repository.
samskalicky pushed a commit to branch v1.x
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/v1.x by this push:
new d2e6452 [1.x] Backport of intgemm #17559 (#19099)
d2e6452 is described below
commit d2e6452042b1bad5ae9b18f90863cbf6f13aab29
Author: kpuatamazon <[email protected]>
AuthorDate: Wed Sep 16 17:41:35 2020 +0100
[1.x] Backport of intgemm #17559 (#19099)
* cherry-pick intgemm from master, fix build
* Fix test to conform to 1.x
* Makefile supporting intgemm compilation
* Stricter dependencies on git checkout of intgemm
* Operators depend on mkldnn
* Don't compile intgemm with gcc older than 5
* Fix intgemm test for windows on 1.x by not using pytest
* Update intgemm to use template arguments for integer immediates
* Try to fix clang3.6
* Ban gcc < 5 in cmake
* Update intgemm with gcc 5.5 debug workaround
---
CMakeLists.txt | 30 ++
LICENSE | 2 +
Makefile | 66 +++++
include/mxnet/base.h | 2 +-
.../contrib/intgemm/intgemm_fully_connected_op.cc | 328 +++++++++++++++++++++
src/operator/contrib/intgemm/max_absolute_op.cc | 119 ++++++++
src/operator/contrib/intgemm/prepare_data_op.cc | 134 +++++++++
src/operator/contrib/intgemm/prepare_weight_op.cc | 180 +++++++++++
src/operator/contrib/intgemm/take_weight_op.cc | 146 +++++++++
src/storage/cpu_device_storage.h | 2 +-
tests/python/unittest/test_contrib_intgemm.py | 221 ++++++++++++++
11 files changed, 1228 insertions(+), 2 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8c7cfe1..ee4369a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,6 +47,11 @@ if(USE_MKL_IF_AVAILABLE AND (NOT APPLE) AND (NOT MSVC) AND
(CMAKE_HOST_SYSTEM_PR
else()
option(USE_MKLDNN "Build with MKL-DNN support" OFF)
endif()
+if ((CMAKE_SYSTEM_PROCESSOR STREQUAL x86_64) AND ((NOT
CMAKE_COMPILER_IS_GNUCC) OR (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL
5.0)))
+ option(USE_INTGEMM "Build with x86_64 intgemm library for low-precision
multiplication" ON)
+else()
+ option(USE_INTGEMM "Build with x86_64 intgemm library for low-precision
multiplication" OFF)
+endif()
if(NOT MSVC)
option(USE_OPERATOR_TUNING "Enable auto-tuning of operators" ON)
else()
@@ -306,6 +311,22 @@ if(USE_CPP_PACKAGE)
add_definitions(-DMXNET_USE_CPP_PACKAGE=1)
endif()
+if(USE_INTGEMM)
+ message(STATUS "Using intgemm")
+ include(FetchContent)
+ FetchContent_Declare(
+ intgemm
+ GIT_REPOSITORY https://github.com/kpu/intgemm.git
+ GIT_TAG 4172dcc209e6793dd920dec9cf9c9fc81605bd9d
+ )
+ FetchContent_GetProperties(intgemm)
+ if(NOT intgemm_POPULATED)
+ FetchContent_Populate(intgemm)
+ endif()
+ add_subdirectory(${intgemm_SOURCE_DIR} ${intgemm_BINARY_DIR}
EXCLUDE_FROM_ALL)
+ add_definitions(-DMXNET_USE_INTGEMM=1)
+endif()
+
# Allow Cuda compiles outside of src tree to find things in 'src' and 'include'
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)
@@ -497,6 +518,11 @@ endif()
FILE(GLOB_RECURSE SOURCE "src/*.cc" "src/*.h" "include/*.h")
FILE(GLOB_RECURSE CUDA "src/*.cu" "src/*.cuh")
+if(NOT USE_INTGEMM)
+ FILE(GLOB_RECURSE INTGEMM_OPERATOR_SOURCE
"src/operator/contrib/intgemm/*.cc" "src/operator/contrib/intgemm/*.h")
+ list(REMOVE_ITEM SOURCE ${INTGEMM_OPERATOR_SOURCE})
+endif()
+
# add nnvm to source
FILE(GLOB_RECURSE NNVMSOURCE
3rdparty/tvm/nnvm/src/c_api/*.cc
@@ -791,6 +817,10 @@ if(USE_MKLDNN)
${CMAKE_BINARY_DIR}/3rdparty/mkldnn/include/dnnl_version.h
${CMAKE_SOURCE_DIR}/include/mkldnn/)
endif()
+if(USE_INTGEMM)
+ target_link_libraries(mxnet_static PRIVATE intgemm)
+endif()
+
function(BuildTVMOP)
# scope the variables in BuildTVM.cmake to avoid conflict
include(cmake/BuildTVM.cmake)
diff --git a/LICENSE b/LICENSE
index 9aa20d1..4a8f8dd 100644
--- a/LICENSE
+++ b/LICENSE
@@ -309,6 +309,8 @@
Licensed MIT © Zeno Rocha
11. mx-theme - For details, see docs/python_docs/themes/mx-theme/LICENSE
Copyright (c) 2016 myyasuda
+ 12. intgemm - Refer to 3rdparty/intgemm/LICENSE
+ Copyright (c) 2017--2019 University of Edinburgh, Nikolay Bogoychev,
Mateusz Chudyk, Kenneth Heafield, and Microsoft Corporation
=======================================================================================
diff --git a/Makefile b/Makefile
index 4ee71c9..aa41207 100644
--- a/Makefile
+++ b/Makefile
@@ -86,6 +86,25 @@ ifeq ($(USE_MKLDNN), 1)
MKLDNNROOT = $(ROOTDIR)/3rdparty/mkldnn/build/install
endif
+ifndef USE_INTGEMM
+ifeq ($(UNAME_P), x86_64)
+ COMPILER := $(shell $(CXX) --version |head -n 1 |cut -d " " -f 1)
+ COMPILER_VERSION := $(shell $(CXX) -dumpversion |cut -d . -f 1)
+ ifeq ($(COMPILER), clang)
+ USE_INTGEMM=1
+ endif
+ ifeq ($(COMPILER), Apple)
+ USE_INTGEMM=1
+ endif
+ # If it's not clang and not Apple clang, it's probably gcc and we need
at least 5.
+ # gcc --version gives the name of the program it was called with, which
makes it hard to detect.
+ COMPILER_VERSION_GE_5 := $(shell expr $(COMPILER_VERSION) \>= 5)
+ ifeq ($(COMPILER_VERSION_GE_5), 1)
+ USE_INTGEMM=1
+ endif
+endif
+endif
+
include $(TPARTYDIR)/mshadow/make/mshadow.mk
include $(DMLC_CORE)/make/dmlc.mk
@@ -463,6 +482,46 @@ endif
all: lib/libmxnet.a lib/libmxnet.so $(BIN) extra-packages extension_libs
SRC = $(wildcard src/*/*/*/*.cc src/*/*/*.cc src/*/*.cc src/*.cc)
+
+ifeq ($(USE_INTGEMM), 1)
+ ifndef INTGEMM_PATH
+ INTGEMM_PATH = build/3rdparty/intgemm
+ endif
+ CFLAGS += -DMXNET_USE_INTGEMM=1
+ LIB_DEP += $(INTGEMM_PATH)/libintgemm.a
+
+# Download intgemm if it isn't already
+$(INTGEMM_PATH):
+ @mkdir -p $(INTGEMM_PATH)
+ rm -rf $(INTGEMM_PATH)
+ git clone https://github.com/kpu/intgemm $(INTGEMM_PATH)
+ cd $(INTGEMM_PATH) && git checkout -q
4172dcc209e6793dd920dec9cf9c9fc81605bd9d
+
+$(INTGEMM_PATH)/compile_test_avx512bw.cc: $(INTGEMM_PATH)
+ @
+$(INTGEMM_PATH)/compile_test_avx512vnni.cc: $(INTGEMM_PATH)
+ @
+$(INTGEMM_PATH)/intgemm/intgemm.cc: $(INTGEMM_PATH)
+ @
+
+# Compiler tests for AVX512BW and AVX512VNNI.
+$(INTGEMM_PATH)/intgemm/intgemm_config.h:
$(INTGEMM_PATH)/compile_test_avx512bw.cc
$(INTGEMM_PATH)/compile_test_avx512vnni.cc
+ echo '#pragma once' >$(INTGEMM_PATH)/intgemm/intgemm_config.h
+ $(CXX) $(CFLAGS) $(INTGEMM_PATH)/compile_test_avx512bw.cc 2>/dev/null
&& echo \#define INTGEMM_COMPILER_SUPPORTS_AVX512BW
>>$(INTGEMM_PATH)/intgemm/intgemm_config.h || echo Your compiler is missing
AVX512BW support
+ $(CXX) $(CFLAGS) $(INTGEMM_PATH)/compile_test_avx512vnni.cc 2>/dev/null
&& echo \#define INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
>>$(INTGEMM_PATH)/intgemm/intgemm_config.h || echo Your compiler is missing
AVX512VNNI support
+
+$(INTGEMM_PATH)/intgemm/intgemm.o: $(INTGEMM_PATH)/intgemm/intgemm_config.h
$(INTGEMM_PATH)/intgemm/intgemm.cc $(wildcard $(INTGEMM_PATH)/intgemm/*.h
$(INTGEMM_PATH)/intgemm/*/*.h)
+ $(CXX) $(CFLAGS) -I$(INTGEMM_PATH) -std=c++11 -c
$(INTGEMM_PATH)/intgemm/intgemm.cc -o $@
+
+$(INTGEMM_PATH)/libintgemm.a: $(INTGEMM_PATH)/intgemm/intgemm.o
+ @mkdir -p $(@D)
+ ar crv $@ $(filter %.o, $?)
+else
+ #If we're not using intgemm, remove the operators from src.
+ INTGEMM_OPS := $(wildcard src/operator/contrib/intgemm/*.cc)
+ SRC := $(filter-out $(INTGEMM_OPS),$(SRC))
+endif
+
OBJ = $(patsubst %.cc, build/%.o, $(SRC))
CUSRC = $(wildcard src/*/*/*/*.cu src/*/*/*.cu src/*/*.cu src/*.cu)
CUOBJ = $(patsubst %.cu, build/%_gpu.o, $(CUSRC))
@@ -560,6 +619,13 @@ endif
# For quick compile test, used smaller subset
ALLX_DEP= $(ALL_DEP)
+ifeq ($(USE_INTGEMM), 1)
+# Enforce a dependency on $(INTGEMM_PATH)/intgemm/intgemm_config.h which is a
generated header based on compiler support.
+build/src/operator/contrib/intgemm/%.o: src/operator/contrib/intgemm/%.cc
$(INTGEMM_PATH)/intgemm/intgemm_config.h | mkldnn
+ @mkdir -p $(@D)
+ $(CXX) -std=c++11 -c $(CFLAGS) -MMD -I$(INTGEMM_PATH) -Isrc/operator -c
$< -o $@
+endif
+
build/src/%.o: src/%.cc | mkldnn
@mkdir -p $(@D)
$(CXX) -std=c++11 -c $(CFLAGS) -MMD -c $< -o $@
diff --git a/include/mxnet/base.h b/include/mxnet/base.h
index 1980ca5..c2f4638 100644
--- a/include/mxnet/base.h
+++ b/include/mxnet/base.h
@@ -551,7 +551,7 @@ inline std::ostream& operator<<(std::ostream &out, const
Context &ctx) {
#define ADD_FILELINE "\n\nDefined in " __FILE__ ":L" STRINGIZE(__LINE__)
-#if MXNET_USE_MKLDNN == 1
+#if MXNET_USE_MKLDNN == 1 || MXNET_USE_INTGEMM == 1
constexpr size_t kMKLDNNAlign = 64;
#endif
diff --git a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc
b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc
new file mode 100644
index 0000000..216f5ce
--- /dev/null
+++ b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc
@@ -0,0 +1,328 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file intgemm_fully_connected_op.cc
+ * \brief Operator wrapping intgemm's Multiply routine
+ */
+
+#include <mxnet/operator_util.h>
+#include <vector>
+#include <cstdlib>
+#include "../../mshadow_op.h"
+#include "../../mxnet_op.h"
+#include "../../operator_common.h"
+#include "../../tensor/init_op.h"
+
+#include "intgemm/intgemm.h"
+
+namespace mxnet {
+namespace op {
+
+struct IntgemmFullyConnectedParam : public
dmlc::Parameter<IntgemmFullyConnectedParam> {
+ int out_type;
+ int num_hidden;
+ bool no_bias;
+ bool flatten;
+ DMLC_DECLARE_PARAMETER(IntgemmFullyConnectedParam) {
+    // This part is a copy of the FullyConnected parameters.
+ DMLC_DECLARE_FIELD(num_hidden).set_lower_bound(1)
+ .describe("Number of hidden nodes of the output.");
+ DMLC_DECLARE_FIELD(no_bias).set_default(false)
+ .describe("Whether to disable bias parameter.");
+ DMLC_DECLARE_FIELD(flatten).set_default(true)
+ .describe("Whether to collapse all but the first axis of the input data
tensor.");
+
+ DMLC_DECLARE_FIELD(out_type)
+ .add_enum("float32", mshadow::kFloat32)
+ .add_enum("int32", mshadow::kInt32)
+ .set_default(mshadow::kFloat32)
+ .describe("Output data type.");
+ }
+};
+DMLC_REGISTER_PARAMETER(IntgemmFullyConnectedParam);
+
+namespace {
+// Parse the above fields into indices for parameters.
+// The order is: data weight [scaling] [bias].
+struct ParameterIndices {
+ explicit ParameterIndices(const IntgemmFullyConnectedParam& param) :
+ data(0),
+ weight(1),
+ scaling(param.out_type == mshadow::kFloat32 ? 2 : kInvalid),
+ bias(param.no_bias ? kInvalid : (HaveScaling() ? 3 : 2)),
+ count(2U + HaveScaling() + HaveBias()) {}
+ bool HaveScaling() const { return scaling != kInvalid; }
+ bool HaveBias() const { return bias != kInvalid; }
+ const unsigned int data;
+ const unsigned int weight;
+ const unsigned int scaling;
+ const unsigned int bias;
+ const unsigned int count;
+ static const unsigned int kInvalid = std::numeric_limits<unsigned
int>::max();
+};
+template<class T> ParameterIndices Sanity(const nnvm::NodeAttrs& attrs,
+ T* in,
+ T* out) {
+ // 3-4 parameters: A, B, scaling, and optional bias
+ ParameterIndices ret(nnvm::get<IntgemmFullyConnectedParam>(attrs.parsed));
+ CHECK_EQ(in->size(), ret.count);
+ CHECK_EQ(out->size(), 1U);
+ return ret;
+}
+} // namespace
+
+inline bool IntgemmFullyConnectedOpShape(const nnvm::NodeAttrs& attrs,
+ mxnet::ShapeVector* in_shape,
+ mxnet::ShapeVector* out_shape) {
+ const ParameterIndices indices(Sanity(attrs, in_shape, out_shape));
+ const IntgemmFullyConnectedParam& param =
nnvm::get<IntgemmFullyConnectedParam>(attrs.parsed);
+ // This follows FullyConnectedShape except for scaling.
+ using namespace mshadow;
+ mxnet::TShape dshape = (*in_shape)[indices.data];
+ mxnet::TShape oshape = (*out_shape)[0];
+ // require data to be known
+ if (!mxnet::ndim_is_known(dshape)) return false;
+
+ index_t num_input;
+ if (!param.flatten) {
+ num_input = dshape[dshape.ndim()-1];
+ } else {
+ num_input = dshape.ProdShape(1, dshape.ndim());
+ }
+ SHAPE_ASSIGN_CHECK(*in_shape, indices.weight, Shape2(param.num_hidden,
num_input));
+ if (indices.HaveScaling()) {
+ SHAPE_ASSIGN_CHECK(*in_shape, indices.scaling, mxnet::TShape(1, 1));
+ }
+ if (indices.HaveBias()) {
+ if (!shape_assign(&(*in_shape)[indices.bias], Shape1(param.num_hidden)) &&
+ !shape_assign(&(*in_shape)[indices.bias], Shape2(param.num_hidden,
1))) {
+ LOG(FATAL) << "Unexpected shape for bias " << (*in_shape)[indices.bias];
+ }
+ }
+
+ if (!param.flatten) {
+ mxnet::TShape result_shape(dshape);
+ result_shape[dshape.ndim()-1] = param.num_hidden;
+ SHAPE_ASSIGN_CHECK(*out_shape, 0, result_shape);
+ } else {
+ SHAPE_ASSIGN_CHECK(*out_shape, 0, Shape2(dshape[0], param.num_hidden));
+ }
+ if (oshape.ndim() > 0) {
+ dshape[0] = oshape[0];
+ SHAPE_ASSIGN_CHECK(*in_shape, indices.data, dshape);
+ }
+ return true;
+}
+
+bool IntgemmFullyConnectedOpType(const nnvm::NodeAttrs& attrs,
+ std::vector<int>* in_attrs,
+ std::vector<int>* out_attrs) {
+ const ParameterIndices indices(Sanity(attrs, in_attrs, out_attrs));
+ const IntgemmFullyConnectedParam& param =
nnvm::get<IntgemmFullyConnectedParam>(attrs.parsed);
+
+ // Match the configuration for output.
+ TYPE_ASSIGN_CHECK(*out_attrs, 0, param.out_type);
+ if (indices.HaveBias()) {
+ // Bias has same type as output.
+ TYPE_ASSIGN_CHECK(*in_attrs, indices.bias, (*out_attrs)[0]);
+ TYPE_ASSIGN_CHECK(*out_attrs, 0, (*in_attrs)[indices.bias]);
+ }
+ // Scaling is float32.
+ if (indices.HaveScaling()) {
+ TYPE_ASSIGN_CHECK(*in_attrs, indices.scaling, mshadow::kFloat32);
+ }
+ // Users have to prepare B. It wasn't intended to be efficient.
+ TYPE_ASSIGN_CHECK(*in_attrs, indices.weight, mshadow::kInt8);
+ // A can be a float (in which case it is automatically quantized) or int8.
+ if (type_is_none((*in_attrs)[indices.data])) {
+ return false;
+ }
+ return ((*in_attrs)[indices.data] == mshadow::kInt8 ||
+ (*in_attrs)[indices.data] == mshadow::kFloat32);
+}
+
+void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<TBlob>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& outputs) {
+ const ParameterIndices indices(Sanity(attrs, &inputs, &outputs));
+ const IntgemmFullyConnectedParam& param =
nnvm::get<IntgemmFullyConnectedParam>(attrs.parsed);
+ CHECK_EQ(req.size(), 1U);
+ CHECK_EQ(req[0], kWriteTo) << "TODO: doing more than overwriting for
intgemm.";
+
+ const TBlob &A = inputs[indices.data], &B = inputs[indices.weight], &C =
outputs[0];
+
+ CHECK(A.type_flag_ == mshadow::kInt8 || A.type_flag_ == mshadow::kFloat32);
+ CHECK_EQ(B.type_flag_, mshadow::kInt8);
+ CHECK(C.type_flag_ == mshadow::kInt32 || C.type_flag_ == mshadow::kFloat32);
+ CHECK(A.CheckContiguous());
+ CHECK(B.CheckContiguous());
+ CHECK(C.CheckContiguous());
+ CHECK_GE(A.shape_.ndim(), 1);
+ CHECK_GE(B.shape_.ndim(), 2);
+ size_t A_rows = A.shape_.ProdShape(0, A.shape_.ndim() - 1);
+ size_t inner = A.shape_[A.shape_.ndim() - 1];
+ CHECK_EQ(B.shape_[B.shape_.ndim() - 1], inner);
+ size_t B_cols = B.shape_.ProdShape(0, B.shape_.ndim() - 1);
+
+ CHECK_EQ(C.shape_.Size(), A_rows * B_cols);
+
+ bool bias = !param.no_bias;
+ if (bias) {
+ CHECK_EQ(inputs[indices.bias].type_flag_, C.type_flag_);
+ CHECK_EQ(inputs[indices.bias].shape_.Size(), param.num_hidden);
+ }
+ CHECK_EQ(inner % ::intgemm::Int8::tile_info.b_rows, 0) <<
+ "intgemm requires the inner dimension be a multiple of " <<
::intgemm::Int8::tile_info.b_rows;
+ CHECK_EQ(B_cols % ::intgemm::Int8::tile_info.b_cols, 0) <<
+ "intgemm requires B have a multiple of " <<
::intgemm::Int8::tile_info.b_cols <<
+ " columns in the equation C = AB.";
+
+ float out_float_multiplier;
+ if (indices.HaveScaling()) {
+ out_float_multiplier = *inputs[indices.scaling].dptr<float>();
+ } else {
+ out_float_multiplier = 0.0; // Unused; stop compiler from complaining.
+ }
+
+ int8_t *A_quant;
+ mshadow::Tensor<cpu, 1, int8_t> A_quant_store;
+ if (A.type_flag_ == mshadow::kFloat32) {
+ const float *A_raw = A.dptr<float>();
+ // Quantize A for the user.
+ // Future: allow scale to be passed in? Should the induced scale be an
output?
+ float scale = 127.0 / ::intgemm::MaxAbsolute(A_raw, A_raw +
A.shape_.Size());
+ out_float_multiplier /= scale;
+ A_quant_store = ctx.requested[0].get_space_typed<cpu, 1, int8_t>(
+ mshadow::Shape1(A.shape_.Size()),
+ ctx.get_stream<cpu>());
+ A_quant = A_quant_store.dptr_;
+ ::intgemm::Int8::PrepareA(A_raw, A_quant, scale, A_rows, inner);
+ } else {
+ CHECK_EQ(A.type_flag_, mshadow::kInt8);
+ A_quant = A.dptr<int8_t>();
+ }
+ const int8_t *B_quant = B.dptr<int8_t>();
+ CHECK_EQ(reinterpret_cast<intptr_t>(A_quant) % 64, 0) <<
+ "Pointers should be aligned to a multiple of 64.";
+ CHECK_EQ(reinterpret_cast<intptr_t>(B_quant) % 64, 0) <<
+ "Pointers should be aligned to a multiple of 64.";
+ if (C.type_flag_ == mshadow::kFloat32) {
+ CHECK_EQ(reinterpret_cast<intptr_t>(C.dptr<float>()) % 64, 0) <<
+ "Pointers should be aligned to a multiple of 64.";
+ } else {
+ CHECK_EQ(reinterpret_cast<intptr_t>(C.dptr<int32_t>()) % 64, 0) <<
+ "Pointers should be aligned to a multiple of 64.";
+ }
+
+ if (bias) {
+ if (C.type_flag_ == mshadow::kFloat32) {
+ CHECK_EQ(reinterpret_cast<intptr_t>(inputs[indices.bias].dptr<float>())
% 64, 0) <<
+ "Pointers should be aligned to a multiple of 64.";
+ ::intgemm::callbacks::UnquantizeAndAddBiasAndWrite cb(
+ out_float_multiplier,
+ inputs[indices.bias].dptr<float>(),
+ C.dptr<float>());
+ ::intgemm::Int8::Multiply(A_quant, B_quant, A_rows, inner, B_cols, cb);
+ } else {
+ // int32
+
CHECK_EQ(reinterpret_cast<intptr_t>(inputs[indices.bias].dptr<int32_t>()) % 64,
0) <<
+ "Pointers should be aligned to a multiple of 64.";
+ ::intgemm::callbacks::AddBiasAndWrite cb(
+ inputs[indices.bias].dptr<int32_t>(),
+ C.dptr<int32_t>());
+ ::intgemm::Int8::Multiply(A_quant, B_quant, A_rows, inner, B_cols, cb);
+ }
+ } else {
+ if (C.type_flag_ == mshadow::kFloat32) {
+ ::intgemm::callbacks::UnquantizeAndWrite cb(out_float_multiplier,
C.dptr<float>());
+ ::intgemm::Int8::Multiply(A_quant, B_quant, A_rows, inner, B_cols, cb);
+ } else {
+ // int32
+ ::intgemm::callbacks::Write<int32_t> cb(C.dptr<int32_t>());
+ ::intgemm::Int8::Multiply(A_quant, B_quant, A_rows, inner, B_cols, cb);
+ }
+ }
+}
+
+NNVM_REGISTER_OP(_contrib_intgemm_fully_connected)
+.add_alias("_npx_intgemm_fully_connected")
+.describe(R"code(Multiply matrices using 8-bit integers. data * weight.
+
+Input tensor arguments are: data weight [scaling] [bias]
+
+data: either float32 or prepared using intgemm_prepare_data (in which case it
is int8).
+
+weight: must be prepared using intgemm_prepare_weight.
+
+scaling: present if and only if out_type is float32. If so this is multiplied
by the result before adding bias. Typically:
+scaling = (max passed to intgemm_prepare_weight)/127.0 if data is in float32
+scaling = (max passed to intgemm_prepare_data)/127.0 * (max passed to
intgemm_prepare_weight)/127.0 if data is in int8
+
+bias: present if and only if !no_bias. This is added to the output after
scaling and has the same number of columns as the output.
+
+out_type: type of the output.
+)code" ADD_FILELINE)
+.set_attr_parser(ParamParser<IntgemmFullyConnectedParam>)
+.set_num_inputs([](const NodeAttrs& attrs) {
+ return
ParameterIndices(nnvm::get<IntgemmFullyConnectedParam>(attrs.parsed)).count;
+})
+.set_num_outputs(1)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+ [](const NodeAttrs& attrs) {
+ std::vector<std::string> ret{"data", "weight"};
+ ParameterIndices
indices(nnvm::get<IntgemmFullyConnectedParam>(attrs.parsed));
+ if (indices.HaveScaling()) {
+ ret.emplace_back("scaling");
+ }
+ if (indices.HaveBias()) {
+ ret.emplace_back("bias");
+ }
+ return ret;
+ })
+.set_attr<FResourceRequest>("FResourceRequest",
+ [](const NodeAttrs& attrs) {
+ return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+ })
+.set_attr<mxnet::FInferShape>("FInferShape", IntgemmFullyConnectedOpShape)
+.set_attr<nnvm::FInferType>("FInferType", IntgemmFullyConnectedOpType)
+.set_attr<FCompute>("FCompute<cpu>", IntgemmFullyConnectedOpForwardCPU)
+.add_argument(
+ "data",
+ "NDArray-or-Symbol",
+ "First argument to multiplication. Tensor of float32 (quantized on the
fly) or int8 from "
+ "intgemm_prepare_data. If you use a different quantizer, be sure to ban
-128. The last "
+ "dimension must be a multiple of 64.")
+.add_argument(
+ "weight",
+ "NDArray-or-Symbol",
+ "Second argument to multiplication. Tensor of int8 from
intgemm_prepare_weight. The last "
+ "dimension must be a multiple of 64. The product of non-last dimensions
must be a multiple "
+ "of 8.")
+.add_argument("scaling", "NDArray-or-Symbol", "Scaling factor to apply if
output type is float32.")
+.add_argument("bias", "NDArray-or-Symbol", "Bias term.")
+// TODO(Xinyu): a temp solution to enable GluonCV INT8 flow,
+// will be reverted after the improvement of CachedOP is done.
+.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes)
+.add_arguments(IntgemmFullyConnectedParam::__FIELDS__());
+
+} // namespace op
+} // namespace mxnet
diff --git a/src/operator/contrib/intgemm/max_absolute_op.cc
b/src/operator/contrib/intgemm/max_absolute_op.cc
new file mode 100644
index 0000000..01e10b0
--- /dev/null
+++ b/src/operator/contrib/intgemm/max_absolute_op.cc
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file max_absolute_op.cc
+ * \brief Computes maximum absolute value of a tensor using intgemm
+ */
+
+#include <mxnet/operator_util.h>
+#include <vector>
+#include "../../mshadow_op.h"
+#include "../../mxnet_op.h"
+#include "../../operator_common.h"
+#include "../../tensor/init_op.h"
+
+#include "intgemm/intgemm.h"
+
+namespace mxnet {
+namespace op {
+
+inline bool MaxAbsoluteOpShape(const nnvm::NodeAttrs& attrs,
+ mxnet::ShapeVector* in_attrs,
+ mxnet::ShapeVector* out_attrs) {
+ // One in, one out.
+ CHECK_EQ(in_attrs->size(), 1U);
+ CHECK_EQ(out_attrs->size(), 1U);
+
+ SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(1, 1));
+ return shape_is_known(in_attrs->at(0));
+}
+
+inline bool MaxAbsoluteOpType(const nnvm::NodeAttrs& attrs,
+ std::vector<int>* in_attrs,
+ std::vector<int>* out_attrs) {
+ CHECK_EQ(in_attrs->size(), 1U);
+ CHECK_EQ(out_attrs->size(), 1U);
+
+ TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kFloat32);
+ TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kFloat32);
+ return true;
+}
+
+inline bool MaxAbsoluteOpStorageType(const nnvm::NodeAttrs& attrs,
+ const int dev_mask,
+ DispatchMode* dispatch_mode,
+ std::vector<int>* in_attrs,
+ std::vector<int>* out_attrs) {
+ *dispatch_mode = DispatchMode::kFCompute;
+ CHECK_EQ(in_attrs->size(), 1U);
+ CHECK_EQ(out_attrs->size(), 1U);
+ (*out_attrs)[0] = kDefaultStorage;
+ return true;
+}
+
+void MaxAbsoluteOpForwardCPU(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<TBlob>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& outputs) {
+ CHECK_EQ(inputs.size(), 1U);
+ CHECK_EQ(outputs.size(), 1U);
+ CHECK_EQ(req.size(), 1U);
+ const TBlob &in = inputs.front(), &out = outputs.front();
+ CHECK_EQ(in.type_flag_, mshadow::kFloat32);
+ CHECK_EQ(out.type_flag_, mshadow::kFloat32);
+ CHECK(in.CheckContiguous());
+ CHECK(out.CheckContiguous());
+
+ const std::size_t size = in.shape_.Size();
+
+ const float *data = in.dptr<float>();
+ // To maintain alignment, be a multiple of AVX512 register size.
+ const std::size_t kMultiple = 512 / 8;
+ CHECK_EQ(reinterpret_cast<intptr_t>(data) % kMultiple, 0)
+ << "Data must be aligned to " << kMultiple << " bytes.";
+
+ float result = ::intgemm::MaxAbsolute(data, data + size);
+ KERNEL_ASSIGN(*out.dptr<float>(), req[0], result);
+}
+
+NNVM_REGISTER_OP(_contrib_intgemm_maxabsolute)
+.add_alias("_npx_intgemm_maxabsolute")
+.describe(R"code(Compute the maximum absolute value in a tensor of float32
fast on a CPU. The tensor's total size must be a multiple of 16 and aligned to
a multiple of 64 bytes.
+mxnet.nd.contrib.intgemm_maxabsolute(arr) == arr.abs().max()
+)code" ADD_FILELINE)
+.set_num_inputs(1)
+.set_num_outputs(1)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+ [](const NodeAttrs& attrs) {
+ return std::vector<std::string>{"data"};
+ })
+.set_attr<mxnet::FInferShape>("FInferShape", MaxAbsoluteOpShape)
+.set_attr<nnvm::FInferType>("FInferType", MaxAbsoluteOpType)
+.set_attr<FInferStorageType>("FInferStorageType", MaxAbsoluteOpStorageType)
+.set_attr<FCompute>("FCompute<cpu>", MaxAbsoluteOpForwardCPU)
+.set_attr<nnvm::FInplaceOption>("FInplaceOption",
+ [](const NodeAttrs& attrs) {
+ return std::vector<std::pair<int, int> >{{0, 0}};
+ })
+.add_argument("data", "NDArray-or-Symbol", "Tensor to compute maximum absolute
value of");
+
+} // namespace op
+} // namespace mxnet
diff --git a/src/operator/contrib/intgemm/prepare_data_op.cc
b/src/operator/contrib/intgemm/prepare_data_op.cc
new file mode 100644
index 0000000..1d5719d
--- /dev/null
+++ b/src/operator/contrib/intgemm/prepare_data_op.cc
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file prepare_data_op.cc
+ * \brief Converts data aka A matrices (typically activations) to intgemm's
+ * representation for A in C=AB. This just quantizes to int8 and bans -128.
+ * The only difference from Quantize/QuantizeV2 is that it bans -128.
+ */
+
+#include <mxnet/operator_util.h>
+#include <vector>
+#include "../../mshadow_op.h"
+#include "../../mxnet_op.h"
+#include "../../operator_common.h"
+#include "../../tensor/init_op.h"
+
+#include "intgemm/intgemm.h"
+
+namespace mxnet {
+namespace op {
+
+bool PrepareDataOpShape(const nnvm::NodeAttrs& attrs,
+ mxnet::ShapeVector* in_attrs,
+ mxnet::ShapeVector* out_attrs) {
+ // data and maximum
+ CHECK_EQ(in_attrs->size(), 2U);
+ CHECK_EQ(out_attrs->size(), 1U);
+
+ SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
+ SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
+
+ SHAPE_ASSIGN_CHECK(*in_attrs, 1, mxnet::TShape(1, 1));
+
+ return shape_is_known(out_attrs->at(0));
+}
+
+bool PrepareDataOpType(const nnvm::NodeAttrs& attrs,
+ std::vector<int>* in_attrs,
+ std::vector<int>* out_attrs) {
+ CHECK_EQ(in_attrs->size(), 2U);
+ CHECK_EQ(out_attrs->size(), 1U);
+
+ // This routine converts from float to int8 with a scaling factor
+ TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kFloat32);
+ TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kFloat32);
+ TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8);
+ return true;
+}
+
+bool PrepareDataOpStorageType(const nnvm::NodeAttrs& attrs,
+ const int dev_mask,
+ DispatchMode* dispatch_mode,
+ std::vector<int>* in_attrs,
+ std::vector<int>* out_attrs) {
+ CHECK_EQ(in_attrs->size(), 2U);
+ CHECK_EQ(out_attrs->size(), 1U);
+ STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, kDefaultStorage);
+ STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, 0, kDefaultStorage);
+ STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, 1, kDefaultStorage);
+ DISPATCH_MODE_ASSIGN_CHECK(dispatch_mode, 0, DispatchMode::kFComputeEx);
+ return true;
+}
+
+void PrepareDataOpForwardCPU(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<TBlob>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& outputs) {
+ CHECK_EQ(inputs.size(), 2U);
+ CHECK_EQ(outputs.size(), 1U);
+ CHECK_EQ(req.size(), 1U);
+ CHECK_EQ(req[0], kWriteTo) << "intgemm only overwrites";
+ const TBlob &in = inputs[0], &out = outputs[0];
+
+ CHECK_EQ(in.type_flag_, mshadow::kFloat32);
+ CHECK_EQ(out.type_flag_, mshadow::kInt8);
+ CHECK(in.CheckContiguous());
+ CHECK(out.CheckContiguous());
+
+ const float *A = in.dptr<float>();
+ int8_t *quantA = out.dptr<int8_t>();
+ CHECK_EQ(reinterpret_cast<intptr_t>(A) % 64, 0);
+ CHECK_EQ(reinterpret_cast<intptr_t>(quantA) % 64, 0);
+ const float multiplier = 127.0 / *inputs[1].dptr<float>();
+ ::intgemm::Int8::Quantize(A, quantA, multiplier, in.shape_.Size());
+}
+
+NNVM_REGISTER_OP(_contrib_intgemm_prepare_data)
+.add_alias("_npx_intgemm_prepare_data")
+.describe(R"code(This operator quantizes float32 to int8 while also
banning -128.
+
+It is suitable for preparing a data matrix for use by intgemm's C=data *
weights operation.
+
+The float32 values are scaled such that maxabs maps to 127. Typically maxabs =
maxabsolute(A).
+)code" ADD_FILELINE)
+.set_num_inputs(2)
+.set_num_outputs(1)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+ [](const NodeAttrs& attrs) {
+ return std::vector<std::string>{"data", "maxabs"};
+ })
+.set_attr<mxnet::FInferShape>("FInferShape", PrepareDataOpShape)
+.set_attr<nnvm::FInferType>("FInferType", PrepareDataOpType)
+.set_attr<FInferStorageType>("FInferStorageType", PrepareDataOpStorageType)
+.set_attr<FCompute>("FCompute<cpu>", PrepareDataOpForwardCPU)
+.add_argument("data", "NDArray-or-Symbol", "Activation matrix to be prepared
for multiplication.")
+.add_argument(
+ "maxabs",
+ "NDArray-or-Symbol",
+ "Maximum absolute value to be used for scaling. (The values will be
multiplied by 127.0 / "
+  "maxabs.)")
+// TODO(Xinyu): a temp solution to enable GluonCV INT8 flow,
+// will be reverted after the improvement of CachedOP is done.
+.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes);
+
+} // namespace op
+} // namespace mxnet
diff --git a/src/operator/contrib/intgemm/prepare_weight_op.cc
b/src/operator/contrib/intgemm/prepare_weight_op.cc
new file mode 100644
index 0000000..ad106eb
--- /dev/null
+++ b/src/operator/contrib/intgemm/prepare_weight_op.cc
@@ -0,0 +1,180 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file prepare_weight_op.cc
+ * \brief Converts weight matrices to intgemm's representation.
+ */
+
+#include <mxnet/operator_util.h>
+#include <vector>
+#include "../../mshadow_op.h"
+#include "../../mxnet_op.h"
+#include "../../operator_common.h"
+#include "../../tensor/init_op.h"
+
+#include "intgemm/intgemm.h"
+
+namespace mxnet {
+namespace op {
+
+// Operator parameter for intgemm_prepare_weight.
+struct PrepareWeightParam : public dmlc::Parameter<PrepareWeightParam> {
+  // When true the weight input is already int8 and only needs rearranging
+  // into intgemm's CPU-dependent layout; when false (default) a float32
+  // weight is quantized first using the maxabs input.
+  bool already_quantized;
+  DMLC_DECLARE_PARAMETER(PrepareWeightParam) {
+    DMLC_DECLARE_FIELD(already_quantized).set_default(false)
+    .describe("Is the weight matrix already quantized?");
+  }
+};
+DMLC_REGISTER_PARAMETER(PrepareWeightParam);
+
+// Shape inference for intgemm_prepare_weight.
+// The output mirrors the weight (input 0) shape in both directions; the
+// optional maxabs scalar (input 1) is pinned to shape (1,).
+bool PrepareWeightOpShape(const nnvm::NodeAttrs& attrs,
+ mxnet::ShapeVector* in_attrs,
+ mxnet::ShapeVector* out_attrs) {
+  // Optional maximum-for-scaling parameter: 1 input (already quantized) or 2.
+  CHECK_GE(in_attrs->size(), 1U) << "Need at least weight to quantize.";
+  CHECK_LE(in_attrs->size(), 2U) << "weight and maximum for scaling.";
+  CHECK_EQ(out_attrs->size(), 1U);
+
+  // Propagate the shape both ways so either side may be the known one.
+  SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
+  SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
+
+  if (in_attrs->size() == 2U) {
+    SHAPE_ASSIGN_CHECK(*in_attrs, 1, mxnet::TShape(1, 1));
+  }
+  return shape_is_known(out_attrs->at(0));
+}
+
+// Type inference for intgemm_prepare_weight.
+// Output is always int8. One input means the weight is already quantized
+// (int8); two inputs mean a float32 weight plus a float32 maxabs scalar.
+bool PrepareWeightOpType(const nnvm::NodeAttrs& attrs,
+ std::vector<int>* in_attrs,
+ std::vector<int>* out_attrs) {
+  TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8);
+  CHECK_GE(in_attrs->size(), 1U) << "Need at least weight to quantize.";
+  CHECK_LE(in_attrs->size(), 2U) << "weight and maximum for scaling.";
+  if (in_attrs->size() == 1U) {
+    TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kInt8);
+  } else if (in_attrs->size() == 2U) {
+    TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kFloat32);
+    TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kFloat32);
+  }
+  return true;
+}
+
+// Storage inference for intgemm_prepare_weight: all tensors are dense.
+// NOTE(review): dispatch is set to kFComputeEx but only FCompute<cpu> is
+// registered below -- confirm this is intended and that the dispatcher
+// falls back correctly.
+bool PrepareWeightOpStorageType(const nnvm::NodeAttrs& attrs,
+ const int dev_mask,
+ DispatchMode* dispatch_mode,
+ std::vector<int>* in_attrs,
+ std::vector<int>* out_attrs) {
+  CHECK_GE(in_attrs->size(), 1U) << "Need at least weight to quantize.";
+  CHECK_LE(in_attrs->size(), 2U) << "weight and maximum for scaling.";
+  CHECK_EQ(out_attrs->size(), 1U);
+  STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, kDefaultStorage);
+  STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, 0, kDefaultStorage);
+  if (in_attrs->size() == 2U) {
+    STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, 1, kDefaultStorage);
+  }
+  DISPATCH_MODE_ASSIGN_CHECK(dispatch_mode, 0, DispatchMode::kFComputeEx);
+  return true;
+}
+
+// CPU kernel for intgemm_prepare_weight.
+// Treats the weight as a 2-D matrix: all leading dims collapse into B_cols
+// (output dimension) and the last dim is the inner (dot-product) dimension.
+// int8 input is only rearranged; float32 input is quantized by
+// 127/maxabs then rearranged.
+void PrepareWeightOpForwardCPU(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<TBlob>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& outputs) {
+  const PrepareWeightParam& params =
nnvm::get<PrepareWeightParam>(attrs.parsed);
+  // already_quantized skips the maxabs input, so input count depends on it.
+  CHECK_EQ(inputs.size(), params.already_quantized ? 1U : 2U);
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  CHECK_EQ(req[0], kWriteTo) << "intgemm only overwrites";
+
+  const TBlob &in = inputs.front();
+  const TBlob &out = outputs.front();
+  CHECK_EQ(out.type_flag_, mshadow::kInt8);
+  CHECK(in.CheckContiguous());
+  CHECK(out.CheckContiguous());
+  // Collapse all but the last dimension into the output-column count.
+  size_t B_cols = in.shape_.ProdShape(0, in.shape_.ndim() - 1);
+  size_t inner = in.shape_[in.shape_.ndim() - 1];
+  // intgemm tiles impose divisibility requirements on both dimensions.
+  CHECK_EQ(inner % ::intgemm::Int8::tile_info.b_rows, 0) <<
+    "intgemm requires the inner dimension be a multiple of " <<
::intgemm::Int8::tile_info.b_rows;
+  CHECK_EQ(B_cols % ::intgemm::Int8::tile_info.b_cols, 0) <<
+    "intgemm requires the output dimension (the product of all but the last
dimension of the "
+    "weight matrix) to be a multiple of " << ::intgemm::Int8::tile_info.b_cols
<< ".";
+
+  // intgemm uses aligned vector loads/stores, hence the 64-byte checks below.
+  int8_t *quantB = out.dptr<int8_t>();
+  CHECK_EQ(reinterpret_cast<intptr_t>(quantB) % 64, 0) <<
+    "Pointers should be aligned to a multiple of 64.";
+  CHECK(in.type_flag_ == mshadow::kFloat32 || in.type_flag_ == mshadow::kInt8)
<<
+    "Expected either 32-bit values to be quantized or 8-bit values to
rearrange.";
+  if (in.type_flag_ == mshadow::kInt8) {
+    // Already quantized: just transpose/rearrange into the CPU format.
+    const int8_t *B = in.dptr<int8_t>();
+    CHECK_EQ(reinterpret_cast<intptr_t>(B) % 64, 0) <<
+      "Pointers should be aligned to a multiple of 64.";
+    ::intgemm::Int8::PrepareBQuantizedTransposed(B, quantB, inner, B_cols);
+  } else if (in.type_flag_ == mshadow::kFloat32) {
+    // Quantize with multiplier 127/maxabs (inputs[1]), then rearrange.
+    const float *B = in.dptr<float>();
+    CHECK_EQ(reinterpret_cast<intptr_t>(B) % 64, 0) <<
+      "Pointers should be aligned to a multiple of 64.";
+    ::intgemm::Int8::PrepareBTransposed(
+      B,
+      quantB,
+      127.0 / *inputs[1].dptr<float>(),
+      inner,
+      B_cols);
+  }
+}
+
+NNVM_REGISTER_OP(_contrib_intgemm_prepare_weight)
+.add_alias("_npx_intgemm_prepare_weight")
+.describe(R"code(This operator converts a weight matrix in column-major format to intgemm's internal fast representation of weight matrices. MXNet customarily stores weight matrices in column-major (transposed) format. This operator is not meant to be fast; it is meant to be run offline to quantize a model.
+
+In other words, it prepares weight for the operation C = data * weight^T.
+
+If the provided weight matrix is float32, it will be quantized first. The quantization function is (int8_t)(127.0 / max * weight) where max, the maximum absolute value, is provided as argument 1 (the weight matrix is argument 0). Then the matrix will be rearranged into the CPU-dependent format.
+
+If the provided weight matrix is already int8, the matrix will only be rearranged into the CPU-dependent format. This way one can quantize with intgemm_prepare_data (which just quantizes), store to disk in a consistent format, then at load time convert to CPU-dependent format with intgemm_prepare_weight.
+
+The internal representation depends on register length. So AVX512, AVX2, and SSSE3 have different formats. AVX512BW and AVX512VNNI have the same representation.
+)code" ADD_FILELINE)
+.set_attr_parser(ParamParser<PrepareWeightParam>)
+// Input count depends on already_quantized: 1 (weight) or 2 (weight, maxabs).
+.set_num_inputs([](const NodeAttrs& attrs) {
+  const PrepareWeightParam& params = nnvm::get<PrepareWeightParam>(attrs.parsed);
+  return params.already_quantized ? 1 : 2;
+})
+.set_num_outputs(1)
+.set_attr<nnvm::FListInputNames>("FListInputNames", [](const NodeAttrs& attrs) {
+  const PrepareWeightParam& params = nnvm::get<PrepareWeightParam>(attrs.parsed);
+  return params.already_quantized ?
+    std::vector<std::string>{"weight"} : std::vector<std::string>{"weight", "maxabs"};
+})
+.set_attr<mxnet::FInferShape>("FInferShape", PrepareWeightOpShape)
+.set_attr<nnvm::FInferType>("FInferType", PrepareWeightOpType)
+.set_attr<FInferStorageType>("FInferStorageType", PrepareWeightOpStorageType)
+.set_attr<FCompute>("FCompute<cpu>", PrepareWeightOpForwardCPU)
+.add_argument("weight", "NDArray-or-Symbol", "Parameter matrix to be prepared for multiplication.")
+.add_argument(
+  "maxabs",
+  "NDArray-or-Symbol",
+  "Maximum absolute value for scaling. The weights will be multiplied by 127.0 / maxabs.")
+// TODO(Xinyu): a temp solution to enable GluonCV INT8 flow,
+// will be reverted after the improvement of CachedOP is done.
+.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes)
+.add_arguments(PrepareWeightParam::__FIELDS__());
+
+} // namespace op
+} // namespace mxnet
diff --git a/src/operator/contrib/intgemm/take_weight_op.cc
b/src/operator/contrib/intgemm/take_weight_op.cc
new file mode 100644
index 0000000..09e320e
--- /dev/null
+++ b/src/operator/contrib/intgemm/take_weight_op.cc
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file take_weight_op.cc
+ * \brief Takes from the all-but-last dimension of a tensor stored in
+ * intgemm's weight format. This is particularly useful for output matrices where
+ * some outputs are excluded.
+ */
+
+#include <mxnet/operator_util.h>
+#include <vector>
+#include "../../mshadow_op.h"
+#include "../../mxnet_op.h"
+#include "../../operator_common.h"
+#include "../../tensor/init_op.h"
+
+#include "intgemm/intgemm.h"
+
+namespace mxnet {
+namespace op {
+
+// Shape inference for intgemm_take_weight.
+// Inputs: 0 = weight (2-D, intgemm format), 1 = indices (1-D).
+// Output: (indices[0], weight[1]) -- one weight row per selected index.
+// Fix: the first parameter was misleadingly named `shape`; renamed to
+// `attrs` for consistency with every other inference function in this family.
+inline bool TakeWeightOpShape(const nnvm::NodeAttrs& attrs,
+                              mxnet::ShapeVector* in_shape,
+                              mxnet::ShapeVector* out_shape) {
+  // 0 is weight, 1 is indices.
+  CHECK_EQ(in_shape->size(), 2U);
+  CHECK_EQ(out_shape->size(), 1U);
+
+  mxnet::TShape &weight = (*in_shape)[0];
+  mxnet::TShape &indices = (*in_shape)[1];
+  mxnet::TShape &out = (*out_shape)[0];
+
+  // weight matrices should be 2-dimensional by now.
+  SHAPE_ASSIGN_CHECK(*in_shape, 0, mxnet::TShape(2, -1));
+  SHAPE_ASSIGN_CHECK(*out_shape, 0, mxnet::TShape(2, -1));
+  // indices are 1-dimensional.
+  SHAPE_ASSIGN_CHECK(*in_shape, 1, mxnet::TShape(1, -1));
+
+  // Propagate known dimensions in both directions.
+  SHAPE_ASSIGN_CHECK(*out_shape, 0, mxnet::TShape({indices[0], weight[1]}));
+  SHAPE_ASSIGN_CHECK(*in_shape, 0, mxnet::TShape({-1, out[1]}));
+  SHAPE_ASSIGN_CHECK(*in_shape, 1, mxnet::TShape({out[0]}));
+
+  return shape_is_known(weight) && shape_is_known(indices) && shape_is_known(out);
+}
+
+// Type inference for intgemm_take_weight: int8 weight and output,
+// int32 indices (reinterpreted as intgemm::Index in the kernel).
+inline bool TakeWeightOpType(const nnvm::NodeAttrs& attrs,
+ std::vector<int>* in_attrs,
+ std::vector<int>* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 2U);
+  CHECK_EQ(out_attrs->size(), 1U);
+
+  TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8);
+  TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kInt8);
+  TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kInt32);
+  return true;
+}
+
+// Storage inference for intgemm_take_weight: all tensors are dense and the
+// op runs through the plain FCompute path.
+inline bool TakeWeightOpStorageType(const nnvm::NodeAttrs& attrs,
+                                    const int dev_mask,
+                                    DispatchMode* dispatch_mode,
+                                    std::vector<int>* in_attrs,
+                                    std::vector<int>* out_attrs) {
+  *dispatch_mode = DispatchMode::kFCompute;
+  CHECK_EQ(in_attrs->size(), 2U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  // Fix: also constrain the inputs to dense storage (the original only set
+  // the output), matching PrepareWeightOpStorageType in this operator family.
+  STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, 0, kDefaultStorage);
+  STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, 1, kDefaultStorage);
+  STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, kDefaultStorage);
+  return true;
+}
+
+// CPU kernel for intgemm_take_weight: selects rows (output columns of the
+// multiplication) from a weight tensor already in intgemm's layout.
+void TakeWeightOpForwardCPU(const nnvm::NodeAttrs& attrs,
+                            const OpContext& ctx,
+                            const std::vector<TBlob>& inputs,
+                            const std::vector<OpReqType>& req,
+                            const std::vector<TBlob>& outputs) {
+  CHECK_EQ(inputs.size(), 2U);
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  CHECK_EQ(req[0], kWriteTo) << "TODO request types other than write";
+  const TBlob &weight = inputs.front(), &indices = inputs[1], &out = outputs.front();
+  CHECK_EQ(weight.type_flag_, mshadow::kInt8);
+  CHECK_EQ(indices.type_flag_, mshadow::kInt32);
+  CHECK_EQ(out.type_flag_, mshadow::kInt8);
+  CHECK(weight.CheckContiguous());
+  CHECK(indices.CheckContiguous());
+  CHECK(out.CheckContiguous());
+  size_t B_cols = indices.shape_[0];
+  size_t inner = weight.shape_[weight.shape_.ndim() - 1];
+  // intgemm tiles impose divisibility requirements on both dimensions.
+  CHECK_EQ(inner % ::intgemm::Int8::tile_info.b_rows, 0) <<
+    "intgemm requires the inner dimension be a multiple of " << ::intgemm::Int8::tile_info.b_rows;
+  CHECK_EQ(B_cols % ::intgemm::Int8::tile_info.b_cols, 0) <<
+    "For efficiency, intgemm requires there to be a multiple of " <<
+    ::intgemm::Int8::tile_info.b_cols << " indices.";
+  // mxnet doesn't have a uint32_t type so we'll just pointer cast. Fix: check
+  // the sizes agree at compile time; the original used a runtime assert(),
+  // which is compiled out in NDEBUG (release) builds.
+  static_assert(sizeof(int32_t) == sizeof(::intgemm::Index),
+                "intgemm::Index must be 32 bits to reinterpret int32 indices");
+  const ::intgemm::Index *index =
+    reinterpret_cast<const ::intgemm::Index*>(indices.dptr<int32_t>());
+
+  ::intgemm::Int8::SelectColumnsB(
+    weight.dptr<int8_t>(),
+    out.dptr<int8_t>(),
+    inner,
+    index,
+    index + B_cols);
+}
+
+// Registration of intgemm_take_weight: row selection on a weight tensor
+// already in intgemm's CPU-dependent layout (shape/type/storage/compute
+// functions defined above).
+NNVM_REGISTER_OP(_contrib_intgemm_take_weight)
+.add_alias("_npx_intgemm_take_weight")
+.describe(R"code(Index a weight matrix stored in intgemm's weight format.
+The indices select the outputs of matrix multiplication, not the inner dot
product dimension.
+)code" ADD_FILELINE)
+.set_num_inputs(2)
+.set_num_outputs(1)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+ [](const NodeAttrs& attrs) {
+ return std::vector<std::string>{"weight", "indices"};
+ })
+.set_attr<mxnet::FInferShape>("FInferShape", TakeWeightOpShape)
+.set_attr<nnvm::FInferType>("FInferType", TakeWeightOpType)
+.set_attr<FInferStorageType>("FInferStorageType", TakeWeightOpStorageType)
+.set_attr<FCompute>("FCompute<cpu>", TakeWeightOpForwardCPU)
+.add_argument(
+ "weight",
+ "NDArray-or-Symbol",
+ "Tensor already in intgemm weight format to select from")
+.add_argument("indices", "NDArray-or-Symbol", "indices to select on the 0th
dimension of weight");
+
+} // namespace op
+} // namespace mxnet
diff --git a/src/storage/cpu_device_storage.h b/src/storage/cpu_device_storage.h
index f6b296a..50cc4ab 100644
--- a/src/storage/cpu_device_storage.h
+++ b/src/storage/cpu_device_storage.h
@@ -53,7 +53,7 @@ class CPUDeviceStorage {
/*!
* \brief Alignment of allocation.
*/
-#if MXNET_USE_MKLDNN == 1
+#if MXNET_USE_MKLDNN == 1 || MXNET_USE_INTGEMM == 1
// MKLDNN requires special alignment. 64 is used by the MKLDNN library in
// memory allocation.
static constexpr size_t alignment_ = kMKLDNNAlign;
diff --git a/tests/python/unittest/test_contrib_intgemm.py
b/tests/python/unittest/test_contrib_intgemm.py
new file mode 100644
index 0000000..69fa5e0
--- /dev/null
+++ b/tests/python/unittest/test_contrib_intgemm.py
@@ -0,0 +1,221 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import mxnet as mx
+from mxnet import np, npx
+from mxnet.test_utils import same, use_np, assert_almost_equal
+from common import with_seed
+import random
+from itertools import product
+
+
+# with_seed() from MXNet 1.x breaks @pytest.mark.parametrize so all randomized
+# tests use a for loop over a Cartesian product of parameters.
+
+@use_np
+@with_seed()
+def test_contrib_intgemm_maxabsolute():
+    # intgemm_maxabsolute(m) must equal max(abs(m)) via both the mx.nd and
+    # the numpy-compatible (npx) front ends.
+    # Skip silently when MXNet was built without intgemm support.
+    if "intgemm_maxabsolute" not in dir(mx.nd.contrib):
+        return
+    # Multi-dim shapes plus every 1-D length 1..64, presumably to exercise
+    # SIMD remainder handling in the kernel -- TODO confirm.
+    for shape in ([(3, 2), (9,17), (2, 7, 1, 8)] + [(i,) for i in
range(1,65)]):
+        # mx.nd API
+        m = mx.nd.random_uniform(low=-100.0, high=100.0, shape=shape)
+        fast = mx.nd.contrib.intgemm_maxabsolute(m)
+        slow = mx.nd.max(mx.nd.abs(m))
+        assert same(fast, slow)
+        # np API
+        m = np.random.uniform(low=-100.0, high=100.0, size=shape)
+        fast = npx.intgemm_maxabsolute(m).reshape(())
+        slow = np.max(np.abs(m))
+        assert same(fast, slow)
+
+@use_np
+@with_seed()
+def test_contrib_intgemm_prepare_data():
+    # intgemm_prepare_data must agree with an explicitly computed reference:
+    # scale by 127/maxabs, round, clip to [-127, 127], cast to int8.
+    if "intgemm_prepare_data" not in dir(mx.nd.contrib):
+        return
+    for shape, max_quant in product([(i,) for i in range(1, 67)] + [(2,3),
(130, 12)], [2.0, 2.5]):
+        m = mx.nd.random_uniform(low=-3.0, high=3.0, shape=shape)
+        scaled = m * 127.0 / max_quant
+        # Rounding 0.5 can go up or down. Move values away from 0.5.
+        too_close = mx.nd.abs(mx.nd.round(scaled) - scaled) > 0.45
+        m += max_quant / 127.0 * 0.05 * too_close
+
+        # Reference: scale and round
+        ref = mx.nd.round(m * 127.0 / max_quant)
+        # Clip to [-127, 127]. Because otherwise e.g. -129 casts to +127.
+        ref = mx.nd.broadcast_maximum(ref, mx.nd.array([-127.0]))
+        ref = mx.nd.broadcast_minimum(ref, mx.nd.array([127.0]))
+        # Reference: cast to int8
+        ref = mx.nd.cast(ref, dtype='int8')
+        # Reference: ban -128
+        ref = mx.nd.broadcast_maximum(ref, mx.nd.array([-127], dtype = 'int8'))
+
+        # Both the mx.nd and npx front ends must match the reference.
+        test = mx.nd.contrib.intgemm_prepare_data(m, mx.nd.array([max_quant]))
+        assert same(test, ref)
+        test = npx.intgemm_prepare_data(m.as_np_ndarray(),
np.array([max_quant]))
+        assert same(test, ref.as_np_ndarray())
+
+@use_np
+@with_seed()
+def test_contrib_intgemm_weight_consistent():
+    # The weight format is actually CPU-dependent so we don't directly test the
+    # output, but indirectly test that it works.
+    if "intgemm_prepare_weight" not in dir(mx.nd.contrib):
+        return
+    for shape, max_quant, api in product(
+ [(8, 64), (16, 64), (8, 128), (16, 128), (2, 4, 64)],
+ [0.2, 3.0],
+ [(mx.nd.contrib, mx.nd), (npx, np)]):
+        contrib, top = api
+        max_array = top.array([max_quant])
+        if top == mx.nd:
+            m = top.random_uniform(low=-3.0, high=3.0, shape=shape)
+        else:
+            # NOTE(review): this branch uses numpy's default uniform range
+            # [0, 1) rather than [-3, 3) as above -- presumably fine since
+            # only internal consistency is asserted, but confirm it is
+            # intentional.
+            m = np.random.uniform(size=shape)
+        direct = contrib.intgemm_prepare_weight(m, max_array)
+        quant = contrib.intgemm_prepare_data(m, max_array)
+        indirect = contrib.intgemm_prepare_weight(quant,
already_quantized=True)
+        # Should get the same data from direct call and already_quantized
version.
+        assert same(direct, indirect)
+
+@use_np
+@with_seed()
+def test_contrib_intgemm_take_weight():
+    # Taking rows then preparing must equal preparing then taking: the two
+    # orders commute, which checks take_weight against the opaque
+    # CPU-dependent weight layout without decoding it.
+    if "intgemm_take_weight" not in dir(mx.nd.contrib):
+        return
+    test_indices = [
+        [0,1,2,3,4,5,6,7],
+        [1,2,1,2,1,2,1,2],
+        [7,6,5,4,3,2,1,0],
+        [3,1,4,1,5,9,2,6],
+        # Since random_uniform doesn't support int8, use python
+        [random.randint(0,15) for i in range(8)],
+        [random.randint(0,15) for i in range(16)],
+        [random.randint(0,15) for i in range(24)]
+    ]
+    for indices, api in product(test_indices, [(mx.nd.contrib, mx.nd), (npx,
np)]):
+        contrib, top = api
+        m = top.array([random.randint(-127,127) for i in range(16 * 64)],
dtype='int8')
+        m = m.reshape((16, 64))
+        indices = top.array(indices, dtype='int32')
+        # Prepare weight then take.
+        test = contrib.intgemm_prepare_weight(m, already_quantized=True)
+        test = contrib.intgemm_take_weight(test, indices)
+        # Take then prepare.
+        ref = m.take(indices, axis=0)
+        ref = contrib.intgemm_prepare_weight(ref, already_quantized=True)
+        assert same(test, ref)
+
+@use_np
+def test_contrib_intgemm_multiply():
+ if "intgemm_fully_connected" not in dir(mx.nd.contrib):
+ return
+ apis = [(mx.nd.contrib, mx.nd, mx.nd.FullyConnected, mx.nd.cast), (npx,
np, npx.fully_connected, npx.cast)]
+ for data_rows, inner, weight_cols, api in product(range(1, 5),
+ range(64, 256, 64),
+ range(8, 24, 8),
+ apis):
+ contrib, top, fully_connected, cast = api
+ #The multiplication routine has approximations so everything is tested
+ #deterministically to ensure bounds are met.
+ random.seed(1)
+
+ # Don't use full range (-127, 127) to avoid saturation.
+ data = [random.randint(-64, 64) for i in range(data_rows * inner)]
+ data = top.array(data, dtype='int8').reshape((data_rows, inner))
+ weight = [random.randint(-64, 64) for i in range(inner * weight_cols)]
+ weight = top.array(weight, dtype='int8').reshape((weight_cols, inner))
+ weight_prepared = contrib.intgemm_prepare_weight(weight,
already_quantized=True)
+
+ # int32 output, no bias
+ test = contrib.intgemm_fully_connected(data,
+ weight_prepared,
+ no_bias=True,
+ flatten=False,
+ out_type='int32',
+ num_hidden=weight_cols)
+ ref = fully_connected(cast(data, dtype='float32'),
+ cast(weight, dtype='float32'),
+ no_bias=True,
+ flatten=False,
+ num_hidden=weight_cols)
+ assert_almost_equal(cast(test, dtype='float32').as_nd_ndarray(),
ref.as_nd_ndarray(), rtol=0.01, atol=0.01)
+
+ # float32 output, no bias
+ scale = 3.0
+ test = contrib.intgemm_fully_connected(data,
+ weight_prepared,
+ top.array([scale]),
+ no_bias=True,
+ flatten=False,
+ out_type='float32',
+ num_hidden=weight_cols)
+ assert_almost_equal(test.as_nd_ndarray(), (ref *
scale).as_nd_ndarray(), rtol=0.01, atol=0.01)
+
+ # int32 output, bias
+ bias = top.array([random.randint(-60000, 60000) for i in
range(weight_cols)], dtype = 'int32')
+ test = contrib.intgemm_fully_connected(data,
+ weight_prepared,
+ bias,
+ no_bias=False,
+ flatten=False,
+ out_type='int32',
+ num_hidden=weight_cols)
+ ref = fully_connected(cast(data, dtype='float32'),
+ cast(weight, dtype='float32'),
+ cast(bias, dtype='float32'),
+ no_bias=False,
+ flatten=False,
+ num_hidden=weight_cols)
+ assert_almost_equal(cast(test, dtype='float32').as_nd_ndarray(),
ref.as_nd_ndarray(), rtol=0.01, atol=0.01)
+
+ # float32 output, bias
+ # Scaling is applied before bias (and bias is not scaled). So to make
the
+ # reference comparison easy, just scale the bias beforehand.
+ test = contrib.intgemm_fully_connected(data,
+ weight_prepared,
+ top.array([scale]),
+ cast(bias, dtype='float32') *
scale,
+ no_bias=False,
+ flatten=False,
+ out_type='float32',
+ num_hidden=weight_cols)
+ assert_almost_equal(test.as_nd_ndarray(), (ref *
scale).as_nd_ndarray(), rtol=0.01, atol=0.01)
+
+ # float32 input should work the same as manually prepared int8 input.
+ data_float = top.array([random.uniform(-3.14, 3.14) for i in
range(data_rows * inner)])
+ data_float = data_float.reshape(data_rows, inner)
+ direct = contrib.intgemm_fully_connected(data_float,
+ weight_prepared,
+ top.array([scale]),
+ cast(bias, dtype='float32'),
+ no_bias=False,
+ flatten=False,
+ out_type='float32',
+ num_hidden=weight_cols)
+ maxabs = contrib.intgemm_maxabsolute(data_float)
+ data_prepared = contrib.intgemm_prepare_data(data_float, maxabs)
+ cooked = contrib.intgemm_fully_connected(data_prepared,
+ weight_prepared,
+ top.array(scale * maxabs /
127.0),
+ cast(bias, dtype='float32'),
+ no_bias=False,
+ flatten=False,
+ out_type='float32',
+ num_hidden=weight_cols)
+ assert_almost_equal(direct.as_nd_ndarray(), cooked.as_nd_ndarray(),
rtol=0.01, atol=0.01)