This is an automated email from the ASF dual-hosted git repository.

samskalicky pushed a commit to branch v1.x
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/v1.x by this push:
     new d2e6452  [1.x] Backport of intgemm #17559 (#19099)
d2e6452 is described below

commit d2e6452042b1bad5ae9b18f90863cbf6f13aab29
Author: kpuatamazon <[email protected]>
AuthorDate: Wed Sep 16 17:41:35 2020 +0100

    [1.x] Backport of intgemm #17559 (#19099)
    
    * cherry-pick intgemm from master, fix build
    
    * Fix test to conform to 1.x
    
    * Makefile supporting intgemm compilation
    
    * Stricter dependencies on git checkout of intgemm
    
    * Operators depend on mkldnn
    
    * Don't compile intgemm with gcc older than 5
    
    * Fix intgemm test for windows on 1.x by not using pytest
    
    * Update intgemm to use template arguments for integer immediates
    
    * Try to fix clang3.6
    
    * Ban gcc < 5 in cmake
    
    * Update intgemm with gcc 5.5 debug workaround
---
 CMakeLists.txt                                     |  30 ++
 LICENSE                                            |   2 +
 Makefile                                           |  66 +++++
 include/mxnet/base.h                               |   2 +-
 .../contrib/intgemm/intgemm_fully_connected_op.cc  | 328 +++++++++++++++++++++
 src/operator/contrib/intgemm/max_absolute_op.cc    | 119 ++++++++
 src/operator/contrib/intgemm/prepare_data_op.cc    | 134 +++++++++
 src/operator/contrib/intgemm/prepare_weight_op.cc  | 180 +++++++++++
 src/operator/contrib/intgemm/take_weight_op.cc     | 146 +++++++++
 src/storage/cpu_device_storage.h                   |   2 +-
 tests/python/unittest/test_contrib_intgemm.py      | 221 ++++++++++++++
 11 files changed, 1228 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8c7cfe1..ee4369a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,6 +47,11 @@ if(USE_MKL_IF_AVAILABLE AND (NOT APPLE) AND (NOT MSVC) AND 
(CMAKE_HOST_SYSTEM_PR
 else()
   option(USE_MKLDNN "Build with MKL-DNN support" OFF)
 endif()
+if ((CMAKE_SYSTEM_PROCESSOR STREQUAL x86_64) AND ((NOT 
CMAKE_COMPILER_IS_GNUCC) OR (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 
5.0)))
+  option(USE_INTGEMM "Build with x86_64 intgemm library for low-precision 
multiplication" ON)
+else()
+  option(USE_INTGEMM "Build with x86_64 intgemm library for low-precision 
multiplication" OFF)
+endif()
 if(NOT MSVC)
   option(USE_OPERATOR_TUNING  "Enable auto-tuning of operators" ON)
 else()
@@ -306,6 +311,22 @@ if(USE_CPP_PACKAGE)
     add_definitions(-DMXNET_USE_CPP_PACKAGE=1)
 endif()
 
+if(USE_INTGEMM)
+  message(STATUS "Using intgemm")
+  include(FetchContent)
+  FetchContent_Declare(
+    intgemm
+    GIT_REPOSITORY https://github.com/kpu/intgemm.git
+    GIT_TAG        4172dcc209e6793dd920dec9cf9c9fc81605bd9d
+  )
+  FetchContent_GetProperties(intgemm)
+  if(NOT intgemm_POPULATED)
+    FetchContent_Populate(intgemm)
+  endif()
+  add_subdirectory(${intgemm_SOURCE_DIR} ${intgemm_BINARY_DIR} 
EXCLUDE_FROM_ALL)
+  add_definitions(-DMXNET_USE_INTGEMM=1)
+endif()
+
 # Allow Cuda compiles outside of src tree to find things in 'src' and 'include'
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)
@@ -497,6 +518,11 @@ endif()
 FILE(GLOB_RECURSE SOURCE "src/*.cc" "src/*.h" "include/*.h")
 FILE(GLOB_RECURSE CUDA "src/*.cu" "src/*.cuh")
 
+if(NOT USE_INTGEMM)
+  FILE(GLOB_RECURSE INTGEMM_OPERATOR_SOURCE 
"src/operator/contrib/intgemm/*.cc" "src/operator/contrib/intgemm/*.h")
+  list(REMOVE_ITEM SOURCE ${INTGEMM_OPERATOR_SOURCE})
+endif()
+
 # add nnvm to source
 FILE(GLOB_RECURSE NNVMSOURCE
   3rdparty/tvm/nnvm/src/c_api/*.cc
@@ -791,6 +817,10 @@ if(USE_MKLDNN)
       ${CMAKE_BINARY_DIR}/3rdparty/mkldnn/include/dnnl_version.h  
${CMAKE_SOURCE_DIR}/include/mkldnn/)
 endif()
 
+if(USE_INTGEMM)
+  target_link_libraries(mxnet_static PRIVATE intgemm)
+endif()
+
 function(BuildTVMOP)
   # scope the variables in BuildTVM.cmake to avoid conflict
   include(cmake/BuildTVM.cmake)
diff --git a/LICENSE b/LICENSE
index 9aa20d1..4a8f8dd 100644
--- a/LICENSE
+++ b/LICENSE
@@ -309,6 +309,8 @@
          Licensed MIT © Zeno Rocha
     11. mx-theme - For details, see docs/python_docs/themes/mx-theme/LICENSE
          Copyright (c) 2016 myyasuda
+    12. intgemm - Refer to 3rdparty/intgemm/LICENSE
+         Copyright (c) 2017--2019 University of Edinburgh, Nikolay Bogoychev, 
Mateusz Chudyk, Kenneth Heafield, and Microsoft Corporation
 
 
     
=======================================================================================
diff --git a/Makefile b/Makefile
index 4ee71c9..aa41207 100644
--- a/Makefile
+++ b/Makefile
@@ -86,6 +86,25 @@ ifeq ($(USE_MKLDNN), 1)
        MKLDNNROOT = $(ROOTDIR)/3rdparty/mkldnn/build/install
 endif
 
+ifndef USE_INTGEMM
+ifeq ($(UNAME_P), x86_64)
+       COMPILER := $(shell $(CXX) --version |head -n 1 |cut -d " " -f 1)
+       COMPILER_VERSION := $(shell $(CXX) -dumpversion |cut -d . -f 1)
+       ifeq ($(COMPILER), clang)
+               USE_INTGEMM=1
+       endif
+       ifeq ($(COMPILER), Apple)
+               USE_INTGEMM=1
+       endif
+       # If it's not clang and not Apple clang, it's probably gcc and we need 
at least 5.
+       # gcc --version gives the name of the program it was called with, which 
makes it hard to detect.
+       COMPILER_VERSION_GE_5 := $(shell expr $(COMPILER_VERSION) \>= 5)
+       ifeq ($(COMPILER_VERSION_GE_5), 1)
+               USE_INTGEMM=1
+       endif
+endif
+endif
+
 include $(TPARTYDIR)/mshadow/make/mshadow.mk
 include $(DMLC_CORE)/make/dmlc.mk
 
@@ -463,6 +482,46 @@ endif
 all: lib/libmxnet.a lib/libmxnet.so $(BIN) extra-packages extension_libs
 
 SRC = $(wildcard src/*/*/*/*.cc src/*/*/*.cc src/*/*.cc src/*.cc)
+
+ifeq ($(USE_INTGEMM), 1)
+       ifndef INTGEMM_PATH
+               INTGEMM_PATH = build/3rdparty/intgemm
+       endif
+       CFLAGS += -DMXNET_USE_INTGEMM=1
+       LIB_DEP += $(INTGEMM_PATH)/libintgemm.a
+
+# Download intgemm if it isn't already
+$(INTGEMM_PATH):
+       @mkdir -p $(INTGEMM_PATH)
+       rm -rf $(INTGEMM_PATH)
+       git clone https://github.com/kpu/intgemm $(INTGEMM_PATH)
+       cd $(INTGEMM_PATH) && git checkout -q 
4172dcc209e6793dd920dec9cf9c9fc81605bd9d
+
+$(INTGEMM_PATH)/compile_test_avx512bw.cc: $(INTGEMM_PATH)
+       @
+$(INTGEMM_PATH)/compile_test_avx512vnni.cc: $(INTGEMM_PATH)
+       @
+$(INTGEMM_PATH)/intgemm/intgemm.cc: $(INTGEMM_PATH)
+       @
+
+# Compiler tests for AVX512BW and AVX512VNNI.
+$(INTGEMM_PATH)/intgemm/intgemm_config.h: 
$(INTGEMM_PATH)/compile_test_avx512bw.cc 
$(INTGEMM_PATH)/compile_test_avx512vnni.cc
+       echo '#pragma once' >$(INTGEMM_PATH)/intgemm/intgemm_config.h
+       $(CXX) $(CFLAGS) $(INTGEMM_PATH)/compile_test_avx512bw.cc 2>/dev/null 
&& echo \#define INTGEMM_COMPILER_SUPPORTS_AVX512BW 
>>$(INTGEMM_PATH)/intgemm/intgemm_config.h || echo Your compiler is missing 
AVX512BW support
+       $(CXX) $(CFLAGS) $(INTGEMM_PATH)/compile_test_avx512vnni.cc 2>/dev/null 
&& echo \#define INTGEMM_COMPILER_SUPPORTS_AVX512VNNI 
>>$(INTGEMM_PATH)/intgemm/intgemm_config.h || echo Your compiler is missing 
AVX512VNNI support
+
+$(INTGEMM_PATH)/intgemm/intgemm.o: $(INTGEMM_PATH)/intgemm/intgemm_config.h 
$(INTGEMM_PATH)/intgemm/intgemm.cc $(wildcard $(INTGEMM_PATH)/intgemm/*.h 
$(INTGEMM_PATH)/intgemm/*/*.h)
+       $(CXX) $(CFLAGS) -I$(INTGEMM_PATH) -std=c++11 -c 
$(INTGEMM_PATH)/intgemm/intgemm.cc -o $@
+
+$(INTGEMM_PATH)/libintgemm.a: $(INTGEMM_PATH)/intgemm/intgemm.o
+       @mkdir -p $(@D)
+       ar crv $@ $(filter %.o, $?)
+else
+       #If we're not using intgemm, remove the operators from src.
+       INTGEMM_OPS := $(wildcard src/operator/contrib/intgemm/*.cc)
+       SRC := $(filter-out $(INTGEMM_OPS),$(SRC))
+endif
+
 OBJ = $(patsubst %.cc, build/%.o, $(SRC))
 CUSRC = $(wildcard src/*/*/*/*.cu src/*/*/*.cu src/*/*.cu src/*.cu)
 CUOBJ = $(patsubst %.cu, build/%_gpu.o, $(CUSRC))
@@ -560,6 +619,13 @@ endif
 # For quick compile test, used smaller subset
 ALLX_DEP= $(ALL_DEP)
 
+ifeq ($(USE_INTGEMM), 1)
+# Enforce a dependency on $(INTGEMM_PATH)/intgemm/intgemm_config.h which is a 
generated header based on compiler support.
+build/src/operator/contrib/intgemm/%.o: src/operator/contrib/intgemm/%.cc 
$(INTGEMM_PATH)/intgemm/intgemm_config.h | mkldnn
+       @mkdir -p $(@D)
+       $(CXX) -std=c++11 -c $(CFLAGS) -MMD -I$(INTGEMM_PATH) -Isrc/operator -c 
$< -o $@
+endif
+
 build/src/%.o: src/%.cc | mkldnn
        @mkdir -p $(@D)
        $(CXX) -std=c++11 -c $(CFLAGS) -MMD -c $< -o $@
diff --git a/include/mxnet/base.h b/include/mxnet/base.h
index 1980ca5..c2f4638 100644
--- a/include/mxnet/base.h
+++ b/include/mxnet/base.h
@@ -551,7 +551,7 @@ inline std::ostream& operator<<(std::ostream &out, const 
Context &ctx) {
 #define ADD_FILELINE "\n\nDefined in " __FILE__ ":L" STRINGIZE(__LINE__)
 
 
-#if MXNET_USE_MKLDNN == 1
+#if MXNET_USE_MKLDNN == 1 || MXNET_USE_INTGEMM == 1
 constexpr size_t kMKLDNNAlign = 64;
 #endif
 
diff --git a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc 
b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc
new file mode 100644
index 0000000..216f5ce
--- /dev/null
+++ b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc
@@ -0,0 +1,328 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file intgemm_fully_connected_op.cc
+ * \brief Operator wrapping intgemm's Multiply routine
+ */
+
+#include <mxnet/operator_util.h>
+#include <vector>
+#include <cstdlib>
+#include "../../mshadow_op.h"
+#include "../../mxnet_op.h"
+#include "../../operator_common.h"
+#include "../../tensor/init_op.h"
+
+#include "intgemm/intgemm.h"
+
+namespace mxnet {
+namespace op {
+
+struct IntgemmFullyConnectedParam : public 
dmlc::Parameter<IntgemmFullyConnectedParam> {
+  int out_type;
+  int num_hidden;
+  bool no_bias;
+  bool flatten;
+  DMLC_DECLARE_PARAMETER(IntgemmFullyConnectedParam) {
+    // This part is a copy of the FullyConnected parameters.
+    DMLC_DECLARE_FIELD(num_hidden).set_lower_bound(1)
+    .describe("Number of hidden nodes of the output.");
+    DMLC_DECLARE_FIELD(no_bias).set_default(false)
+    .describe("Whether to disable bias parameter.");
+    DMLC_DECLARE_FIELD(flatten).set_default(true)
+    .describe("Whether to collapse all but the first axis of the input data 
tensor.");
+
+    DMLC_DECLARE_FIELD(out_type)
+    .add_enum("float32", mshadow::kFloat32)
+    .add_enum("int32", mshadow::kInt32)
+    .set_default(mshadow::kFloat32)
+    .describe("Output data type.");
+  }
+};
+DMLC_REGISTER_PARAMETER(IntgemmFullyConnectedParam);
+
+namespace {
+// Parse the above fields into indices for parameters.
+// The order is: data weight [scaling] [bias].
+struct ParameterIndices {
+  explicit ParameterIndices(const IntgemmFullyConnectedParam& param) :
+    data(0),
+    weight(1),
+    scaling(param.out_type == mshadow::kFloat32 ? 2 : kInvalid),
+    bias(param.no_bias ? kInvalid : (HaveScaling() ? 3 : 2)),
+    count(2U + HaveScaling() + HaveBias()) {}
+  bool HaveScaling() const { return scaling != kInvalid; }
+  bool HaveBias() const { return bias != kInvalid; }
+  const unsigned int data;
+  const unsigned int weight;
+  const unsigned int scaling;
+  const unsigned int bias;
+  const unsigned int count;
+  static const unsigned int kInvalid = std::numeric_limits<unsigned 
int>::max();
+};
+template<class T> ParameterIndices Sanity(const nnvm::NodeAttrs& attrs,
+                                          T* in,
+                                          T* out) {
+  // 3-4 parameters: A, B, scaling, and optional bias
+  ParameterIndices ret(nnvm::get<IntgemmFullyConnectedParam>(attrs.parsed));
+  CHECK_EQ(in->size(), ret.count);
+  CHECK_EQ(out->size(), 1U);
+  return ret;
+}
+}  // namespace
+
+inline bool IntgemmFullyConnectedOpShape(const nnvm::NodeAttrs& attrs,
+                             mxnet::ShapeVector* in_shape,
+                             mxnet::ShapeVector* out_shape) {
+  const ParameterIndices indices(Sanity(attrs, in_shape, out_shape));
+  const IntgemmFullyConnectedParam& param = 
nnvm::get<IntgemmFullyConnectedParam>(attrs.parsed);
+  // This follows FullyConnectedShape except for scaling.
+  using namespace mshadow;
+  mxnet::TShape dshape = (*in_shape)[indices.data];
+  mxnet::TShape oshape = (*out_shape)[0];
+  // require data to be known
+  if (!mxnet::ndim_is_known(dshape)) return false;
+
+  index_t num_input;
+  if (!param.flatten) {
+    num_input = dshape[dshape.ndim()-1];
+  } else {
+    num_input = dshape.ProdShape(1, dshape.ndim());
+  }
+  SHAPE_ASSIGN_CHECK(*in_shape, indices.weight, Shape2(param.num_hidden, 
num_input));
+  if (indices.HaveScaling()) {
+    SHAPE_ASSIGN_CHECK(*in_shape, indices.scaling, mxnet::TShape(1, 1));
+  }
+  if (indices.HaveBias()) {
+    if (!shape_assign(&(*in_shape)[indices.bias], Shape1(param.num_hidden)) &&
+        !shape_assign(&(*in_shape)[indices.bias], Shape2(param.num_hidden, 
1))) {
+      LOG(FATAL) << "Unexpected shape for bias " << (*in_shape)[indices.bias];
+    }
+  }
+
+  if (!param.flatten) {
+    mxnet::TShape result_shape(dshape);
+    result_shape[dshape.ndim()-1] = param.num_hidden;
+    SHAPE_ASSIGN_CHECK(*out_shape, 0, result_shape);
+  } else {
+    SHAPE_ASSIGN_CHECK(*out_shape, 0, Shape2(dshape[0], param.num_hidden));
+  }
+  if (oshape.ndim() > 0) {
+    dshape[0] = oshape[0];
+    SHAPE_ASSIGN_CHECK(*in_shape, indices.data, dshape);
+  }
+  return true;
+}
+
+bool IntgemmFullyConnectedOpType(const nnvm::NodeAttrs& attrs,
+                            std::vector<int>* in_attrs,
+                            std::vector<int>* out_attrs) {
+  const ParameterIndices indices(Sanity(attrs, in_attrs, out_attrs));
+  const IntgemmFullyConnectedParam& param = 
nnvm::get<IntgemmFullyConnectedParam>(attrs.parsed);
+
+  // Match the configuration for output.
+  TYPE_ASSIGN_CHECK(*out_attrs, 0, param.out_type);
+  if (indices.HaveBias()) {
+    // Bias has same type as output.
+    TYPE_ASSIGN_CHECK(*in_attrs, indices.bias, (*out_attrs)[0]);
+    TYPE_ASSIGN_CHECK(*out_attrs, 0, (*in_attrs)[indices.bias]);
+  }
+  // Scaling is float32.
+  if (indices.HaveScaling()) {
+    TYPE_ASSIGN_CHECK(*in_attrs, indices.scaling, mshadow::kFloat32);
+  }
+  // Users have to prepare B. It wasn't intended to be efficient.
+  TYPE_ASSIGN_CHECK(*in_attrs, indices.weight, mshadow::kInt8);
+  // A can be a float (in which case it is automatically quantized) or int8.
+  if (type_is_none((*in_attrs)[indices.data])) {
+    return false;
+  }
+  return ((*in_attrs)[indices.data] == mshadow::kInt8 ||
+      (*in_attrs)[indices.data] == mshadow::kFloat32);
+}
+
+void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs,
+                          const OpContext& ctx,
+                          const std::vector<TBlob>& inputs,
+                          const std::vector<OpReqType>& req,
+                          const std::vector<TBlob>& outputs) {
+  const ParameterIndices indices(Sanity(attrs, &inputs, &outputs));
+  const IntgemmFullyConnectedParam& param = 
nnvm::get<IntgemmFullyConnectedParam>(attrs.parsed);
+  CHECK_EQ(req.size(), 1U);
+  CHECK_EQ(req[0], kWriteTo) << "TODO: doing more than overwriting for 
intgemm.";
+
+  const TBlob &A = inputs[indices.data], &B = inputs[indices.weight], &C = 
outputs[0];
+
+  CHECK(A.type_flag_ == mshadow::kInt8 || A.type_flag_ == mshadow::kFloat32);
+  CHECK_EQ(B.type_flag_, mshadow::kInt8);
+  CHECK(C.type_flag_ == mshadow::kInt32 || C.type_flag_ == mshadow::kFloat32);
+  CHECK(A.CheckContiguous());
+  CHECK(B.CheckContiguous());
+  CHECK(C.CheckContiguous());
+  CHECK_GE(A.shape_.ndim(), 1);
+  CHECK_GE(B.shape_.ndim(), 2);
+  size_t A_rows = A.shape_.ProdShape(0, A.shape_.ndim() - 1);
+  size_t inner = A.shape_[A.shape_.ndim() - 1];
+  CHECK_EQ(B.shape_[B.shape_.ndim() - 1], inner);
+  size_t B_cols = B.shape_.ProdShape(0, B.shape_.ndim() - 1);
+
+  CHECK_EQ(C.shape_.Size(), A_rows * B_cols);
+
+  bool bias = !param.no_bias;
+  if (bias) {
+    CHECK_EQ(inputs[indices.bias].type_flag_, C.type_flag_);
+    CHECK_EQ(inputs[indices.bias].shape_.Size(), param.num_hidden);
+  }
+  CHECK_EQ(inner % ::intgemm::Int8::tile_info.b_rows, 0) <<
+    "intgemm requires the inner dimension be a multiple of " << 
::intgemm::Int8::tile_info.b_rows;
+  CHECK_EQ(B_cols % ::intgemm::Int8::tile_info.b_cols, 0) <<
+    "intgemm requires B have a multiple of " << 
::intgemm::Int8::tile_info.b_cols <<
+    " columns in the equation C = AB.";
+
+  float out_float_multiplier;
+  if (indices.HaveScaling()) {
+    out_float_multiplier = *inputs[indices.scaling].dptr<float>();
+  } else {
+    out_float_multiplier = 0.0;  // Unused; stop compiler from complaining.
+  }
+
+  int8_t *A_quant;
+  mshadow::Tensor<cpu, 1, int8_t> A_quant_store;
+  if (A.type_flag_ == mshadow::kFloat32) {
+    const float *A_raw = A.dptr<float>();
+    // Quantize A for the user.
+    // Future: allow scale to be passed in? Should the induced scale be an 
output?
+    float scale = 127.0 / ::intgemm::MaxAbsolute(A_raw, A_raw + 
A.shape_.Size());
+    out_float_multiplier /= scale;
+    A_quant_store = ctx.requested[0].get_space_typed<cpu, 1, int8_t>(
+        mshadow::Shape1(A.shape_.Size()),
+        ctx.get_stream<cpu>());
+    A_quant = A_quant_store.dptr_;
+    ::intgemm::Int8::PrepareA(A_raw, A_quant, scale, A_rows, inner);
+  } else {
+    CHECK_EQ(A.type_flag_, mshadow::kInt8);
+    A_quant = A.dptr<int8_t>();
+  }
+  const int8_t *B_quant = B.dptr<int8_t>();
+  CHECK_EQ(reinterpret_cast<intptr_t>(A_quant) % 64, 0) <<
+    "Pointers should be aligned to a multiple of 64.";
+  CHECK_EQ(reinterpret_cast<intptr_t>(B_quant) % 64, 0) <<
+    "Pointers should be aligned to a multiple of 64.";
+  if (C.type_flag_ == mshadow::kFloat32) {
+    CHECK_EQ(reinterpret_cast<intptr_t>(C.dptr<float>()) % 64, 0) <<
+      "Pointers should be aligned to a multiple of 64.";
+  } else {
+    CHECK_EQ(reinterpret_cast<intptr_t>(C.dptr<int32_t>()) % 64, 0) <<
+      "Pointers should be aligned to a multiple of 64.";
+  }
+
+  if (bias) {
+    if (C.type_flag_ == mshadow::kFloat32) {
+      CHECK_EQ(reinterpret_cast<intptr_t>(inputs[indices.bias].dptr<float>()) 
% 64, 0) <<
+        "Pointers should be aligned to a multiple of 64.";
+      ::intgemm::callbacks::UnquantizeAndAddBiasAndWrite cb(
+          out_float_multiplier,
+          inputs[indices.bias].dptr<float>(),
+          C.dptr<float>());
+      ::intgemm::Int8::Multiply(A_quant, B_quant, A_rows, inner, B_cols, cb);
+    } else {
+      // int32
+      
CHECK_EQ(reinterpret_cast<intptr_t>(inputs[indices.bias].dptr<int32_t>()) % 64, 
0) <<
+        "Pointers should be aligned to a multiple of 64.";
+      ::intgemm::callbacks::AddBiasAndWrite cb(
+          inputs[indices.bias].dptr<int32_t>(),
+          C.dptr<int32_t>());
+      ::intgemm::Int8::Multiply(A_quant, B_quant, A_rows, inner, B_cols, cb);
+    }
+  } else {
+    if (C.type_flag_ == mshadow::kFloat32) {
+      ::intgemm::callbacks::UnquantizeAndWrite cb(out_float_multiplier, 
C.dptr<float>());
+      ::intgemm::Int8::Multiply(A_quant, B_quant, A_rows, inner, B_cols, cb);
+    } else {
+      // int32
+      ::intgemm::callbacks::Write<int32_t> cb(C.dptr<int32_t>());
+      ::intgemm::Int8::Multiply(A_quant, B_quant, A_rows, inner, B_cols, cb);
+    }
+  }
+}
+
+NNVM_REGISTER_OP(_contrib_intgemm_fully_connected)
+.add_alias("_npx_intgemm_fully_connected")
+.describe(R"code(Multiply matrices using 8-bit integers.  data * weight.
+
+Input tensor arguments are: data weight [scaling] [bias]
+
+data: either float32 or prepared using intgemm_prepare_data (in which case it 
is int8).
+
+weight: must be prepared using intgemm_prepare_weight.
+
+scaling: present if and only if out_type is float32. If so this is multiplied 
by the result before adding bias. Typically:
+scaling = (max passed to intgemm_prepare_weight)/127.0 if data is in float32
+scaling = (max passed to intgemm_prepare_data)/127.0 * (max passed to 
intgemm_prepare_weight)/127.0 if data is in int8
+
+bias: present if and only if !no_bias. This is added to the output after 
scaling and has the same number of columns as the output.
+
+out_type: type of the output.
+)code" ADD_FILELINE)
+.set_attr_parser(ParamParser<IntgemmFullyConnectedParam>)
+.set_num_inputs([](const NodeAttrs& attrs) {
+  return 
ParameterIndices(nnvm::get<IntgemmFullyConnectedParam>(attrs.parsed)).count;
+})
+.set_num_outputs(1)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+    std::vector<std::string> ret{"data", "weight"};
+    ParameterIndices 
indices(nnvm::get<IntgemmFullyConnectedParam>(attrs.parsed));
+    if (indices.HaveScaling()) {
+      ret.emplace_back("scaling");
+    }
+    if (indices.HaveBias()) {
+      ret.emplace_back("bias");
+    }
+    return ret;
+  })
+.set_attr<FResourceRequest>("FResourceRequest",
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
+.set_attr<mxnet::FInferShape>("FInferShape", IntgemmFullyConnectedOpShape)
+.set_attr<nnvm::FInferType>("FInferType", IntgemmFullyConnectedOpType)
+.set_attr<FCompute>("FCompute<cpu>", IntgemmFullyConnectedOpForwardCPU)
+.add_argument(
+    "data",
+    "NDArray-or-Symbol",
+    "First argument to multiplication. Tensor of float32 (quantized on the 
fly) or int8 from "
+      "intgemm_prepare_data. If you use a different quantizer, be sure to ban 
-128. The last "
+      "dimension must be a multiple of 64.")
+.add_argument(
+    "weight",
+    "NDArray-or-Symbol",
+    "Second argument to multiplication. Tensor of int8 from 
intgemm_prepare_weight. The last "
+      "dimension must be a multiple of 64.  The product of non-last dimensions 
must be a multiple "
+      "of 8.")
+.add_argument("scaling", "NDArray-or-Symbol", "Scaling factor to apply if 
output type is float32.")
+.add_argument("bias", "NDArray-or-Symbol", "Bias term.")
+// TODO(Xinyu): a temp solution to enable GluonCV INT8 flow,
+// will be reverted after the improvement of CachedOP is done.
+.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes)
+.add_arguments(IntgemmFullyConnectedParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/contrib/intgemm/max_absolute_op.cc 
b/src/operator/contrib/intgemm/max_absolute_op.cc
new file mode 100644
index 0000000..01e10b0
--- /dev/null
+++ b/src/operator/contrib/intgemm/max_absolute_op.cc
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file max_absolute_op.cc
+ * \brief Computes maximum absolute value of a tensor using intgemm
+ */
+
+#include <mxnet/operator_util.h>
+#include <vector>
+#include "../../mshadow_op.h"
+#include "../../mxnet_op.h"
+#include "../../operator_common.h"
+#include "../../tensor/init_op.h"
+
+#include "intgemm/intgemm.h"
+
+namespace mxnet {
+namespace op {
+
+inline bool MaxAbsoluteOpShape(const nnvm::NodeAttrs& attrs,
+                             mxnet::ShapeVector* in_attrs,
+                             mxnet::ShapeVector* out_attrs) {
+  // One in, one out.
+  CHECK_EQ(in_attrs->size(), 1U);
+  CHECK_EQ(out_attrs->size(), 1U);
+
+  SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(1, 1));
+  return shape_is_known(in_attrs->at(0));
+}
+
+inline bool MaxAbsoluteOpType(const nnvm::NodeAttrs& attrs,
+                            std::vector<int>* in_attrs,
+                            std::vector<int>* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1U);
+  CHECK_EQ(out_attrs->size(), 1U);
+
+  TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kFloat32);
+  TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kFloat32);
+  return true;
+}
+
+inline bool MaxAbsoluteOpStorageType(const nnvm::NodeAttrs& attrs,
+                                   const int dev_mask,
+                                   DispatchMode* dispatch_mode,
+                                   std::vector<int>* in_attrs,
+                                   std::vector<int>* out_attrs) {
+  *dispatch_mode = DispatchMode::kFCompute;
+  CHECK_EQ(in_attrs->size(), 1U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  (*out_attrs)[0] = kDefaultStorage;
+  return true;
+}
+
+void MaxAbsoluteOpForwardCPU(const nnvm::NodeAttrs& attrs,
+                             const OpContext& ctx,
+                             const std::vector<TBlob>& inputs,
+                             const std::vector<OpReqType>& req,
+                             const std::vector<TBlob>& outputs) {
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  const TBlob &in = inputs.front(), &out = outputs.front();
+  CHECK_EQ(in.type_flag_, mshadow::kFloat32);
+  CHECK_EQ(out.type_flag_, mshadow::kFloat32);
+  CHECK(in.CheckContiguous());
+  CHECK(out.CheckContiguous());
+
+  const std::size_t size = in.shape_.Size();
+
+  const float *data = in.dptr<float>();
+  // To maintain alignment, be a multiple of AVX512 register size.
+  const std::size_t kMultiple = 512 / 8;
+  CHECK_EQ(reinterpret_cast<intptr_t>(data) % kMultiple, 0)
+    << "Data must be aligned to " << kMultiple << " bytes.";
+
+  float result = ::intgemm::MaxAbsolute(data, data + size);
+  KERNEL_ASSIGN(*out.dptr<float>(), req[0], result);
+}
+
+NNVM_REGISTER_OP(_contrib_intgemm_maxabsolute)
+.add_alias("_npx_intgemm_maxabsolute")
+.describe(R"code(Compute the maximum absolute value in a tensor of float32 
fast on a CPU.  The tensor's total size must be a multiple of 16 and aligned to 
a multiple of 64 bytes.
+mxnet.nd.contrib.intgemm_maxabsolute(arr) == arr.abs().max()
+)code" ADD_FILELINE)
+.set_num_inputs(1)
+.set_num_outputs(1)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+    return std::vector<std::string>{"data"};
+  })
+.set_attr<mxnet::FInferShape>("FInferShape", MaxAbsoluteOpShape)
+.set_attr<nnvm::FInferType>("FInferType", MaxAbsoluteOpType)
+.set_attr<FInferStorageType>("FInferStorageType", MaxAbsoluteOpStorageType)
+.set_attr<FCompute>("FCompute<cpu>", MaxAbsoluteOpForwardCPU)
+.set_attr<nnvm::FInplaceOption>("FInplaceOption",
+  [](const NodeAttrs& attrs) {
+    return std::vector<std::pair<int, int> >{{0, 0}};
+  })
+.add_argument("data", "NDArray-or-Symbol", "Tensor to compute maximum absolute 
value of");
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/contrib/intgemm/prepare_data_op.cc 
b/src/operator/contrib/intgemm/prepare_data_op.cc
new file mode 100644
index 0000000..1d5719d
--- /dev/null
+++ b/src/operator/contrib/intgemm/prepare_data_op.cc
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file prepare_data_op.cc
+ * \brief Converts data aka A matrices (typically activations) to intgemm's
+ * representation for A in C=AB.  This just quantizes to int8 and bans -128.
+ * The only difference from Quantize/QuantizeV2 is that it bans -128.
+ */
+
+#include <mxnet/operator_util.h>
+#include <vector>
+#include "../../mshadow_op.h"
+#include "../../mxnet_op.h"
+#include "../../operator_common.h"
+#include "../../tensor/init_op.h"
+
+#include "intgemm/intgemm.h"
+
+namespace mxnet {
+namespace op {
+
+bool PrepareDataOpShape(const nnvm::NodeAttrs& attrs,
+                    mxnet::ShapeVector* in_attrs,
+                    mxnet::ShapeVector* out_attrs) {
+  // data and maximum
+  CHECK_EQ(in_attrs->size(), 2U);
+  CHECK_EQ(out_attrs->size(), 1U);
+
+  SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
+  SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
+
+  SHAPE_ASSIGN_CHECK(*in_attrs, 1, mxnet::TShape(1, 1));
+
+  return shape_is_known(out_attrs->at(0));
+}
+
+bool PrepareDataOpType(const nnvm::NodeAttrs& attrs,
+                   std::vector<int>* in_attrs,
+                   std::vector<int>* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 2U);
+  CHECK_EQ(out_attrs->size(), 1U);
+
+  // This routine converts from float to int8 with a scaling factor
+  TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kFloat32);
+  TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kFloat32);
+  TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8);
+  return true;
+}
+
+bool PrepareDataOpStorageType(const nnvm::NodeAttrs& attrs,
+                          const int dev_mask,
+                          DispatchMode* dispatch_mode,
+                          std::vector<int>* in_attrs,
+                          std::vector<int>* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 2U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, kDefaultStorage);
+  STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, 0, kDefaultStorage);
+  STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, 1, kDefaultStorage);
+  DISPATCH_MODE_ASSIGN_CHECK(dispatch_mode, 0, DispatchMode::kFComputeEx);
+  return true;
+}
+
+void PrepareDataOpForwardCPU(const nnvm::NodeAttrs& attrs,
+                          const OpContext& ctx,
+                          const std::vector<TBlob>& inputs,
+                          const std::vector<OpReqType>& req,
+                          const std::vector<TBlob>& outputs) {
+  CHECK_EQ(inputs.size(), 2U);
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  CHECK_EQ(req[0], kWriteTo) << "intgemm only overwrites";
+  const TBlob &in = inputs[0], &out = outputs[0];
+
+  CHECK_EQ(in.type_flag_, mshadow::kFloat32);
+  CHECK_EQ(out.type_flag_, mshadow::kInt8);
+  CHECK(in.CheckContiguous());
+  CHECK(out.CheckContiguous());
+
+  const float *A = in.dptr<float>();
+  int8_t *quantA = out.dptr<int8_t>();
+  CHECK_EQ(reinterpret_cast<intptr_t>(A) % 64, 0);
+  CHECK_EQ(reinterpret_cast<intptr_t>(quantA) % 64, 0);
+  const float multiplier = 127.0 / *inputs[1].dptr<float>();
+  ::intgemm::Int8::Quantize(A, quantA, multiplier, in.shape_.Size());
+}
+
+NNVM_REGISTER_OP(_contrib_intgemm_prepare_data)
+.add_alias("_npx_intgemm_prepare_data")
+.describe(R"code(This operator quantizes float32 to int8 while also 
banning -128.
+
+It is suitable for preparing a data matrix for use by intgemm's C=data * 
weights operation.
+
+The float32 values are scaled such that maxabs maps to 127. Typically maxabs = 
maxabsolute(A).
+)code" ADD_FILELINE)
+.set_num_inputs(2)
+.set_num_outputs(1)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+    return std::vector<std::string>{"data", "maxabs"};
+  })
+.set_attr<mxnet::FInferShape>("FInferShape", PrepareDataOpShape)
+.set_attr<nnvm::FInferType>("FInferType", PrepareDataOpType)
+.set_attr<FInferStorageType>("FInferStorageType", PrepareDataOpStorageType)
+.set_attr<FCompute>("FCompute<cpu>", PrepareDataOpForwardCPU)
+.add_argument("data", "NDArray-or-Symbol", "Activation matrix to be prepared 
for multiplication.")
+.add_argument(
+    "maxabs",
+    "NDArray-or-Symbol",
+    "Maximum absolute value to be used for scaling.  (The values will be 
multiplied by 127.0 / "
      "maxabs.)")
+// TODO(Xinyu): a temp solution to enable GluonCV INT8 flow,
+// will be reverted after the improvement of CachedOP is done.
+.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/contrib/intgemm/prepare_weight_op.cc 
b/src/operator/contrib/intgemm/prepare_weight_op.cc
new file mode 100644
index 0000000..ad106eb
--- /dev/null
+++ b/src/operator/contrib/intgemm/prepare_weight_op.cc
@@ -0,0 +1,180 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file prepare_weight_op.cc
+ * \brief Converts weight matrices to intgemm's representation.
+ */
+
+#include <mxnet/operator_util.h>
+#include <vector>
+#include "../../mshadow_op.h"
+#include "../../mxnet_op.h"
+#include "../../operator_common.h"
+#include "../../tensor/init_op.h"
+
+#include "intgemm/intgemm.h"
+
+namespace mxnet {
+namespace op {
+
+struct PrepareWeightParam : public dmlc::Parameter<PrepareWeightParam> {
+  bool already_quantized;
+  DMLC_DECLARE_PARAMETER(PrepareWeightParam) {
+    DMLC_DECLARE_FIELD(already_quantized).set_default(false)
+    .describe("Is the weight matrix already quantized?");
+  }
+};
+DMLC_REGISTER_PARAMETER(PrepareWeightParam);
+
+bool PrepareWeightOpShape(const nnvm::NodeAttrs& attrs,
+                    mxnet::ShapeVector* in_attrs,
+                    mxnet::ShapeVector* out_attrs) {
+  // Optimal maximum parameter.
+  CHECK_GE(in_attrs->size(), 1U) << "Need at least weight to quantize.";
+  CHECK_LE(in_attrs->size(), 2U) << "weight and maximum for scaling.";
+  CHECK_EQ(out_attrs->size(), 1U);
+
+  SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
+  SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
+
+  if (in_attrs->size() == 2U) {
+    SHAPE_ASSIGN_CHECK(*in_attrs, 1, mxnet::TShape(1, 1));
+  }
+  return shape_is_known(out_attrs->at(0));
+}
+
+bool PrepareWeightOpType(const nnvm::NodeAttrs& attrs,
+                   std::vector<int>* in_attrs,
+                   std::vector<int>* out_attrs) {
+  TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8);
+  CHECK_GE(in_attrs->size(), 1U) << "Need at least weight to quantize.";
+  CHECK_LE(in_attrs->size(), 2U) << "weight and maximum for scaling.";
+  if (in_attrs->size() == 1U) {
+    TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kInt8);
+  } else if (in_attrs->size() == 2U) {
+    TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kFloat32);
+    TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kFloat32);
+  }
+  return true;
+}
+
+bool PrepareWeightOpStorageType(const nnvm::NodeAttrs& attrs,
+                          const int dev_mask,
+                          DispatchMode* dispatch_mode,
+                          std::vector<int>* in_attrs,
+                          std::vector<int>* out_attrs) {
+  CHECK_GE(in_attrs->size(), 1U) << "Need at least weight to quantize.";
+  CHECK_LE(in_attrs->size(), 2U) << "weight and maximum for scaling.";
+  CHECK_EQ(out_attrs->size(), 1U);
+  STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, kDefaultStorage);
+  STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, 0, kDefaultStorage);
+  if (in_attrs->size() == 2U) {
+    STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, 1, kDefaultStorage);
+  }
+  DISPATCH_MODE_ASSIGN_CHECK(dispatch_mode, 0, DispatchMode::kFComputeEx);
+  return true;
+}
+
+void PrepareWeightOpForwardCPU(const nnvm::NodeAttrs& attrs,
+                          const OpContext& ctx,
+                          const std::vector<TBlob>& inputs,
+                          const std::vector<OpReqType>& req,
+                          const std::vector<TBlob>& outputs) {
+  const PrepareWeightParam& params = 
nnvm::get<PrepareWeightParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), params.already_quantized ? 1U : 2U);
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  CHECK_EQ(req[0], kWriteTo) << "intgemm only overwrites";
+
+  const TBlob &in = inputs.front();
+  const TBlob &out = outputs.front();
+  CHECK_EQ(out.type_flag_, mshadow::kInt8);
+  CHECK(in.CheckContiguous());
+  CHECK(out.CheckContiguous());
+  size_t B_cols = in.shape_.ProdShape(0, in.shape_.ndim() - 1);
+  size_t inner = in.shape_[in.shape_.ndim() - 1];
+  CHECK_EQ(inner % ::intgemm::Int8::tile_info.b_rows, 0) <<
+    "intgemm requires the inner dimension be a multiple of " << 
::intgemm::Int8::tile_info.b_rows;
+  CHECK_EQ(B_cols % ::intgemm::Int8::tile_info.b_cols, 0) <<
+    "intgemm requires the output dimension (the product of all but the last 
dimension of the "
+    "weight matrix) to be a multiple of " << ::intgemm::Int8::tile_info.b_cols 
<< ".";
+
+  int8_t *quantB = out.dptr<int8_t>();
+  CHECK_EQ(reinterpret_cast<intptr_t>(quantB) % 64, 0) <<
+    "Pointers should be aligned to a multiple of 64.";
+  CHECK(in.type_flag_ == mshadow::kFloat32 || in.type_flag_ == mshadow::kInt8) 
<<
+    "Expected either 32-bit values to be quantized or 8-bit values to 
rearrange.";
+  if (in.type_flag_ == mshadow::kInt8) {
+    const int8_t *B = in.dptr<int8_t>();
+    CHECK_EQ(reinterpret_cast<intptr_t>(B) % 64, 0) <<
+      "Pointers should be aligned to a multiple of 64.";
+    ::intgemm::Int8::PrepareBQuantizedTransposed(B, quantB, inner, B_cols);
+  } else if (in.type_flag_ == mshadow::kFloat32) {
+    const float *B = in.dptr<float>();
+    CHECK_EQ(reinterpret_cast<intptr_t>(B) % 64, 0) <<
+      "Pointers should be aligned to a multiple of 64.";
+    ::intgemm::Int8::PrepareBTransposed(
+        B,
+        quantB,
+        127.0 / *inputs[1].dptr<float>(),
+        inner,
+        B_cols);
+  }
+}
+
+NNVM_REGISTER_OP(_contrib_intgemm_prepare_weight)
+.add_alias("_npx_intgemm_prepare_weight")
+.describe(R"code(This operator converts a weight matrix in column-major format 
to intgemm's internal fast representation of weight matrices.  MXNet 
customarily stores weight matrices in column-major (transposed) format. This 
operator is not meant to be fast; it is meant to be run offline to quantize a 
model.
+
+In other words, it prepares weight for the operation C = data * weight^T.
+
+If the provided weight matrix is float32, it will be quantized first.  The 
quantization function is (int8_t)(127.0 / max * weight) where max is 
provided as argument 1 (the weight matrix is argument 0).  Then the matrix will 
be rearranged into the CPU-dependent format.
+
+If the provided weight matrix is already int8, the matrix will only be 
rearranged into the CPU-dependent format.  This way one can quantize with 
intgemm_prepare_data (which just quantizes), store to disk in a consistent 
format, then at load time convert to CPU-dependent format with 
intgemm_prepare_weight.
+
+The internal representation depends on register length.  So AVX512, AVX2, and 
SSSE3 have different formats.  AVX512BW and AVX512VNNI have the same 
representation.
+)code" ADD_FILELINE)
+.set_attr_parser(ParamParser<PrepareWeightParam>)
+.set_num_inputs([](const NodeAttrs& attrs) {
+  const PrepareWeightParam& params = 
nnvm::get<PrepareWeightParam>(attrs.parsed);
+  return params.already_quantized ? 1 : 2;
+})
+.set_num_outputs(1)
+.set_attr<nnvm::FListInputNames>("FListInputNames", [](const NodeAttrs& attrs) 
{
+  const PrepareWeightParam& params = 
nnvm::get<PrepareWeightParam>(attrs.parsed);
+  return params.already_quantized ?
+    std::vector<std::string>{"weight"} : std::vector<std::string>{"weight", 
"maxabs"};
+})
+.set_attr<mxnet::FInferShape>("FInferShape", PrepareWeightOpShape)
+.set_attr<nnvm::FInferType>("FInferType", PrepareWeightOpType)
+.set_attr<FInferStorageType>("FInferStorageType", PrepareWeightOpStorageType)
+.set_attr<FCompute>("FCompute<cpu>", PrepareWeightOpForwardCPU)
+.add_argument("weight", "NDArray-or-Symbol", "Parameter matrix to be prepared 
for multiplication.")
+.add_argument(
+    "maxabs",
+    "NDArray-or-Symbol",
    "Maximum absolute value for scaling. The weights will be multiplied by 
127.0 / maxabs.")
+// TODO(Xinyu): a temp solution to enable GluonCV INT8 flow,
+// will be reverted after the improvement of CachedOP is done.
+.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes)
+.add_arguments(PrepareWeightParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/contrib/intgemm/take_weight_op.cc 
b/src/operator/contrib/intgemm/take_weight_op.cc
new file mode 100644
index 0000000..09e320e
--- /dev/null
+++ b/src/operator/contrib/intgemm/take_weight_op.cc
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file take_weight_op.cc
+ * \brief Takes from the all-but-last dimension of a tensor stored in
+ * intgemm's weight format.  This is particularly useful for output matrices 
where
+ * some outputs are excluded.
+ */
+
+#include <mxnet/operator_util.h>
+#include <vector>
+#include "../../mshadow_op.h"
+#include "../../mxnet_op.h"
+#include "../../operator_common.h"
+#include "../../tensor/init_op.h"
+
+#include "intgemm/intgemm.h"
+
+namespace mxnet {
+namespace op {
+
+inline bool TakeWeightOpShape(const nnvm::NodeAttrs& shape,
+                             mxnet::ShapeVector* in_shape,
+                             mxnet::ShapeVector* out_shape) {
+  // 0 is weight, 1 is indices.
+  CHECK_EQ(in_shape->size(), 2U);
+  CHECK_EQ(out_shape->size(), 1U);
+
+  mxnet::TShape &weight = (*in_shape)[0];
+  mxnet::TShape &indices = (*in_shape)[1];
+  mxnet::TShape &out = (*out_shape)[0];
+
+  // weight matrices should be 2-dimensional by now.
+  SHAPE_ASSIGN_CHECK(*in_shape, 0, mxnet::TShape(2, -1));
+  SHAPE_ASSIGN_CHECK(*out_shape, 0, mxnet::TShape(2, -1));
+  // indices are 1-dimensional.
+  SHAPE_ASSIGN_CHECK(*in_shape, 1, mxnet::TShape(1, -1));
+
+  SHAPE_ASSIGN_CHECK(*out_shape, 0, mxnet::TShape({indices[0], weight[1]}));
+  SHAPE_ASSIGN_CHECK(*in_shape, 0, mxnet::TShape({-1, out[1]}));
+  SHAPE_ASSIGN_CHECK(*in_shape, 1, mxnet::TShape({out[0]}));
+
+  return shape_is_known(weight) && shape_is_known(indices) && 
shape_is_known(out);
+}
+
+inline bool TakeWeightOpType(const nnvm::NodeAttrs& attrs,
+                             std::vector<int>* in_attrs,
+                             std::vector<int>* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 2U);
+  CHECK_EQ(out_attrs->size(), 1U);
+
+  TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8);
+  TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kInt8);
+  TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kInt32);
+  return true;
+}
+
+inline bool TakeWeightOpStorageType(const nnvm::NodeAttrs& attrs,
+                                    const int dev_mask,
+                                    DispatchMode* dispatch_mode,
+                                    std::vector<int>* in_attrs,
+                                    std::vector<int>* out_attrs) {
+  *dispatch_mode = DispatchMode::kFCompute;
+  CHECK_EQ(in_attrs->size(), 2U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  (*out_attrs)[0] = kDefaultStorage;
+  return true;
+}
+
+void TakeWeightOpForwardCPU(const nnvm::NodeAttrs& attrs,
+                            const OpContext& ctx,
+                            const std::vector<TBlob>& inputs,
+                            const std::vector<OpReqType>& req,
+                            const std::vector<TBlob>& outputs) {
+  CHECK_EQ(inputs.size(), 2U);
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  CHECK_EQ(req[0], kWriteTo) << "TODO request types other than write";
+  const TBlob &weight = inputs.front(), &indices = inputs[1], &out = 
outputs.front();
+  CHECK_EQ(weight.type_flag_, mshadow::kInt8);
+  CHECK_EQ(indices.type_flag_, mshadow::kInt32);
+  CHECK_EQ(out.type_flag_, mshadow::kInt8);
+  CHECK(weight.CheckContiguous());
+  CHECK(indices.CheckContiguous());
+  CHECK(out.CheckContiguous());
+  size_t B_cols = indices.shape_[0];
+  size_t inner = weight.shape_[weight.shape_.ndim() - 1];
+  CHECK_EQ(inner % ::intgemm::Int8::tile_info.b_rows, 0) <<
+    "intgemm requires the inner dimension be a multiple of " << 
::intgemm::Int8::tile_info.b_rows;
+  CHECK_EQ(B_cols % ::intgemm::Int8::tile_info.b_cols, 0) <<
+    "For efficiency, intgemm requires there to be a multiple of " <<
+    ::intgemm::Int8::tile_info.b_cols << " indices.";
+  // mxnet doesn't have a uint32_t type so we'll just pointer cast. But check 
the sizes are the
+  // same.  Ideally this should be static.
+  assert(sizeof(int32_t) == sizeof(::intgemm::Index));
+  const ::intgemm::Index *index =
+    reinterpret_cast<const ::intgemm::Index*>(indices.dptr<int32_t>());
+
+  ::intgemm::Int8::SelectColumnsB(
+      weight.dptr<int8_t>(),
+      out.dptr<int8_t>(),
+      inner,
+      index,
+      index + B_cols);
+}
+
+NNVM_REGISTER_OP(_contrib_intgemm_take_weight)
+.add_alias("_npx_intgemm_take_weight")
+.describe(R"code(Index a weight matrix stored in intgemm's weight format.
+The indices select the outputs of matrix multiplication, not the inner dot 
product dimension.
+)code" ADD_FILELINE)
+.set_num_inputs(2)
+.set_num_outputs(1)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+    return std::vector<std::string>{"weight", "indices"};
+  })
+.set_attr<mxnet::FInferShape>("FInferShape", TakeWeightOpShape)
+.set_attr<nnvm::FInferType>("FInferType", TakeWeightOpType)
+.set_attr<FInferStorageType>("FInferStorageType", TakeWeightOpStorageType)
+.set_attr<FCompute>("FCompute<cpu>", TakeWeightOpForwardCPU)
+.add_argument(
+    "weight",
+    "NDArray-or-Symbol",
+    "Tensor already in intgemm weight format to select from")
+.add_argument("indices", "NDArray-or-Symbol", "indices to select on the 0th 
dimension of weight");
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/storage/cpu_device_storage.h b/src/storage/cpu_device_storage.h
index f6b296a..50cc4ab 100644
--- a/src/storage/cpu_device_storage.h
+++ b/src/storage/cpu_device_storage.h
@@ -53,7 +53,7 @@ class CPUDeviceStorage {
   /*!
    * \brief Alignment of allocation.
    */
-#if MXNET_USE_MKLDNN == 1
+#if MXNET_USE_MKLDNN == 1 || MXNET_USE_INTGEMM == 1
   // MKLDNN requires special alignment. 64 is used by the MKLDNN library in
   // memory allocation.
   static constexpr size_t alignment_ = kMKLDNNAlign;
diff --git a/tests/python/unittest/test_contrib_intgemm.py 
b/tests/python/unittest/test_contrib_intgemm.py
new file mode 100644
index 0000000..69fa5e0
--- /dev/null
+++ b/tests/python/unittest/test_contrib_intgemm.py
@@ -0,0 +1,221 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import mxnet as mx
+from mxnet import np, npx
+from mxnet.test_utils import same, use_np, assert_almost_equal
+from common import with_seed
+import random
+from itertools import product
+
+
+# with_seed() from MXNet 1.x breaks @pytest.mark.parametrize so all randomized
+# tests use a for loop over a Cartesian product of parameters.
+
+@use_np
+@with_seed()
+def test_contrib_intgemm_maxabsolute():
+    if "intgemm_maxabsolute" not in dir(mx.nd.contrib):
+        return
+    for shape in ([(3, 2), (9,17), (2, 7, 1, 8)] + [(i,) for i in 
range(1,65)]):
+        # mx.nd API
+        m = mx.nd.random_uniform(low=-100.0, high=100.0, shape=shape)
+        fast = mx.nd.contrib.intgemm_maxabsolute(m)
+        slow = mx.nd.max(mx.nd.abs(m))
+        assert same(fast, slow)
+        # np API
+        m = np.random.uniform(low=-100.0, high=100.0, size=shape)
+        fast = npx.intgemm_maxabsolute(m).reshape(())
+        slow = np.max(np.abs(m))
+        assert same(fast, slow)
+    
+@use_np
+@with_seed()
+def test_contrib_intgemm_prepare_data():
+    if "intgemm_prepare_data" not in dir(mx.nd.contrib):
+        return
+    for shape, max_quant in product([(i,) for i in range(1, 67)] + [(2,3), 
(130, 12)], [2.0, 2.5]):
+        m = mx.nd.random_uniform(low=-3.0, high=3.0, shape=shape)
+        scaled = m * 127.0 / max_quant
+        # Rounding 0.5 can go up or down.  Move values away from 0.5.
+        too_close = mx.nd.abs(mx.nd.round(scaled) - scaled) > 0.45
+        m += max_quant / 127.0 * 0.05 * too_close
+    
+        # Reference: scale and round
+        ref = mx.nd.round(m * 127.0 / max_quant)
+        # Clip to [-127, 127].  Because otherwise e.g. -129 casts to +127.
+        ref = mx.nd.broadcast_maximum(ref, mx.nd.array([-127.0]))
+        ref = mx.nd.broadcast_minimum(ref, mx.nd.array([127.0]))
+        # Reference: cast to int8
+        ref = mx.nd.cast(ref, dtype='int8')
+        # Reference: ban -128
+        ref = mx.nd.broadcast_maximum(ref, mx.nd.array([-127], dtype = 'int8'))
+    
+        test = mx.nd.contrib.intgemm_prepare_data(m, mx.nd.array([max_quant]))
+        assert same(test, ref)
+        test = npx.intgemm_prepare_data(m.as_np_ndarray(), 
np.array([max_quant]))
+        assert same(test, ref.as_np_ndarray())
+    
+@use_np
+@with_seed()
+def test_contrib_intgemm_weight_consistent():
+    # The weight format is actually CPU-dependent so we don't directly test the
+    # output, but indirectly test that it works.
+    if "intgemm_prepare_weight" not in dir(mx.nd.contrib):
+        return
+    for shape, max_quant, api in product(
+            [(8, 64), (16, 64), (8, 128), (16, 128), (2, 4, 64)],
+            [0.2, 3.0],
+            [(mx.nd.contrib, mx.nd), (npx, np)]):
+        contrib, top = api
+        max_array = top.array([max_quant])
+        if top == mx.nd:
+            m = top.random_uniform(low=-3.0, high=3.0, shape=shape)
+        else:
+            m = np.random.uniform(size=shape)
+        direct = contrib.intgemm_prepare_weight(m, max_array)
+        quant = contrib.intgemm_prepare_data(m, max_array) 
+        indirect = contrib.intgemm_prepare_weight(quant, 
already_quantized=True)
+        # Should get the same data from direct call and already_quantized 
version.
+        assert same(direct, indirect)
+    
+@use_np
+@with_seed()
+def test_contrib_intgemm_take_weight():
+    if "intgemm_take_weight" not in dir(mx.nd.contrib):
+        return
+    test_indices = [
+        [0,1,2,3,4,5,6,7],
+        [1,2,1,2,1,2,1,2],
+        [7,6,5,4,3,2,1,0],
+        [3,1,4,1,5,9,2,6],
+        # Since random_uniform doesn't support int8, use python
+        [random.randint(0,15) for i in range(8)],
+        [random.randint(0,15) for i in range(16)],
+        [random.randint(0,15) for i in range(24)]
+    ]
+    for indices, api in product(test_indices, [(mx.nd.contrib, mx.nd), (npx, 
np)]):
+        contrib, top = api
+        m = top.array([random.randint(-127,127) for i in range(16 * 64)], 
dtype='int8')
+        m = m.reshape((16, 64))
+        indices = top.array(indices, dtype='int32')
+        # Prepare weight then take.
+        test = contrib.intgemm_prepare_weight(m, already_quantized=True)
+        test = contrib.intgemm_take_weight(test, indices)
+        # Take then prepare.
+        ref = m.take(indices, axis=0)
+        ref = contrib.intgemm_prepare_weight(ref, already_quantized=True)
+        assert same(test, ref)
+    
+@use_np
+def test_contrib_intgemm_multiply():
+    if "intgemm_fully_connected" not in dir(mx.nd.contrib):
+        return
+    apis = [(mx.nd.contrib, mx.nd, mx.nd.FullyConnected, mx.nd.cast), (npx, 
np, npx.fully_connected, npx.cast)]
+    for data_rows, inner, weight_cols, api in product(range(1, 5),
+                                                      range(64, 256, 64),
+                                                      range(8, 24, 8),
+                                                      apis):
+        contrib, top, fully_connected, cast = api
+        #The multiplication routine has approximations so everything is tested
+        #deterministically to ensure bounds are met.
+        random.seed(1)
+    
+        # Don't use full range (-127, 127) to avoid saturation.
+        data = [random.randint(-64, 64) for i in range(data_rows * inner)]
+        data = top.array(data, dtype='int8').reshape((data_rows, inner))
+        weight = [random.randint(-64, 64) for i in range(inner * weight_cols)]
+        weight = top.array(weight, dtype='int8').reshape((weight_cols, inner))
+        weight_prepared = contrib.intgemm_prepare_weight(weight, 
already_quantized=True)
+    
+        # int32 output, no bias
+        test = contrib.intgemm_fully_connected(data,
+                                               weight_prepared,
+                                               no_bias=True,
+                                               flatten=False,
+                                               out_type='int32',
+                                               num_hidden=weight_cols)
+        ref = fully_connected(cast(data, dtype='float32'),
+                              cast(weight, dtype='float32'),
+                              no_bias=True,
+                              flatten=False,
+                              num_hidden=weight_cols)
+        assert_almost_equal(cast(test, dtype='float32').as_nd_ndarray(), 
ref.as_nd_ndarray(), rtol=0.01, atol=0.01)
+    
+        # float32 output, no bias
+        scale = 3.0
+        test = contrib.intgemm_fully_connected(data,
+                                               weight_prepared,
+                                               top.array([scale]),
+                                               no_bias=True,
+                                               flatten=False,
+                                               out_type='float32',
+                                               num_hidden=weight_cols)
+        assert_almost_equal(test.as_nd_ndarray(), (ref * 
scale).as_nd_ndarray(), rtol=0.01, atol=0.01)
+    
+        # int32 output, bias
+        bias = top.array([random.randint(-60000, 60000) for i in 
range(weight_cols)], dtype = 'int32')
+        test = contrib.intgemm_fully_connected(data,
+                                               weight_prepared,
+                                               bias,
+                                               no_bias=False,
+                                               flatten=False,
+                                               out_type='int32',
+                                               num_hidden=weight_cols)
+        ref = fully_connected(cast(data, dtype='float32'),
+                                   cast(weight, dtype='float32'),
+                                   cast(bias, dtype='float32'),
+                                   no_bias=False,
+                                   flatten=False,
+                                   num_hidden=weight_cols)
+        assert_almost_equal(cast(test, dtype='float32').as_nd_ndarray(), 
ref.as_nd_ndarray(), rtol=0.01, atol=0.01)
+    
+        # float32 output, bias
+        # Scaling is applied before bias (and bias is not scaled). So to make 
the
+        # reference comparison easy, just scale the bias beforehand.
+        test = contrib.intgemm_fully_connected(data,
+                                               weight_prepared,
+                                               top.array([scale]),
+                                               cast(bias, dtype='float32') * 
scale,
+                                               no_bias=False,
+                                               flatten=False,
+                                               out_type='float32',
+                                               num_hidden=weight_cols)
+        assert_almost_equal(test.as_nd_ndarray(), (ref * 
scale).as_nd_ndarray(), rtol=0.01, atol=0.01)
+    
+        # float32 input should work the same as manually prepared int8 input.
+        data_float = top.array([random.uniform(-3.14, 3.14) for i in 
range(data_rows * inner)])
+        data_float = data_float.reshape(data_rows, inner)
+        direct = contrib.intgemm_fully_connected(data_float,
+                                                 weight_prepared,
+                                                 top.array([scale]),
+                                                 cast(bias, dtype='float32'),
+                                                 no_bias=False,
+                                                 flatten=False,
+                                                 out_type='float32',
+                                                 num_hidden=weight_cols)
+        maxabs = contrib.intgemm_maxabsolute(data_float)
+        data_prepared = contrib.intgemm_prepare_data(data_float, maxabs)
+        cooked = contrib.intgemm_fully_connected(data_prepared,
+                                                 weight_prepared,
+                                                 top.array(scale * maxabs / 
127.0),
+                                                 cast(bias, dtype='float32'),
+                                                 no_bias=False,
+                                                 flatten=False,
+                                                 out_type='float32',
+                                                 num_hidden=weight_cols)
+        assert_almost_equal(direct.as_nd_ndarray(), cooked.as_nd_ndarray(), 
rtol=0.01, atol=0.01)

Reply via email to