cconvey commented on code in PR #12204: URL: https://github.com/apache/tvm/pull/12204#discussion_r949262003
########## include/tvm/runtime/hexagon/ops/conv2d.h: ########## @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include <tvm/runtime/c_runtime_api.h> +#include <tvm/runtime/device_api.h> + +#include <cassert> + +#ifndef TVM_RUNTIME_HEXAGON_OPS_CONV2D_H_ +#define TVM_RUNTIME_HEXAGON_OPS_CONV2D_H_ + +namespace tvm { +namespace runtime { +namespace hexagon { +static constexpr auto hexagon_device = DLDevice{static_cast<DLDeviceType>(kDLHexagon), 0}; + +// Standalone DLTensor: the standalone-ness means that this object owns the shape +// (as opposed to a DLTensor). 
+template <size_t NDIM> +class SDLTensor : public DLTensor { + public: + SDLTensor(void* data_ptr, DLDataType data_type, void* data_space, const int64_t* data_dims) + : SDLTensor(data_ptr, data_type, data_space) { + for (size_t i = 0; i < NDIM; ++i) dims[i] = data_dims[i]; + } + + SDLTensor(void* data_ptr, DLDataType data_type, void* data_space, + std::initializer_list<int64_t> data_dims) + : SDLTensor(data_ptr, data_type, data_space, data_dims.begin()) {} + + void* GetDataSpace() const { return data_space; } + + private: + SDLTensor(void* data_ptr, DLDataType data_type, void* data_space) : data_space(data_space) { + data = data_ptr; + device = hexagon_device; + ndim = NDIM; + dtype = data_type; + shape = dims; + strides = nullptr; + byte_offset = 0; + } + + void* data_space = nullptr; + int64_t dims[NDIM]; +}; + +inline void* to_ptr(uintptr_t v) { return reinterpret_cast<void*>(v); } + +inline uintptr_t to_uint(void* ptr) { return reinterpret_cast<uintptr_t>(ptr); } + +constexpr int xyc_to_sm_16b(int y, int x, int c) { + // Map y,x,c coordinates within a block to the offset (in 16-bit elements) + // from the beginning of the block in spatial-major layout. + // 10-bit spatial mask: yyyxcccccx + assert(y >= 0 && x >= 0 && c >= 0); + return y << 7 | (x & 2) << 5 | c << 1 | (x & 1); +} + +constexpr int hwio_to_sm_16b(int width, int y, int x, int i, int o) { + // Map y,x,i,o coordinates within a chunk (assuming the origin at the + // top-left spatial corner) to the offset (in 16-bit elements) from the + // beginning of the chunk in spatial-major layout. + // Spatial mask: p..piiiioooooi, where p..p are position bits. 
+ assert(width >= 1); + assert(y >= 0 && x >= 0 && i >= 0 && o >= 0); + int p = y * width + (width - 1 - x); + return p << 10 | (i & 0x1e) << 5 | o << 1 | (i & 1); +} + +inline constexpr int round_up(int v, int p2) { return (v + p2 - 1) & -p2; } + +inline uintptr_t nhwc_at(const DLTensor& a, int n, int y, int x, int c) { + if (y < 0 || y >= a.shape[1]) return uintptr_t(0); + auto p = static_cast<uintptr_t*>(a.data); + assert(n == 0); + return p[y * a.shape[2] * a.shape[3] + x * a.shape[3] + c]; +} + +inline uintptr_t hwio_at(const DLTensor& f, int y, int x, int i, int o) { + auto p = static_cast<uintptr_t*>(f.data); + return p[y * f.shape[1] * f.shape[2] * f.shape[3] + x * f.shape[2] * f.shape[3] + i * f.shape[3] + + o]; +} Review Comment: Would [Horner's algorithm](https://en.wikipedia.org/wiki/Horner%27s_method) be a good technique for computing `p`'s index? It's potentially faster and potentially easier to read. ########## include/tvm/runtime/hexagon/ops/conv2d.h: ########## @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include <tvm/runtime/c_runtime_api.h> +#include <tvm/runtime/device_api.h> + +#include <cassert> + +#ifndef TVM_RUNTIME_HEXAGON_OPS_CONV2D_H_ +#define TVM_RUNTIME_HEXAGON_OPS_CONV2D_H_ + +namespace tvm { +namespace runtime { +namespace hexagon { +static constexpr auto hexagon_device = DLDevice{static_cast<DLDeviceType>(kDLHexagon), 0}; + +// Standalone DLTensor: the standalone-ness means that this object owns the shape +// (as opposed to a DLTensor). +template <size_t NDIM> +class SDLTensor : public DLTensor { + public: + SDLTensor(void* data_ptr, DLDataType data_type, void* data_space, const int64_t* data_dims) + : SDLTensor(data_ptr, data_type, data_space) { + for (size_t i = 0; i < NDIM; ++i) dims[i] = data_dims[i]; + } + + SDLTensor(void* data_ptr, DLDataType data_type, void* data_space, + std::initializer_list<int64_t> data_dims) + : SDLTensor(data_ptr, data_type, data_space, data_dims.begin()) {} + + void* GetDataSpace() const { return data_space; } + + private: + SDLTensor(void* data_ptr, DLDataType data_type, void* data_space) : data_space(data_space) { + data = data_ptr; + device = hexagon_device; + ndim = NDIM; + dtype = data_type; + shape = dims; + strides = nullptr; + byte_offset = 0; + } + + void* data_space = nullptr; + int64_t dims[NDIM]; +}; + +inline void* to_ptr(uintptr_t v) { return reinterpret_cast<void*>(v); } + +inline uintptr_t to_uint(void* ptr) { return reinterpret_cast<uintptr_t>(ptr); } + +constexpr int xyc_to_sm_16b(int y, int x, int c) { + // Map y,x,c coordinates within a block to the offset (in 16-bit elements) + // from the beginning of the block in spatial-major layout. 
+ // 10-bit spatial mask: yyyxcccccx + assert(y >= 0 && x >= 0 && c >= 0); + return y << 7 | (x & 2) << 5 | c << 1 | (x & 1); +} + +constexpr int hwio_to_sm_16b(int width, int y, int x, int i, int o) { + // Map y,x,i,o coordinates within a chunk (assuming the origin at the + // top-left spatial corner) to the offset (in 16-bit elements) from the + // beginning of the chunk in spatial-major layout. + // Spatial mask: p..piiiioooooi, where p..p are position bits. + assert(width >= 1); + assert(y >= 0 && x >= 0 && i >= 0 && o >= 0); + int p = y * width + (width - 1 - x); + return p << 10 | (i & 0x1e) << 5 | o << 1 | (i & 1); +} + +inline constexpr int round_up(int v, int p2) { return (v + p2 - 1) & -p2; } + +inline uintptr_t nhwc_at(const DLTensor& a, int n, int y, int x, int c) { + if (y < 0 || y >= a.shape[1]) return uintptr_t(0); + auto p = static_cast<uintptr_t*>(a.data); + assert(n == 0); + return p[y * a.shape[2] * a.shape[3] + x * a.shape[3] + c]; +} + +inline uintptr_t hwio_at(const DLTensor& f, int y, int x, int i, int o) { + auto p = static_cast<uintptr_t*>(f.data); + return p[y * f.shape[1] * f.shape[2] * f.shape[3] + x * f.shape[2] * f.shape[3] + i * f.shape[3] + + o]; +} + +inline uint32_t* bias_at(const DLTensor& b, int d) { + auto p = static_cast<uint32_t*>(b.data); + return p + d; +} Review Comment: This function doesn't appear to be used anywhere in the PR. Is that intentional? Usually for TVM PRs, we try to ensure thorough test coverage of the new code ([link](https://tvm.apache.org/docs/contribute/code_review.html#factors-to-consider-about-code-quality)). [A code-comment below](https://github.com/apache/tvm/pull/12204/files#diff-1a3e6ccd9b3b65439dfe0e4ce36f0b04e186203504a2250b6cc3c7ce1172513cR35) states that bias isn't currently accepted. Given that, would it make sense to just remove this function from the PR? 
########## include/tvm/runtime/hexagon/ops/conv2d.h: ########## @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include <tvm/runtime/c_runtime_api.h> +#include <tvm/runtime/device_api.h> + +#include <cassert> + +#ifndef TVM_RUNTIME_HEXAGON_OPS_CONV2D_H_ +#define TVM_RUNTIME_HEXAGON_OPS_CONV2D_H_ + +namespace tvm { +namespace runtime { +namespace hexagon { +static constexpr auto hexagon_device = DLDevice{static_cast<DLDeviceType>(kDLHexagon), 0}; + +// Standalone DLTensor: the standalone-ness means that this object owns the shape +// (as opposed to a DLTensor). 
+template <size_t NDIM> +class SDLTensor : public DLTensor { + public: + SDLTensor(void* data_ptr, DLDataType data_type, void* data_space, const int64_t* data_dims) + : SDLTensor(data_ptr, data_type, data_space) { + for (size_t i = 0; i < NDIM; ++i) dims[i] = data_dims[i]; + } + + SDLTensor(void* data_ptr, DLDataType data_type, void* data_space, + std::initializer_list<int64_t> data_dims) + : SDLTensor(data_ptr, data_type, data_space, data_dims.begin()) {} + + void* GetDataSpace() const { return data_space; } + + private: + SDLTensor(void* data_ptr, DLDataType data_type, void* data_space) : data_space(data_space) { + data = data_ptr; + device = hexagon_device; + ndim = NDIM; + dtype = data_type; + shape = dims; + strides = nullptr; + byte_offset = 0; + } + + void* data_space = nullptr; + int64_t dims[NDIM]; +}; + +inline void* to_ptr(uintptr_t v) { return reinterpret_cast<void*>(v); } + +inline uintptr_t to_uint(void* ptr) { return reinterpret_cast<uintptr_t>(ptr); } + +constexpr int xyc_to_sm_16b(int y, int x, int c) { + // Map y,x,c coordinates within a block to the offset (in 16-bit elements) + // from the beginning of the block in spatial-major layout. + // 10-bit spatial mask: yyyxcccccx + assert(y >= 0 && x >= 0 && c >= 0); + return y << 7 | (x & 2) << 5 | c << 1 | (x & 1); +} + +constexpr int hwio_to_sm_16b(int width, int y, int x, int i, int o) { + // Map y,x,i,o coordinates within a chunk (assuming the origin at the + // top-left spatial corner) to the offset (in 16-bit elements) from the + // beginning of the chunk in spatial-major layout. + // Spatial mask: p..piiiioooooi, where p..p are position bits. 
+ assert(width >= 1); + assert(y >= 0 && x >= 0 && i >= 0 && o >= 0); + int p = y * width + (width - 1 - x); + return p << 10 | (i & 0x1e) << 5 | o << 1 | (i & 1); +} + +inline constexpr int round_up(int v, int p2) { return (v + p2 - 1) & -p2; } + +inline uintptr_t nhwc_at(const DLTensor& a, int n, int y, int x, int c) { + if (y < 0 || y >= a.shape[1]) return uintptr_t(0); + auto p = static_cast<uintptr_t*>(a.data); + assert(n == 0); + return p[y * a.shape[2] * a.shape[3] + x * a.shape[3] + c]; +} + +inline uintptr_t hwio_at(const DLTensor& f, int y, int x, int i, int o) { + auto p = static_cast<uintptr_t*>(f.data); + return p[y * f.shape[1] * f.shape[2] * f.shape[3] + x * f.shape[2] * f.shape[3] + i * f.shape[3] + + o]; +} + +inline uint32_t* bias_at(const DLTensor& b, int d) { + auto p = static_cast<uint32_t*>(b.data); Review Comment: Is there a reasonable way to ensure that this `static_cast` is actually valid for `b`? ########## include/tvm/runtime/hexagon/ops/conv2d.h: ########## @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include <tvm/runtime/c_runtime_api.h> +#include <tvm/runtime/device_api.h> + +#include <cassert> + +#ifndef TVM_RUNTIME_HEXAGON_OPS_CONV2D_H_ +#define TVM_RUNTIME_HEXAGON_OPS_CONV2D_H_ + +namespace tvm { +namespace runtime { +namespace hexagon { +static constexpr auto hexagon_device = DLDevice{static_cast<DLDeviceType>(kDLHexagon), 0}; + +// Standalone DLTensor: the standalone-ness means that this object owns the shape +// (as opposed to a DLTensor). +template <size_t NDIM> +class SDLTensor : public DLTensor { + public: + SDLTensor(void* data_ptr, DLDataType data_type, void* data_space, const int64_t* data_dims) + : SDLTensor(data_ptr, data_type, data_space) { + for (size_t i = 0; i < NDIM; ++i) dims[i] = data_dims[i]; + } + + SDLTensor(void* data_ptr, DLDataType data_type, void* data_space, + std::initializer_list<int64_t> data_dims) + : SDLTensor(data_ptr, data_type, data_space, data_dims.begin()) {} + + void* GetDataSpace() const { return data_space; } + + private: + SDLTensor(void* data_ptr, DLDataType data_type, void* data_space) : data_space(data_space) { + data = data_ptr; + device = hexagon_device; + ndim = NDIM; + dtype = data_type; + shape = dims; + strides = nullptr; + byte_offset = 0; + } + + void* data_space = nullptr; + int64_t dims[NDIM]; +}; + +inline void* to_ptr(uintptr_t v) { return reinterpret_cast<void*>(v); } + +inline uintptr_t to_uint(void* ptr) { return reinterpret_cast<uintptr_t>(ptr); } + +constexpr int xyc_to_sm_16b(int y, int x, int c) { + // Map y,x,c coordinates within a block to the offset (in 16-bit elements) + // from the beginning of the block in spatial-major layout. 
+ // 10-bit spatial mask: yyyxcccccx + assert(y >= 0 && x >= 0 && c >= 0); + return y << 7 | (x & 2) << 5 | c << 1 | (x & 1); +} + +constexpr int hwio_to_sm_16b(int width, int y, int x, int i, int o) { + // Map y,x,i,o coordinates within a chunk (assuming the origin at the + // top-left spatial corner) to the offset (in 16-bit elements) from the + // beginning of the chunk in spatial-major layout. + // Spatial mask: p..piiiioooooi, where p..p are position bits. + assert(width >= 1); + assert(y >= 0 && x >= 0 && i >= 0 && o >= 0); + int p = y * width + (width - 1 - x); + return p << 10 | (i & 0x1e) << 5 | o << 1 | (i & 1); +} + +inline constexpr int round_up(int v, int p2) { return (v + p2 - 1) & -p2; } + +inline uintptr_t nhwc_at(const DLTensor& a, int n, int y, int x, int c) { + if (y < 0 || y >= a.shape[1]) return uintptr_t(0); + auto p = static_cast<uintptr_t*>(a.data); + assert(n == 0); + return p[y * a.shape[2] * a.shape[3] + x * a.shape[3] + c]; +} + +inline uintptr_t hwio_at(const DLTensor& f, int y, int x, int i, int o) { + auto p = static_cast<uintptr_t*>(f.data); Review Comment: Is there a reasonable way to ensure that these `static_cast` is actually valid for `a` and `f`? ########## include/tvm/runtime/hexagon/ops/conv2d.h: ########## @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include <tvm/runtime/c_runtime_api.h> +#include <tvm/runtime/device_api.h> + +#include <cassert> + +#ifndef TVM_RUNTIME_HEXAGON_OPS_CONV2D_H_ +#define TVM_RUNTIME_HEXAGON_OPS_CONV2D_H_ + +namespace tvm { +namespace runtime { +namespace hexagon { +static constexpr auto hexagon_device = DLDevice{static_cast<DLDeviceType>(kDLHexagon), 0}; + +// Standalone DLTensor: the standalone-ness means that this object owns the shape +// (as opposed to a DLTensor). +template <size_t NDIM> +class SDLTensor : public DLTensor { + public: + SDLTensor(void* data_ptr, DLDataType data_type, void* data_space, const int64_t* data_dims) + : SDLTensor(data_ptr, data_type, data_space) { + for (size_t i = 0; i < NDIM; ++i) dims[i] = data_dims[i]; + } + + SDLTensor(void* data_ptr, DLDataType data_type, void* data_space, + std::initializer_list<int64_t> data_dims) + : SDLTensor(data_ptr, data_type, data_space, data_dims.begin()) {} + + void* GetDataSpace() const { return data_space; } + + private: + SDLTensor(void* data_ptr, DLDataType data_type, void* data_space) : data_space(data_space) { + data = data_ptr; + device = hexagon_device; + ndim = NDIM; + dtype = data_type; + shape = dims; + strides = nullptr; + byte_offset = 0; + } + + void* data_space = nullptr; + int64_t dims[NDIM]; +}; + +inline void* to_ptr(uintptr_t v) { return reinterpret_cast<void*>(v); } + +inline uintptr_t to_uint(void* ptr) { return reinterpret_cast<uintptr_t>(ptr); } + +constexpr int xyc_to_sm_16b(int y, int x, int c) { + // Map y,x,c coordinates within a block to the offset (in 16-bit elements) + // from the beginning of the block in spatial-major layout. 
+ // 10-bit spatial mask: yyyxcccccx + assert(y >= 0 && x >= 0 && c >= 0); + return y << 7 | (x & 2) << 5 | c << 1 | (x & 1); +} + +constexpr int hwio_to_sm_16b(int width, int y, int x, int i, int o) { + // Map y,x,i,o coordinates within a chunk (assuming the origin at the + // top-left spatial corner) to the offset (in 16-bit elements) from the + // beginning of the chunk in spatial-major layout. + // Spatial mask: p..piiiioooooi, where p..p are position bits. + assert(width >= 1); + assert(y >= 0 && x >= 0 && i >= 0 && o >= 0); + int p = y * width + (width - 1 - x); + return p << 10 | (i & 0x1e) << 5 | o << 1 | (i & 1); +} + +inline constexpr int round_up(int v, int p2) { return (v + p2 - 1) & -p2; } + +inline uintptr_t nhwc_at(const DLTensor& a, int n, int y, int x, int c) { + if (y < 0 || y >= a.shape[1]) return uintptr_t(0); + auto p = static_cast<uintptr_t*>(a.data); + assert(n == 0); + return p[y * a.shape[2] * a.shape[3] + x * a.shape[3] + c]; +} + +inline uintptr_t hwio_at(const DLTensor& f, int y, int x, int i, int o) { + auto p = static_cast<uintptr_t*>(f.data); + return p[y * f.shape[1] * f.shape[2] * f.shape[3] + x * f.shape[2] * f.shape[3] + i * f.shape[3] + + o]; +} Review Comment: The `DLTensor` struct has two members that don't seem to be considered here: [`strides` and `byte_offset`](https://github.com/dmlc/dlpack/blob/ddeb264880a1fa7e7be238ab3901a810324fbe5f/include/dlpack/dlpack.h#L159-L161). Is it truly safe to ignore those members here? If so, it might be helpful to (ideally) add an `assert` and/or a comment about why they can be ignored. If the reason we can safely ignore `strides` and `byte_offset` is that we know they were set by `SDLTensor`'s constructor, then would it make sense to have these two functions explicitly operate on `SDLTensor` objects rather than `DLTensor` objects? (And if so, then any reason they shouldn't be class members rather than free-standing functions?) 
########## src/runtime/hexagon/ops/conv2d_hvx.cc: ########## @@ -0,0 +1,473 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include <HAP_compute_res.h> +#include <hexagon_types.h> +#include <hvx_hexagon_protos.h> +#include <tvm/runtime/c_runtime_api.h> +#include <tvm/runtime/device_api.h> + +#include <algorithm> +#include <cassert> +#include <cinttypes> + +#include "tvm/runtime/hexagon/ops/conv2d.h" + +// Current limitations: +// - N in NHWC must be 1 +// - dilated convolutions are not supported +// - Bias is not accepted +// - Optional "relu" is not performed + +// Packed arguments: +// 0: DLTensor activations (NHWC) +// 1: DLTensor weights (HWIO) +// 2: int offset_top +// 3: int offset_left +// 4: int stride_h +// 5: int stride_w +// 6: DLTensor output (NHWC) +extern "C" int conv2d_packed(TVMValue* args, int* type_codes, int num_args, TVMValue* out_val, + int out_code, void* res_handle); + +namespace tvm { +namespace runtime { +namespace hexagon { + +inline uint16_t* getElementPtr(int block_out_y, int block_out_x, int block_out_c, int yi, int xio, + int ci, int xii, const DLTensor& block) { + auto block_ptr = nhwc_at(block, 0, block_out_y, block_out_x, block_out_c); + auto block_offset = yi * 128 + 
xio * 64 + ci * 2 + xii; + auto first_element_ptr = reinterpret_cast<uint16_t*>(block_ptr); + return first_element_ptr + block_offset; +} Review Comment: This function's name and argument list suggest that it handles tensors with a variety of element dtypes. But its body and return type assume `uint16_t`. Is there some reason that's a save assumption in this context? If yes, then it might be helpful to add some comments explaining why. If not, then I'd suggest some form of disambiguation, e.g.: - rename this to `getElementPtr_u16`, or - convert this to a template that's parameterized by dtype. Even if this `uint16_t` is being used as a stand-in for qfloat16 or IEEE-754 half-precision floats, it's not (immediately) obvious that this code will only ever be used for _16-bit_ HVX values. ########## include/tvm/runtime/hexagon/ops/conv2d.h: ########## @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include <tvm/runtime/c_runtime_api.h> +#include <tvm/runtime/device_api.h> + +#include <cassert> + +#ifndef TVM_RUNTIME_HEXAGON_OPS_CONV2D_H_ +#define TVM_RUNTIME_HEXAGON_OPS_CONV2D_H_ + +namespace tvm { +namespace runtime { +namespace hexagon { +static constexpr auto hexagon_device = DLDevice{static_cast<DLDeviceType>(kDLHexagon), 0}; + +// Standalone DLTensor: the standalone-ness means that this object owns the shape +// (as opposed to a DLTensor). +template <size_t NDIM> +class SDLTensor : public DLTensor { + public: + SDLTensor(void* data_ptr, DLDataType data_type, void* data_space, const int64_t* data_dims) + : SDLTensor(data_ptr, data_type, data_space) { + for (size_t i = 0; i < NDIM; ++i) dims[i] = data_dims[i]; + } + + SDLTensor(void* data_ptr, DLDataType data_type, void* data_space, + std::initializer_list<int64_t> data_dims) + : SDLTensor(data_ptr, data_type, data_space, data_dims.begin()) {} + + void* GetDataSpace() const { return data_space; } + + private: + SDLTensor(void* data_ptr, DLDataType data_type, void* data_space) : data_space(data_space) { + data = data_ptr; + device = hexagon_device; + ndim = NDIM; + dtype = data_type; + shape = dims; + strides = nullptr; + byte_offset = 0; + } + + void* data_space = nullptr; + int64_t dims[NDIM]; +}; + +inline void* to_ptr(uintptr_t v) { return reinterpret_cast<void*>(v); } + +inline uintptr_t to_uint(void* ptr) { return reinterpret_cast<uintptr_t>(ptr); } + +constexpr int xyc_to_sm_16b(int y, int x, int c) { + // Map y,x,c coordinates within a block to the offset (in 16-bit elements) + // from the beginning of the block in spatial-major layout. 
+ // 10-bit spatial mask: yyyxcccccx + assert(y >= 0 && x >= 0 && c >= 0); + return y << 7 | (x & 2) << 5 | c << 1 | (x & 1); +} + +constexpr int hwio_to_sm_16b(int width, int y, int x, int i, int o) { + // Map y,x,i,o coordinates within a chunk (assuming the origin at the + // top-left spatial corner) to the offset (in 16-bit elements) from the + // beginning of the chunk in spatial-major layout. + // Spatial mask: p..piiiioooooi, where p..p are position bits. + assert(width >= 1); + assert(y >= 0 && x >= 0 && i >= 0 && o >= 0); + int p = y * width + (width - 1 - x); + return p << 10 | (i & 0x1e) << 5 | o << 1 | (i & 1); +} + +inline constexpr int round_up(int v, int p2) { return (v + p2 - 1) & -p2; } + +inline uintptr_t nhwc_at(const DLTensor& a, int n, int y, int x, int c) { + if (y < 0 || y >= a.shape[1]) return uintptr_t(0); + auto p = static_cast<uintptr_t*>(a.data); + assert(n == 0); + return p[y * a.shape[2] * a.shape[3] + x * a.shape[3] + c]; +} + +inline uintptr_t hwio_at(const DLTensor& f, int y, int x, int i, int o) { + auto p = static_cast<uintptr_t*>(f.data); + return p[y * f.shape[1] * f.shape[2] * f.shape[3] + x * f.shape[2] * f.shape[3] + i * f.shape[3] + + o]; +} Review Comment: These functions seem to assume that `a.ndim >= 4`. Would it make sense to ensure that's true, by e.g.: - `assert(a.n >= 4)` or - converting these to template functions, and replace the first argument with `const SDLTensor<4> & a`? ########## include/tvm/runtime/hexagon/ops/conv2d.h: ########## @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include <tvm/runtime/c_runtime_api.h> +#include <tvm/runtime/device_api.h> + +#include <cassert> + +#ifndef TVM_RUNTIME_HEXAGON_OPS_CONV2D_H_ +#define TVM_RUNTIME_HEXAGON_OPS_CONV2D_H_ + +namespace tvm { +namespace runtime { +namespace hexagon { +static constexpr auto hexagon_device = DLDevice{static_cast<DLDeviceType>(kDLHexagon), 0}; + +// Standalone DLTensor: the standalone-ness means that this object owns the shape +// (as opposed to a DLTensor). +template <size_t NDIM> +class SDLTensor : public DLTensor { + public: + SDLTensor(void* data_ptr, DLDataType data_type, void* data_space, const int64_t* data_dims) + : SDLTensor(data_ptr, data_type, data_space) { + for (size_t i = 0; i < NDIM; ++i) dims[i] = data_dims[i]; + } + + SDLTensor(void* data_ptr, DLDataType data_type, void* data_space, + std::initializer_list<int64_t> data_dims) + : SDLTensor(data_ptr, data_type, data_space, data_dims.begin()) {} + + void* GetDataSpace() const { return data_space; } + + private: + SDLTensor(void* data_ptr, DLDataType data_type, void* data_space) : data_space(data_space) { + data = data_ptr; + device = hexagon_device; + ndim = NDIM; + dtype = data_type; + shape = dims; + strides = nullptr; + byte_offset = 0; Review Comment: The roles of `data_ptr` and `data_space` are explained in [this PR comment](https://github.com/apache/tvm/pull/12204#discussion_r945441218), but I don't see that same information in the current code. Could we add put that same information into one or more code comments? 
Also, even if that PR comment is right about there being no risk of dangling pointers, that isn't _obviously_ true just from normal-level-of-effort reading of the code. Is there some way we can address that? ########## src/runtime/hexagon/ops/conv2d_hvx.cc: ########## @@ -0,0 +1,473 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include <HAP_compute_res.h> +#include <hexagon_types.h> +#include <hvx_hexagon_protos.h> +#include <tvm/runtime/c_runtime_api.h> +#include <tvm/runtime/device_api.h> + +#include <algorithm> +#include <cassert> +#include <cinttypes> + +#include "tvm/runtime/hexagon/ops/conv2d.h" + +// Current limitations: +// - N in NHWC must be 1 +// - dilated convolutions are not supported +// - Bias is not accepted +// - Optional "relu" is not performed + +// Packed arguments: +// 0: DLTensor activations (NHWC) +// 1: DLTensor weights (HWIO) +// 2: int offset_top +// 3: int offset_left +// 4: int stride_h +// 5: int stride_w +// 6: DLTensor output (NHWC) +extern "C" int conv2d_packed(TVMValue* args, int* type_codes, int num_args, TVMValue* out_val, + int out_code, void* res_handle); + +namespace tvm { +namespace runtime { +namespace hexagon { + +inline uint16_t* getElementPtr(int block_out_y, int block_out_x, int block_out_c, int yi, int xio, Review Comment: If `uint16_t` is a stand-in for qfloat16, is there a typename supplied by the Hexagon SDK that might be more descriptive? This might not be a valid example, but I'm thinking of something _like_ `qhl_q0_t`. Of if the SDK provides no such type name, could it make sense to just define one here? E.g., `using qfloat16_ptr = uint16_t*;` ########## src/runtime/hexagon/ops/conv_utils.cc: ########## @@ -0,0 +1,243 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "tvm/runtime/hexagon/ops/conv2d.h" + +namespace tvm { +namespace runtime { +namespace hexagon { + +/** + * @brief Function to "blockize" the flat input data + * The term "blockize" is used to mention that the data is stored in non-contiguous blocks + * + * The input is mapped into the below mentioned layout (notation similar to index map used for + * transform layout): + * + * lambda n, h, w, c: n, h//8, w//4, c//32, AXIS_SEPARATOR, h%8, (w%4)//2, c%32, w%2 + * + * where AXIS_SEPARATOR represents split up in the physical layout + * + * @param out Pre-allocated output memory pointer + * @param inp_flat Flat input data pointer + * @param height + * @param width + * @param depth + */ +void blockize_hwc_16b(void* out, void* inp_flat, int height, int width, int depth) { + auto inp_data = static_cast<uint16_t*>(inp_flat); + auto out_data = static_cast<uintptr_t*>(out); + const int stride_x = depth; + const int stride_y = stride_x * width; + + for (int cy = 0; cy < height; cy += 8) { + for (int cx = 0; cx < width; cx += 4) { + for (int cc = 0; cc < depth; cc += 32) { + auto block = reinterpret_cast<uint16_t*>(*out_data++); + int max_y = std::min(8, height - cy); + int max_x = std::min(4, width - cx); + int max_c = std::min(32, depth - cc); + for (int y = 0; y < max_y; ++y) { + for (int x = 0; x < max_x; ++x) { + for (int c = 0; c < max_c; ++c) { + block[xyc_to_sm_16b(y, x, c)] = + inp_data[(cy + y) * stride_y + (cx + x) * stride_x + (cc + c)]; + } + for (int c = max_c; c < 32; ++c) block[xyc_to_sm_16b(y, x, c)] = 0; + } + for (int x = 
max_x; x < 4; ++x) { + for (int c = 0; c < 32; ++c) block[xyc_to_sm_16b(y, x, c)] = 0; + } + } + + for (int y = max_y; y < 8; ++y) + for (int x = 0; x < 4; ++x) + for (int c = 0; c < 32; ++c) block[xyc_to_sm_16b(y, x, c)] = 0; + } // cc + } // cx + } // cy +} + +/** + * @brief Convert back from non-contiguous layout to a flat layout + * + * @param out_flat Pre-allocated output memory pointer + * @param inp Blockized input data pointer + * @param height + * @param width + * @param depth + */ +void deblockize_hwc_16b(void* out_flat, void* inp, int height, int width, int depth) { + uintptr_t* inp_data = static_cast<uintptr_t*>(inp); + uint16_t* out_data = static_cast<uint16_t*>(out_flat); + const int stride_x = depth; + const int stride_y = stride_x * width; + + for (int cy = 0; cy < height; cy += 8) { + for (int cx = 0; cx < width; cx += 4) { + for (int cc = 0; cc < depth; cc += 32) { + auto block = reinterpret_cast<uint16_t*>(*inp_data); + int max_y = std::min(8, height - cy); + int max_x = std::min(4, width - cx); + int max_c = std::min(32, depth - cc); + for (int y = 0; y < max_y; ++y) { + for (int x = 0; x < max_x; ++x) { + for (int c = 0; c < max_c; ++c) { + out_data[(cy + y) * stride_y + (cx + x) * stride_x + (cc + c)] = + block[xyc_to_sm_16b(y, x, c)]; + } + } + } + + inp_data++; + } + } + } +} + +/** + * @brief Convert the layout of weights from flat to "chunked". 
The term chunked is explained below: + * + * Weights are packed into the below mentioned layout (notation similar to index map): + * Since weights cannot be exactly represented into an index map notation, the + * base split up is mentioned below with a few gotchas + * + * lambda h, w, i, o: h//8, w//4, o//32, i//32, h%8, w%4, (i%32)//2, o%32, i%2 + * + * The gotchas are: + * - (w%4) is actually stored in the right to left order, as in 3,2,1,0 instead of 0,1,2,3 + * - The h%8 and (w%4) dimensions are not padded up, leading to chunks of different sizes + * (thereby the name "chunked" instead of packed) + * - The thinnest chunk of width is stored first. For example, if a kernel is 5x5, the first + * chunk along the width has size 1 (representing index 0) and the next one has size 4 + * representing indices (1,2,3,4) Review Comment: Could we move the code-comments that explain "chunked" and "blockized" to just before the first usage of those terms, i.e. near the top of `conv2d.h` or into a separate `README.rst` file? It took me a while to notice this explanation because of where it's placed. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
