cconvey commented on code in PR #12204:
URL: https://github.com/apache/tvm/pull/12204#discussion_r949617801


##########
include/tvm/runtime/hexagon/ops/conv2d.h:
##########
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <tvm/runtime/c_runtime_api.h>
+#include <tvm/runtime/device_api.h>
+
+#include <cassert>
+
+#ifndef TVM_RUNTIME_HEXAGON_OPS_CONV2D_H_
+#define TVM_RUNTIME_HEXAGON_OPS_CONV2D_H_
+
+namespace tvm {
+namespace runtime {
+namespace hexagon {
+static constexpr auto hexagon_device = 
DLDevice{static_cast<DLDeviceType>(kDLHexagon), 0};
+
+// Standalone DLTensor: the standalone-ness means that this object owns the 
shape
+// (as opposed to a DLTensor).
+template <size_t NDIM>
+class SDLTensor : public DLTensor {
+ public:
+  SDLTensor(void* data_ptr, DLDataType data_type, void* data_space, const 
int64_t* data_dims)
+      : SDLTensor(data_ptr, data_type, data_space) {
+    for (size_t i = 0; i < NDIM; ++i) dims[i] = data_dims[i];
+  }
+
+  SDLTensor(void* data_ptr, DLDataType data_type, void* data_space,
+            std::initializer_list<int64_t> data_dims)
+      : SDLTensor(data_ptr, data_type, data_space, data_dims.begin()) {}
+
+  void* GetDataSpace() const { return data_space; }
+
+ private:
+  SDLTensor(void* data_ptr, DLDataType data_type, void* data_space) : 
data_space(data_space) {
+    data = data_ptr;
+    device = hexagon_device;
+    ndim = NDIM;
+    dtype = data_type;
+    shape = dims;
+    strides = nullptr;
+    byte_offset = 0;
+  }
+
+  void* data_space = nullptr;
+  int64_t dims[NDIM];
+};
+
+inline void* to_ptr(uintptr_t v) { return reinterpret_cast<void*>(v); }
+
// Reinterpret an untyped pointer as an integer address.
inline uintptr_t to_uint(void* ptr) {
  const uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
  return addr;
}
+
constexpr int xyc_to_sm_16b(int y, int x, int c) {
  // Map y,x,c coordinates within a block to the offset (in 16-bit elements)
  // from the beginning of the block in spatial-major layout.
  // 10-bit spatial mask: yyyxcccccx
  assert(y >= 0 && x >= 0 && c >= 0);
  const int y_bits = y << 7;        // yyy......
  const int x_hi = (x & 2) << 5;    // ...x.....
  const int c_bits = c << 1;        // ....ccccc.
  const int x_lo = x & 1;           // .........x
  return y_bits | x_hi | c_bits | x_lo;
}
+
constexpr int hwio_to_sm_16b(int width, int y, int x, int i, int o) {
  // Map y,x,i,o coordinates within a chunk (assuming the origin at the
  // top-left spatial corner) to the offset (in 16-bit elements) from the
  // beginning of the chunk in spatial-major layout.
  // Spatial mask: p..piiiioooooi, where p..p are position bits.
  assert(width >= 1);
  assert(y >= 0 && x >= 0 && i >= 0 && o >= 0);
  const int pos = y * width + (width - 1 - x);  // x is counted right-to-left
  const int i_hi = (i & 0x1e) << 5;             // upper 4 bits of i
  const int o_bits = o << 1;
  const int i_lo = i & 1;                       // lowest bit of i
  return pos << 10 | i_hi | o_bits | i_lo;
}
+
+inline constexpr int round_up(int v, int p2) { return (v + p2 - 1) & -p2; }
+
+inline uintptr_t nhwc_at(const DLTensor& a, int n, int y, int x, int c) {
+  if (y < 0 || y >= a.shape[1]) return uintptr_t(0);
+  auto p = static_cast<uintptr_t*>(a.data);
+  assert(n == 0);
+  return p[y * a.shape[2] * a.shape[3] + x * a.shape[3] + c];
+}
+
+inline uintptr_t hwio_at(const DLTensor& f, int y, int x, int i, int o) {
+  auto p = static_cast<uintptr_t*>(f.data);
+  return p[y * f.shape[1] * f.shape[2] * f.shape[3] + x * f.shape[2] * 
f.shape[3] + i * f.shape[3] +
+           o];
+}

Review Comment:
   That makes sense.  If we consider speed to be a primary concern, would any 
of the following be a good compromise?
   - convert these to members of `SDLTensor`, and (optionally) only provide 
implementations of the two methods for the template specialization `NDIM=4`, or
   - leave these as free-standing functions, but still templatize them and only 
provide implementations for `NDIM=4`, or
   - add a few comments indicating the assumptions these functions make about 
the rank of `a` / `f`, or
   - adjust the names of these two functions to indicate their assumption about 
the rank of `a` / `f`.
   
   The only way I'd see the templatization approaches being worth the additional 
code complexity is if:
   - you're already planning on converting these to methods of `SDLTensor`, 
which is already a template class, or
   - you have future plans for this code that would involve values of `NDIM` 
other than 4.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to