added clone as an operator. This allows operators for the filter in convolution. (#233)

Co-authored-by: jluitjens <[email protected]>
luitjens and luitjens authored Jul 28, 2022
1 parent a134c3d commit 33a0937
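For context, here is a minimal usage sketch of what this change enables: the filter passed to conv1d may now be an operator expression, and a lower-rank filter is cloned across the batch dimensions internally. The tensor sizes, the MATX_C_MODE_SAME enumerator, and the stream argument are assumptions based on the MatX API around this commit, not code taken from it.

// Hypothetical sketch (not part of this commit): batched convolution where
// the filter is an operator expression rather than a plain tensor.
#include "matx.h"

int main() {
  using namespace matx;

  auto sig  = make_tensor<float>({16, 1024});  // rank-2 batched input
  auto filt = make_tensor<float>({31});        // single rank-1 filter
  auto out  = make_tensor<float>({16, 1024});

  // The filter (here a scaled filter, i.e. an operator) is cloned across the
  // leading batch dimension of sig before the batched convolution runs.
  conv1d(out, sig, filt * 0.5f, MATX_C_MODE_SAME, 0);
  cudaDeviceSynchronize();

  return 0;
}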
Showing 4 changed files with 311 additions and 9 deletions.
1 change: 1 addition & 0 deletions docs_input/api/tensorops.rst
@@ -91,3 +91,4 @@ Advanced Operators
.. doxygenfunction:: remap
.. doxygenfunction:: rcollapse
.. doxygenfunction:: lcollapse
.. doxygenfunction:: clone
53 changes: 45 additions & 8 deletions include/matx_conv.h
@@ -40,6 +40,7 @@
#include "kernels/matx_conv_kernels.cuh"
#include "matx_error.h"
#include "matx_tensor.h"
#include "matx_tensor_ops.h"

namespace matx {
namespace detail {
@@ -165,25 +166,61 @@ inline void conv1d_impl(OutputType &o, const In1Type &i1, const In2Type &i2,
template <typename OutputType, typename In1Type, typename In2Type>
inline void conv1d(OutputType &o, const In1Type &i1, const In2Type &i2,
matxConvCorrMode_t mode, cudaStream_t stream) {
if constexpr ( In2Type::Rank()==1 && In1Type::Rank() > In2Type::Rank() ) {
//broadcast path. clone In2 across entire batch
if constexpr ( In1Type::Rank() > In2Type::Rank() ) {
// broadcast i2 path. clone i2 across batches

const int Rank = In1Type::Rank();
typename In2Type::shape_type shape[Rank];
constexpr int LRank = In1Type::Rank();
constexpr int SRank = In2Type::Rank();
constexpr int DRank = LRank - SRank;

index_t shape[LRank];

// copy left-most dimensions from i1
#pragma unroll
for(int i = 0; i < Rank-1; i++) {
for(int i = 0; i < DRank; i++) {
shape[i] = i1.Size(i);
}

// set right most dimensions as matxKeepDim
#pragma unroll
for(int i = 0; i < SRank; i++) {
shape[DRank+i] = matxKeepDim;
}

shape[Rank-1] = matxKeepDim;

auto ci2 = (i2.template Clone<Rank>(shape));
// clone i2
auto ci2 = clone<LRank>(i2, shape);

static_assert(i1.Rank() == ci2.Rank());

conv1d_impl(o, i1, ci2, mode, stream);

} else if constexpr ( In2Type::Rank() > In1Type::Rank()) {
// broadcast i1 path. clone i1 across batches

constexpr int LRank = In2Type::Rank();
constexpr int SRank = In1Type::Rank();
constexpr int DRank = LRank - SRank;
index_t shape[LRank];

// copy left-most dimensions from i2
#pragma unroll
for(int i = 0; i < DRank; i++) {
shape[i] = i2.Size(i);
}

// set right most dimensions as matxKeepDim
#pragma unroll
for(int i = 0; i < SRank; i++) {
shape[DRank+i] = matxKeepDim;
}

// clone i1
auto ci1 = clone<LRank>(i1, shape);

static_assert(ci1.Rank() == i2.Rank());

conv1d_impl(o, ci1, i2, mode, stream);

} else {
static_assert(In1Type::Rank() == In2Type::Rank());
// batched pass outer dims must match
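To make the broadcast path above concrete, this hand-unrolled sketch (illustrative only, with assumed sizes; not code from the commit) shows what the two loops build for a rank-3 input and a rank-1 filter, i.e. LRank == 3, SRank == 1, DRank == 2:

// Illustrative hand-unrolling of the broadcast path, not part of the commit.
#include "matx.h"

int main() {
  using namespace matx;

  auto i1 = make_tensor<float>({4, 8, 1024});  // batched rank-3 input
  auto i2 = make_tensor<float>({31});          // rank-1 filter

  index_t shape[3];
  shape[0] = i1.Size(0);    // leading (batch) dimensions copied from i1
  shape[1] = i1.Size(1);
  shape[2] = matxKeepDim;   // trailing dimension kept from i2

  auto ci2 = clone<3>(i2, shape);  // ci2.Rank() == i1.Rank() == 3
  // conv1d_impl(o, i1, ci2, mode, stream) would then run the batched path.
  return 0;
}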
80 changes: 79 additions & 1 deletion include/matx_tensor_ops.h
@@ -565,6 +565,84 @@ __MATX_INLINE__
}
};
}

namespace detail {
template <int CRank, typename T, typename Ind>
class CloneOp : public BaseOp<CloneOp<CRank, T, Ind>>
{
private:
mutable typename base_type<T>::type op_;
std::array<index_t, CRank> sizes_; // size of each dimension after cloning
std::array<index_t, T::Rank()> dims_; // gather map for computing operator() indices
public:
using matxop = bool;

using scalar_type = typename T::scalar_type;

__MATX_INLINE__ CloneOp(T op, std::array<index_t, CRank> shape) : op_(op) {
// create gather list
int d = 0;
for(int i = 0; i < Rank(); i++) {
if(shape[i]==matxKeepDim) {
sizes_[i] = op_.Size(d);
dims_[d++] = i;
} else {
sizes_[i] = shape[i];
}
}
MATX_ASSERT(d == T::Rank(), matxInvalidDim);

};

template <typename... Is>
__MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ auto operator()(Is... indices) const
{

// convert variadic type to tuple so we can read/update
std::array<index_t, Rank()> sind{indices...};
std::array<index_t, T::Rank()> gind;

// gather indices
for(int i = 0; i < T::Rank(); i++) {
auto idx = dims_[i];
gind[i] = sind[idx];
}

return mapply(op_, gind);
}

static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank()
{
return CRank;
}
constexpr __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ index_t Size(int dim) const
{
return sizes_[dim];
}

};
}


/**
* @brief Operator to clone an operator or tensor across dimensions
*
* @tparam Rank the rank of the cloned operator
* @tparam T source operator/tensor type
* @param t source operator/tensor
* @param shape the shape of the cloned operator/tensor.
* Each element is either the size of the cloned dimension or matxKeepDim to keep that dimension from the source operator/tensor
* @return operator to compute the cloned value
*/
template <int Rank, typename Op>
auto __MATX_INLINE__ clone(Op t, const index_t (&shape)[Rank])
{
std::array<index_t, Rank> lshape;
for(int i = 0; i < Rank ; i++) {
lshape[i]=shape[i];
}
return detail::CloneOp<Rank, Op, index_t>(t, lshape);
};

/**
* Remaps elements an operator according to an index array/operator.
@@ -595,7 +673,7 @@ __MATX_INLINE__
static_assert(sizeof...(Is)==Rank());
static_assert((std::is_convertible_v<Is, index_t> && ... ));

// convert variadic type to tupple so we can read/update
// convert variadic type to tuple so we can read/update
std::array<index_t, Rank()> ind{indices...};
// get current index for dim
auto i = ind[DIM];
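As a quick illustration of the index mapping CloneOp implements (hypothetical sizes, not from the commit): cloned dimensions ignore their index, while matxKeepDim dimensions forward theirs to the wrapped operator.

// For a 2D source cloned to 3D with {N, matxKeepDim, matxKeepDim}, the gather
// map dims_ is {1, 2}, so the cloned operator reads op(n, m, k) -> t(m, k).
auto t  = make_tensor<float>({8, 4});
auto op = clone<3>(t, {16, matxKeepDim, matxKeepDim});
// op.Size(0) == 16, op.Size(1) == 8, op.Size(2) == 4
// op(n, m, k) == t(m, k) for every n in [0, 16)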
186 changes: 186 additions & 0 deletions test/00_operators/OperatorTests.cu
@@ -197,6 +197,192 @@ TYPED_TEST(OperatorTestsComplex, AngleOp)

MATX_EXIT_HANDLER();
}
TYPED_TEST(OperatorTestsNumericNonComplex, CloneOp)
{
int N = 10;
int M = 12;
int K = 14;

MATX_ENTER_HANDLER();
{ // clone from 0D
auto tiv = make_tensor<TypeParam>();
auto tov = make_tensor<TypeParam>({N,M,K});

tiv() = 3;

auto op = clone<3>(tiv, {N, M, K});

ASSERT_EQ(op.Size(0), N);
ASSERT_EQ(op.Size(1), M);
ASSERT_EQ(op.Size(2), K);


for(int n = 0; n < N; n++) {
for(int m = 0; m < M; m++) {
for(int k = 0; k < K; k++) {
ASSERT_EQ(op(n,m,k) , tiv());
}
}
}

(tov = op).run();
cudaDeviceSynchronize();

for(int n = 0; n < N; n++) {
for(int m = 0; m < M; m++) {
for(int k = 0; k < K; k++) {
ASSERT_EQ(tov(n,m,k) , tiv());
}
}
}
}

{ // clone from 1D, keeping the last dimension
auto tiv = make_tensor<TypeParam>({K});
auto tov = make_tensor<TypeParam>({N,M,K});

for(int k = 0; k < K; k++) {
tiv(k) = TypeParam(k);
}

auto op = clone<3>(tiv, {N, M, matxKeepDim});

ASSERT_EQ(op.Size(0), N);
ASSERT_EQ(op.Size(1), M);
ASSERT_EQ(op.Size(2), K);


for(int n = 0; n < N; n++) {
for(int m = 0; m < M; m++) {
for(int k = 0; k < K; k++) {
ASSERT_EQ(op(n,m,k) , tiv(k));
}
}
}

(tov = op).run();
cudaDeviceSynchronize();

for(int n = 0; n < N; n++) {
for(int m = 0; m < M; m++) {
for(int k = 0; k < K; k++) {
ASSERT_EQ(tov(n,m,k) , tiv(k));
}
}
}
}

{ // clone from 1D, keeping the middle dimension
auto tiv = make_tensor<TypeParam>({M});
auto tov = make_tensor<TypeParam>({N,M,K});

for(int m = 0; m < M; m++) {
tiv(m) = TypeParam(m);
}

auto op = clone<3>(tiv, {N, matxKeepDim, K});

ASSERT_EQ(op.Size(0), N);
ASSERT_EQ(op.Size(1), M);
ASSERT_EQ(op.Size(2), K);


for(int n = 0; n < N; n++) {
for(int m = 0; m < M; m++) {
for(int k = 0; k < K; k++) {
ASSERT_EQ(op(n,m,k) , tiv(m));
}
}
}

(tov = op).run();
cudaDeviceSynchronize();

for(int n = 0; n < N; n++) {
for(int m = 0; m < M; m++) {
for(int k = 0; k < K; k++) {
ASSERT_EQ(tov(n,m,k) , tiv(m));
}
}
}
}

{ // clone from 2D
auto tiv = make_tensor<TypeParam>({M,K});
auto tov = make_tensor<TypeParam>({N,M,K});

for(int m = 0; m < M; m++) {
for(int k = 0; k < K; k++) {
tiv(m,k) = TypeParam(m*K)+TypeParam(k);
}
}

auto op = clone<3>(tiv, {N, matxKeepDim, matxKeepDim});

ASSERT_EQ(op.Size(0), N);
ASSERT_EQ(op.Size(1), M);
ASSERT_EQ(op.Size(2), K);


for(int n = 0; n < N; n++) {
for(int m = 0; m < M; m++) {
for(int k = 0; k < K; k++) {
ASSERT_EQ(op(n,m,k) , tiv(m,k));
}
}
}

(tov = op).run();
cudaDeviceSynchronize();

for(int n = 0; n < N; n++) {
for(int m = 0; m < M; m++) {
for(int k = 0; k < K; k++) {
ASSERT_EQ(tov(n,m,k) , tiv(m,k));
}
}
}
}

{ // clone from 2D and operator
auto tiv = make_tensor<TypeParam>({M,K});
auto tov = make_tensor<TypeParam>({N,M,K});

for(int m = 0; m < M; m++) {
for(int k = 0; k < K; k++) {
tiv(m,k) = TypeParam(m*K)+TypeParam(k);
}
}

auto op = clone<3>(TypeParam(2)*tiv, {N, matxKeepDim, matxKeepDim});

ASSERT_EQ(op.Size(0), N);
ASSERT_EQ(op.Size(1), M);
ASSERT_EQ(op.Size(2), K);


for(int n = 0; n < N; n++) {
for(int m = 0; m < M; m++) {
for(int k = 0; k < K; k++) {
ASSERT_EQ(op(n,m,k) , TypeParam(2)*tiv(m,k));
}
}
}

(tov = op).run();
cudaDeviceSynchronize();

for(int n = 0; n < N; n++) {
for(int m = 0; m < M; m++) {
for(int k = 0; k < K; k++) {
ASSERT_EQ(tov(n,m,k) , TypeParam(2)*tiv(m,k));
}
}
}
}

MATX_EXIT_HANDLER();
}

TYPED_TEST(OperatorTestsNumericNonComplex, CollapseOp)
{
