
Merge pull request #11 from rohany/update-summa
legion/summaMM: update summa generation and driver code
rohany authored Jun 6, 2021
2 parents 7f780f3 + f91752b commit 8a3ea89
Showing 2 changed files with 35 additions and 20 deletions.
41 changes: 26 additions & 15 deletions legion/summaMM/main.cpp
@@ -4,31 +4,49 @@

using namespace Legion;

-typedef int32_t valType;
+typedef double valType;

// Defined by the generated TACO code.
void registerTacoTasks();
-LogicalPartition placeLegionA(Context ctx, Runtime* runtime, LogicalRegion a);
-LogicalPartition placeLegionB(Context ctx, Runtime* runtime, LogicalRegion b);
-LogicalPartition placeLegionC(Context ctx, Runtime* runtime, LogicalRegion c);
+LogicalPartition placeLegionA(Context ctx, Runtime* runtime, LogicalRegion a, int gx, int gy);
+LogicalPartition placeLegionB(Context ctx, Runtime* runtime, LogicalRegion b, int gx, int gy);
+LogicalPartition placeLegionC(Context ctx, Runtime* runtime, LogicalRegion c, int gx, int gy);
void computeLegion(Context ctx, Runtime* runtime, LogicalRegion a, LogicalRegion b, LogicalRegion c, LogicalPartition aPartition);

void top_level_task(const Task* task, const std::vector<PhysicalRegion>& regions, Context ctx, Runtime* runtime) {
  // Create the regions.
  auto args = runtime->get_input_args();
  int n = -1;
+  int gx = -1;
+  int gy = -1;
  // Parse input args.
  for (int i = 1; i < args.argc; i++) {
    if (strcmp(args.argv[i], "-n") == 0) {
      n = atoi(args.argv[++i]);
      continue;
    }
+    if (strcmp(args.argv[i], "-gx") == 0) {
+      gx = atoi(args.argv[++i]);
+      continue;
+    }
+    if (strcmp(args.argv[i], "-gy") == 0) {
+      gy = atoi(args.argv[++i]);
+      continue;
+    }
    // TODO (rohany): Add a flag to do the validation or not.
  }
  if (n == -1) {
    std::cout << "Please provide an input matrix size with -n." << std::endl;
    return;
  }
+  if (gx == -1) {
+    std::cout << "Please provide a grid x size with -gx." << std::endl;
+    return;
+  }
+  if (gy == -1) {
+    std::cout << "Please provide a grid y size with -gy." << std::endl;
+    return;
+  }

  auto fspace = runtime->create_field_space(ctx);
  allocate_tensor_fields<valType>(ctx, runtime, fspace);
@@ -39,21 +57,14 @@ void top_level_task(const Task* task, const std::vector<PhysicalRegion>& regions
  tacoFill<valType>(ctx, runtime, A, 0); tacoFill<valType>(ctx, runtime, B, 1); tacoFill<valType>(ctx, runtime, C, 1);

  // Place the tensors.
-  auto part = placeLegionA(ctx, runtime, A);
-  placeLegionB(ctx, runtime, B);
-  placeLegionC(ctx, runtime, C);
+  auto part = placeLegionA(ctx, runtime, A, gx, gy);
+  placeLegionB(ctx, runtime, B, gx, gy);
+  placeLegionC(ctx, runtime, C, gx, gy);

  // Compute on the tensors.
  benchmark([&]() { computeLegion(ctx, runtime, A, B, C, part); });

-  auto a_reg = getRegionToWrite(ctx, runtime, A, A);
-  FieldAccessor<READ_WRITE,valType,2,coord_t, Realm::AffineAccessor<valType, 2, coord_t>> a_rw(a_reg, FID_VAL);
-  for (int i = 0; i < n; i++) {
-    for (int j = 0; j < n; j++) {
-      assert(a_rw[Point<2>(i, j)] == n);
-    }
-  }
-  runtime->unmap_region(ctx, a_reg);
+  tacoValidate<valType>(ctx, runtime, A, valType(n));
}

TACO_MAIN(valType)
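With this change the driver takes the processor grid dimensions on the command line in addition to the matrix size. A hypothetical invocation (the binary name is assumed; use whatever the summaMM build target actually produces) that multiplies two 8192x8192 matrices over a 4x4 grid:

  ./summaMM -n 8192 -gx 4 -gy 4

Since B and C are filled with ones, every entry of the result A should equal n, which is what tacoValidate checks.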
14 changes: 9 additions & 5 deletions test/tests-distributed.cpp
@@ -119,16 +119,18 @@ TEST(distributed, basicComputeOnto) {

TEST(distributed, summaMM) {
  int dim = 10;
-  Tensor<int> a("a", {dim, dim}, Format{Dense, Dense});
-  Tensor<int> b("b", {dim, dim}, Format{Dense, Dense});
-  Tensor<int> c("c", {dim, dim}, Format{Dense, Dense});
+  Tensor<double> a("a", {dim, dim}, Format{Dense, Dense});
+  Tensor<double> b("b", {dim, dim}, Format{Dense, Dense});
+  Tensor<double> c("c", {dim, dim}, Format{Dense, Dense});

IndexVar i("i"), j("j"), in("in"), jn("jn"), il("il"), jl("jl"), k("k"), ki("ki"), ko("ko");

a(i, j) = b(i, k) * c(k, j);

  // Place each tensor onto a processor grid.
-  auto grid = Grid(2, 2);
+  auto gx = ir::Var::make("gridX", Int32, false, false, true);
+  auto gy = ir::Var::make("gridY", Int32, false, false, true);
+  auto grid = Grid(gx, gy);
  auto placement = GridPlacement({0, 1});
  auto placeA = a.partition(grid).place(grid, placement);
  auto placeB = b.partition(grid).place(grid, placement);
@@ -137,13 +139,15 @@ TEST(distributed, summaMM) {
  auto placeBLowered = lower(placeB, "placeLegionB", false, true);
  auto placeCLowered = lower(placeC, "placeLegionC", false, true);

+  std::shared_ptr<LeafCallInterface> gemm = std::make_shared<GEMM>();
  auto stmt = a.getAssignment().concretize();
  stmt = stmt
    .distributeOnto({i, j}, {in, jn}, {il, jl}, a(i, j))
-    .split(k, ko, ki, 256)
+    .split(k, ko, ki, 512)
    .reorder({ko, il, jl})
    .pushCommUnder(b(i, k), ko)
    .pushCommUnder(c(k, j), ko)
+    .swapLeafKernel(il, gemm)
    ;

  auto lowered = lower(stmt, "computeLegion", false, true);
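The scheduling changes are the substance of this commit: the k split doubles from 256 to 512, and swapLeafKernel(il, gemm) replaces the leaf loop nest under il with a call to a GEMM leaf kernel. That swap also explains the move of valType from int32_t to double, since BLAS GEMM routines operate on floating-point data. A minimal sketch of what the swap does at the leaf, assuming row-major tiles with hypothetical dimensions tileM, tileN, tileK, and using cblas_dgemm as a stand-in for whatever kernel the generated code actually emits:

  // Hypothetical leaf of the SUMMA schedule before swapLeafKernel: a plain
  // loop nest computing A += B * C over one tileM x tileN block of A.
  for (int il = 0; il < tileM; il++)
    for (int jl = 0; jl < tileN; jl++)
      for (int ki = 0; ki < tileK; ki++)
        a[il * tileN + jl] += b[il * tileK + ki] * c[ki * tileN + jl];

  // After swapLeafKernel(il, gemm), the same block update is issued as one
  // BLAS call (requires #include <cblas.h>; an assumed stand-in, not the
  // actual generated code): A += 1.0 * B * C.
  cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
              tileM, tileN, tileK,
              /* alpha */ 1.0, b, tileK, c, tileN,
              /* beta */ 1.0, a, tileN);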
