junrushao commented on code in PR #12895:
URL: https://github.com/apache/tvm/pull/12895#discussion_r982931215
##########
src/meta_schedule/schedule_rule/schedule_rule.cc:
##########
@@ -51,6 +51,125 @@ ScheduleRule ScheduleRule::PyScheduleRule(
return ScheduleRule(n);
}
+Array<ScheduleRule> ScheduleRule::DefaultLLVM() {
+ return {
+ ScheduleRule::AutoInline(
+ /*into_producer=*/false,
+ /*into_consumer=*/true,
+ /*inline_const_tensor=*/true,
+ /*disallow_if_then_else=*/true,
+ /*require_injective=*/true,
+ /*require_ordered=*/true,
+ /*disallow_op=*/Array<String>{"tir.exp"}),
+ ScheduleRule::AddRFactor(
+ /*max_jobs_per_core=*/16,
+ /*max_innermost_factor=*/Integer(64)),
+ ScheduleRule::MultiLevelTiling(
+ /*structure=*/"SSRSRS",
+ /*tile_binds=*/NullOpt,
+ /*max_innermost_factor=*/Integer(64),
+ /*vector_load_lens=*/NullOpt,
+ /*reuse_read=*/NullOpt,
+ /*reuse_write=*/
+ Map<String, ObjectRef>{{"req", String("may")},
+ {"levels", Array<Integer>{1, 2}},
+ {"scope", String("global")}}),
+ ScheduleRule::ParallelizeVectorizeUnroll(
+ /*max_jobs_per_core=*/16,
+ /*max_vectorize_extent=*/64,
+ /*unroll_max_steps=*/Array<Integer>{0, 16, 64, 512},
+ /*unroll_explicit=*/true),
+ ScheduleRule::RandomComputeLocation(),
+ };
+}
+
+Array<ScheduleRule> ScheduleRule::DefaultCUDA() {
+ return {
+ ScheduleRule::MultiLevelTiling(
+ /*structure=*/"SSSRRSRS",
+ /*tile_binds=*/Array<String>{"blockIdx.x", "vthread.x",
"threadIdx.x"},
+ /*max_innermost_factor=*/Integer(64),
+ /*vector_load_lens=*/Array<Integer>{1, 2, 3, 4, 8, 16},
+ /*reuse_read=*/
+ Map<String, ObjectRef>{{"req", String("must")},
+ {"levels", Array<Integer>{4}}, //
+ {"scope", String("shared")}},
+ /*reuse_write=*/
+ Map<String, ObjectRef>{{"req", String("must")},
+ {"levels", Array<Integer>{3}}, //
+ {"scope", String("local")}}),
+ ScheduleRule::AutoInline(
+ /*into_producer=*/true,
+ /*into_consumer=*/true,
+ /*inline_const_tensor=*/true,
+ /*disallow_if_then_else=*/false,
+ /*require_injective=*/false,
+ /*require_ordered=*/false,
+ /*disallow_op=*/Array<String>{}),
+ ScheduleRule::CrossThreadReduction(
+ /*thread_extents=*/Array<Integer>{4, 8, 16, 32, 64, 128, 256, 512}),
+ ScheduleRule::ParallelizeVectorizeUnroll(
+ /*max_jobs_per_core=*/-1,
+ /*max_vectorize_extent=*/-1,
+ /*unroll_max_steps=*/Array<Integer>{0, 16, 64, 512, 1024},
+ /*unroll_explicit=*/true),
+ ScheduleRule::AutoBind(
+ /*max_threadblocks=*/256,
+ /*thread_extents*/ Array<Integer>{32, 64, 128, 256, 512, 1024}),
+ };
+}
+
+Array<ScheduleRule> ScheduleRule::DefaultCUDATensorCore() {
+ Array<Map<String, String>> intrin_groups = {
+ {
+ {"init", "wmma_fill_16x16x16_f16"},
+ {"load_a", "wmma_load_16x16x16_f16_a"},
+ {"load_b", "wmma_load_16x16x16_f16_b"},
+ {"compute", "wmma_sync_16x16x16_f16f16f16"},
+ {"store", "wmma_store_16x16x16_f16_shared"},
+ },
+ {
+ {"init", "wmma_fill_16x16x16_f16"},
+ {"load_a", "wmma_load_16x16x16_f16_a"},
+ {"load_b", "wmma_load_16x16x16_f16_b_trans"},
+ {"compute", "wmma_sync_16x16x16_f16f16f16_trans"},
+ {"store", "wmma_store_16x16x16_f16_shared"},
+ },
+ {
+ {"init", "wmma_fill_16x16x16_s32"},
+ {"load_a", "wmma_load_16x16x16_s8_a"},
+ {"load_b", "wmma_load_16x16x16_s8_b"},
+ {"compute", "wmma_sync_16x16x16_s8s8s32"},
+ {"store", "wmma_store_16x16x16_s32_shared"},
+ },
+ {
+ {"init", "wmma_fill_16x16x16_s32"},
+ {"load_a", "wmma_load_16x16x16_s8_a"},
+ {"load_b", "wmma_load_16x16x16_s8_b_trans"},
+ {"compute", "wmma_sync_16x16x16_s8s8s32_trans"},
+ {"store", "wmma_store_16x16x16_s32_shared"},
+ },
+ };
+ Array<ScheduleRule> results{ScheduleRule::MultiLevelTilingTensorCore(
+ /*intrin_groups=*/intrin_groups,
+ /*structure=*/"SSSRRSRS",
+ /*tile_binds=*/Array<String>{"blockIdx.x", "vthread.x", "threadIdx.x"},
+ /*max_innermost_factor=*/Integer(64),
+ /*vector_load_lens=*/Array<Integer>{1, 2, 3, 4, 8, 16},
+ /*reuse_read=*/
+ Map<String, ObjectRef>{{"req", String("must")},
+ {"levels", Array<Integer>{4}}, //
+ {"scope", String("shared")}},
+ /*reuse_write=*/
+ Map<String, ObjectRef>{{"req", String("must")},
+ {"levels", Array<Integer>{3}}, //
+ {"scope", String("local")}},
Review Comment:
Oops! Would you mind pointing out the correct one? Thanks a lot!
##########
src/meta_schedule/schedule_rule/schedule_rule.cc:
##########
@@ -51,6 +51,125 @@ ScheduleRule ScheduleRule::PyScheduleRule(
return ScheduleRule(n);
}
+Array<ScheduleRule> ScheduleRule::DefaultLLVM() {
+ return {
+ ScheduleRule::AutoInline(
+ /*into_producer=*/false,
+ /*into_consumer=*/true,
+ /*inline_const_tensor=*/true,
+ /*disallow_if_then_else=*/true,
+ /*require_injective=*/true,
+ /*require_ordered=*/true,
+ /*disallow_op=*/Array<String>{"tir.exp"}),
+ ScheduleRule::AddRFactor(
+ /*max_jobs_per_core=*/16,
+ /*max_innermost_factor=*/Integer(64)),
+ ScheduleRule::MultiLevelTiling(
+ /*structure=*/"SSRSRS",
+ /*tile_binds=*/NullOpt,
+ /*max_innermost_factor=*/Integer(64),
+ /*vector_load_lens=*/NullOpt,
+ /*reuse_read=*/NullOpt,
+ /*reuse_write=*/
+ Map<String, ObjectRef>{{"req", String("may")},
+ {"levels", Array<Integer>{1, 2}},
+ {"scope", String("global")}}),
+ ScheduleRule::ParallelizeVectorizeUnroll(
+ /*max_jobs_per_core=*/16,
+ /*max_vectorize_extent=*/64,
+ /*unroll_max_steps=*/Array<Integer>{0, 16, 64, 512},
+ /*unroll_explicit=*/true),
+ ScheduleRule::RandomComputeLocation(),
+ };
+}
+
+Array<ScheduleRule> ScheduleRule::DefaultCUDA() {
+ return {
+ ScheduleRule::MultiLevelTiling(
+ /*structure=*/"SSSRRSRS",
+ /*tile_binds=*/Array<String>{"blockIdx.x", "vthread.x",
"threadIdx.x"},
+ /*max_innermost_factor=*/Integer(64),
+ /*vector_load_lens=*/Array<Integer>{1, 2, 3, 4, 8, 16},
+ /*reuse_read=*/
+ Map<String, ObjectRef>{{"req", String("must")},
+ {"levels", Array<Integer>{4}}, //
+ {"scope", String("shared")}},
+ /*reuse_write=*/
+ Map<String, ObjectRef>{{"req", String("must")},
+ {"levels", Array<Integer>{3}}, //
+ {"scope", String("local")}}),
+ ScheduleRule::AutoInline(
+ /*into_producer=*/true,
+ /*into_consumer=*/true,
+ /*inline_const_tensor=*/true,
+ /*disallow_if_then_else=*/false,
+ /*require_injective=*/false,
+ /*require_ordered=*/false,
+ /*disallow_op=*/Array<String>{}),
+ ScheduleRule::CrossThreadReduction(
+ /*thread_extents=*/Array<Integer>{4, 8, 16, 32, 64, 128, 256, 512}),
+ ScheduleRule::ParallelizeVectorizeUnroll(
+ /*max_jobs_per_core=*/-1,
+ /*max_vectorize_extent=*/-1,
+ /*unroll_max_steps=*/Array<Integer>{0, 16, 64, 512, 1024},
+ /*unroll_explicit=*/true),
+ ScheduleRule::AutoBind(
+ /*max_threadblocks=*/256,
+ /*thread_extents*/ Array<Integer>{32, 64, 128, 256, 512, 1024}),
+ };
+}
+
+Array<ScheduleRule> ScheduleRule::DefaultCUDATensorCore() {
+ Array<Map<String, String>> intrin_groups = {
+ {
+ {"init", "wmma_fill_16x16x16_f16"},
+ {"load_a", "wmma_load_16x16x16_f16_a"},
+ {"load_b", "wmma_load_16x16x16_f16_b"},
+ {"compute", "wmma_sync_16x16x16_f16f16f16"},
+ {"store", "wmma_store_16x16x16_f16_shared"},
+ },
+ {
+ {"init", "wmma_fill_16x16x16_f16"},
+ {"load_a", "wmma_load_16x16x16_f16_a"},
+ {"load_b", "wmma_load_16x16x16_f16_b_trans"},
+ {"compute", "wmma_sync_16x16x16_f16f16f16_trans"},
+ {"store", "wmma_store_16x16x16_f16_shared"},
+ },
+ {
+ {"init", "wmma_fill_16x16x16_s32"},
+ {"load_a", "wmma_load_16x16x16_s8_a"},
+ {"load_b", "wmma_load_16x16x16_s8_b"},
+ {"compute", "wmma_sync_16x16x16_s8s8s32"},
+ {"store", "wmma_store_16x16x16_s32_shared"},
+ },
+ {
+ {"init", "wmma_fill_16x16x16_s32"},
+ {"load_a", "wmma_load_16x16x16_s8_a"},
+ {"load_b", "wmma_load_16x16x16_s8_b_trans"},
+ {"compute", "wmma_sync_16x16x16_s8s8s32_trans"},
+ {"store", "wmma_store_16x16x16_s32_shared"},
+ },
+ };
+ Array<ScheduleRule> results{ScheduleRule::MultiLevelTilingTensorCore(
+ /*intrin_groups=*/intrin_groups,
+ /*structure=*/"SSSRRSRS",
+ /*tile_binds=*/Array<String>{"blockIdx.x", "vthread.x", "threadIdx.x"},
+ /*max_innermost_factor=*/Integer(64),
+ /*vector_load_lens=*/Array<Integer>{1, 2, 3, 4, 8, 16},
+ /*reuse_read=*/
+ Map<String, ObjectRef>{{"req", String("must")},
+ {"levels", Array<Integer>{4}}, //
+ {"scope", String("shared")}},
+ /*reuse_write=*/
+ Map<String, ObjectRef>{{"req", String("must")},
+ {"levels", Array<Integer>{3}}, //
+ {"scope", String("local")}},
Review Comment:
Oops! Would you mind pointing out the correct one? Thanks a lot!
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]