20 typename ComputeDataType,
24 typename AMmaTileDesc,
25 typename BMmaTileDesc,
26 index_t ABlockTransferSrcScalarPerVector,
27 index_t BBlockTransferSrcScalarPerVector,
43 typename ComputeDataType,
47 typename AMmaTileDesc,
48 typename BMmaTileDesc,
49 index_t ABlockTransferSrcScalarPerVector,
50 index_t BBlockTransferSrcScalarPerVector,
71 ABlockTransferSrcScalarPerVector,
72 BBlockTransferSrcScalarPerVector,
90 ABlockTransferSrcScalarPerVector,
91 BBlockTransferSrcScalarPerVector,
111 ABlockTransferSrcScalarPerVector,
112 BBlockTransferSrcScalarPerVector,
150 (MPerBlock *
sizeof(ADataType) + NPerBlock *
sizeof(BDataType)) * KPerBlock);
200 template <
bool HasMainLoop,
204 typename ABlockTransfer,
205 typename AGridBuffer,
206 typename ABlockBuffer,
207 typename ABlockTransferStep,
210 typename BBlockTransfer,
211 typename BGridBuffer,
212 typename BBlockBuffer,
213 typename BBlockTransferStep,
214 typename CThreadBuffer>
215 __device__
void Run(
const AGridDesc& a_grid_desc,
216 const ABlockDesc& a_block_desc,
217 ABlockTransfer& a_blockwise_copy,
218 const AGridBuffer& a_grid_buf,
219 ABlockBuffer& a_block_buf,
220 const ABlockTransferStep& a_block_copy_step,
221 const BGridDesc& b_grid_desc,
222 const BBlockDesc& b_block_desc,
223 BBlockTransfer& b_blockwise_copy,
224 const BGridBuffer& b_grid_buf,
225 BBlockBuffer& b_block_buf,
226 const BBlockTransferStep& b_block_copy_step,
227 CThreadBuffer& c_thread_buf,
236 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf,
I0);
237 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf,
I0);
239 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
240 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
243 c_thread_buf.Clear();
246 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf,
I0);
247 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf,
I0);
251 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, iprefetch);
252 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, iprefetch);
254 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
255 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
259 if constexpr(HasMainLoop)
294 a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
297 b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
302 using mfma_input_type =
310 a_thread_vec.template AsType<mfma_input_type>(),
311 b_thread_vec.template AsType<mfma_input_type>(),
318 a_blockwise_copy.RunWrite(
320 b_blockwise_copy.RunWrite(
323 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, iprefetch);
324 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, iprefetch);
326 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
327 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
336 auto LoopTailFunc = [&](
auto tail_num) {
365 a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
368 b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
373 using mfma_input_type =
381 a_thread_vec.template AsType<mfma_input_type>(),
382 b_thread_vec.template AsType<mfma_input_type>(),
389 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, iprefetch);
390 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, iprefetch);
420 a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
423 b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
428 using mfma_input_type =
434 xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
435 b_thread_vec.template AsType<mfma_input_type>(),
471 a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
474 b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
479 using mfma_input_type =
485 xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
486 b_thread_vec.template AsType<mfma_input_type>(),
523 using Base::a_thread_copy_;
524 using Base::a_thread_desc_;
525 using Base::b_thread_copy_;
526 using Base::b_thread_desc_;
527 using Base::c_thread_desc_;
533 typename ComputeDataType,
534 typename AccDataType,
537 typename AMmaTileDesc,
538 typename BMmaTileDesc,
539 index_t ABlockTransferSrcScalarPerVector,
540 index_t BBlockTransferSrcScalarPerVector,
561 ABlockTransferSrcScalarPerVector,
562 BBlockTransferSrcScalarPerVector,
580 ABlockTransferSrcScalarPerVector,
581 BBlockTransferSrcScalarPerVector,
601 ABlockTransferSrcScalarPerVector,
602 BBlockTransferSrcScalarPerVector,
644 (MPerBlock *
sizeof(ADataType) + NPerBlock *
sizeof(BDataType)) * KPerBlock);
694 template <
bool HasMainLoop,
698 typename ABlockTransfer,
699 typename AGridBuffer,
700 typename ABlockBuffer,
701 typename ABlockTransferStep,
704 typename BBlockTransfer,
705 typename BGridBuffer,
706 typename BBlockBuffer,
707 typename BBlockTransferStep,
708 typename CThreadBuffer,
709 typename BScaleGridBuffer,
710 typename BScaleGridDesc,
711 typename BScaleThreadDesc,
712 typename BScaleThreadTransfer,
713 typename BScaleThreadTransferStep>
714 __device__
void Run(
const AGridDesc& a_grid_desc,
715 const ABlockDesc& a_block_desc,
716 ABlockTransfer& a_blockwise_copy,
717 const AGridBuffer& a_grid_buf,
718 ABlockBuffer& a_block_buf,
719 const ABlockTransferStep& a_block_copy_step,
720 const BGridDesc& b_grid_desc,
721 const BBlockDesc& b_block_desc,
722 BBlockTransfer& b_blockwise_copy,
723 const BGridBuffer& b_grid_buf,
724 BBlockBuffer& b_block_buf,
725 const BBlockTransferStep& b_block_copy_step,
726 CThreadBuffer& c_thread_buf,
727 const BScaleGridDesc& b_scale_grid_desc,
729 const BScaleThreadDesc& b_scale_thread_desc,
730 BScaleThreadTransfer& b_scale_thread_copy,
731 const BScaleGridBuffer& b_scale_grid_buf,
732 const BScaleThreadTransferStep& b_scale_thread_copy_step,
735 index_t num_loop_per_scale)
const
737 ignore = num_loop_per_scale;
745 b_scale_thread_desc.GetElementSpaceSize());
748 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf,
I0);
749 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf,
I0);
751 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
752 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
755 b_scale_thread_copy.Run(b_scale_grid_desc,
761 b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
762 b_scale_thread_copy_step.At(
Number<0>{}));
764 b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
765 b_scale_thread_copy_step.At(
Number<1>{}));
768 c_thread_buf.Clear();
771 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf,
I0);
772 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf,
I0);
776 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, iprefetch);
777 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, iprefetch);
779 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
780 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
786 if constexpr(HasMainLoop)
812 __builtin_amdgcn_sched_barrier(0);
820 if constexpr(k0.value != 0 ||
KRepeat == 1)
822 __builtin_amdgcn_s_barrier();
823 __builtin_amdgcn_sched_barrier(0);
832 a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
835 b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
840 using mfma_input_type =
853 if constexpr(k0.value ==
KRepeat - 1 &&
855 m0.value == MRepeat - 1 && n0.value == NRepeat - 1)
857 __builtin_amdgcn_sched_barrier(0);
859 __builtin_amdgcn_sched_barrier(0);
862 a_thread_vec.template AsType<mfma_input_type>(),
863 b_thread_vec.template AsType<mfma_input_type>(),
865 if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0)
867 __builtin_amdgcn_sched_barrier(0);
868 __builtin_amdgcn_s_setprio(1);
869 __builtin_amdgcn_sched_barrier(0);
883 __builtin_amdgcn_sched_barrier(0);
884 __builtin_amdgcn_s_setprio(0);
885 __builtin_amdgcn_sched_barrier(0);
902 a_blockwise_copy.RunWrite(
904 b_blockwise_copy.RunWrite(
907 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, iprefetch);
908 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, iprefetch);
910 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
911 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
919 auto LoopTailFunc = [&](
auto tail_num) {
940 __builtin_amdgcn_sched_barrier(0);
941 if constexpr(k0.value != 0 ||
KRepeat == 1)
943 __builtin_amdgcn_s_barrier();
944 __builtin_amdgcn_sched_barrier(0);
953 a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
956 b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
961 using mfma_input_type =
968 if constexpr(k0.value ==
KRepeat - 1 &&
970 m0.value == MRepeat - 1 && n0.value == NRepeat - 1)
972 __builtin_amdgcn_sched_barrier(0);
974 __builtin_amdgcn_sched_barrier(0);
977 a_thread_vec.template AsType<mfma_input_type>(),
978 b_thread_vec.template AsType<mfma_input_type>(),
980 if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0)
982 __builtin_amdgcn_sched_barrier(0);
983 __builtin_amdgcn_s_setprio(1);
984 __builtin_amdgcn_sched_barrier(0);
997 __builtin_amdgcn_sched_barrier(0);
998 __builtin_amdgcn_s_setprio(0);
999 __builtin_amdgcn_sched_barrier(0);
1015 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, iprefetch);
1016 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, iprefetch);
1037 __builtin_amdgcn_sched_barrier(0);
1038 if constexpr(k0.value != 0 ||
KRepeat == 1)
1040 __builtin_amdgcn_s_barrier();
1041 __builtin_amdgcn_sched_barrier(0);
1050 a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
1053 b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
1058 using mfma_input_type =
1065 if constexpr(k0.value ==
KRepeat - 1 &&
1067 m0.value == MRepeat - 1 && n0.value == NRepeat - 1)
1069 __builtin_amdgcn_sched_barrier(0);
1071 __builtin_amdgcn_sched_barrier(0);
1074 a_thread_vec.template AsType<mfma_input_type>(),
1075 b_thread_vec.template AsType<mfma_input_type>(),
1077 if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0)
1079 __builtin_amdgcn_sched_barrier(0);
1080 __builtin_amdgcn_s_setprio(1);
1081 __builtin_amdgcn_sched_barrier(0);
1094 __builtin_amdgcn_sched_barrier(0);
1095 __builtin_amdgcn_s_setprio(0);
1096 __builtin_amdgcn_sched_barrier(0);
1121 __builtin_amdgcn_sched_barrier(0);
1122 if constexpr(k0.value != 0 ||
KRepeat == 1)
1124 __builtin_amdgcn_s_barrier();
1125 __builtin_amdgcn_sched_barrier(0);
1134 a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
1137 b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
1142 using mfma_input_type =
1149 if constexpr(k0.value ==
KRepeat - 1 &&
1151 m0.value == MRepeat - 1 && n0.value == NRepeat - 1)
1153 __builtin_amdgcn_sched_barrier(0);
1155 __builtin_amdgcn_sched_barrier(0);
1158 a_thread_vec.template AsType<mfma_input_type>(),
1159 b_thread_vec.template AsType<mfma_input_type>(),
1161 if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0)
1163 __builtin_amdgcn_sched_barrier(0);
1164 __builtin_amdgcn_s_setprio(1);
1165 __builtin_amdgcn_sched_barrier(0);
1178 __builtin_amdgcn_sched_barrier(0);
1179 __builtin_amdgcn_s_setprio(0);
1180 __builtin_amdgcn_sched_barrier(0);
1251 using Base::c_thread_desc_;
#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS
Definition ck.hpp:209
__host__ __device__ constexpr T max(T x)
Definition utility/math.hpp:84
__host__ __device__ constexpr auto integer_divide_ceil(X x, Y y)
Definition utility/math.hpp:72
__host__ __device__ constexpr auto make_static_buffer(Number< N >)
Definition static_buffer.hpp:186
int32_t index_t
Definition ck.hpp:299
__host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple< Lengths... > &lengths, const Tuple< Strides... > &strides)
Definition tensor_descriptor_helper.hpp:49
remove_cv_t< remove_reference_t< T > > remove_cvref_t
Definition type.hpp:297
integral_constant< index_t, N > Number
Definition number.hpp:12
TailNumber
Definition blkgemmpipe_scheduler.hpp:31
@ One
Definition blkgemmpipe_scheduler.hpp:37
@ Seven
Definition blkgemmpipe_scheduler.hpp:43
@ Four
Definition blkgemmpipe_scheduler.hpp:40
@ Two
Definition blkgemmpipe_scheduler.hpp:38
@ Full
Definition blkgemmpipe_scheduler.hpp:49
@ Three
Definition blkgemmpipe_scheduler.hpp:39
@ Five
Definition blkgemmpipe_scheduler.hpp:41
@ Six
Definition blkgemmpipe_scheduler.hpp:42
constexpr detail::ignore_t ignore
Definition utility/ignore.hpp:20
BlockGemmPipelineScheduler
Definition blkgemmpipe_scheduler.hpp:25
@ Intrawave
Definition blkgemmpipe_scheduler.hpp:26
@ Interwave
Definition blkgemmpipe_scheduler.hpp:27
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
__device__ void block_sync_lds()
Definition synchronization.hpp:16
__host__ __device__ BlockwiseGemmXdlops_pipeline_base(Tuple4 a_origin=CalculateAThreadOriginDataIndex(), Tuple4 b_origin=CalculateBThreadOriginDataIndex())
Constructor for BlockwiseGemmXdlops_pipeline_base.
Definition blockwise_gemm_pipeline_xdlops_base.hpp:222
__host__ static __device__ constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:280
__host__ static __device__ constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:239
static constexpr auto xdlops_gemm
Definition blockwise_gemm_pipeline_xdlops_base.hpp:54
conditional_t< std::is_same< ComputeDataType, ck::tf32_t >::value, float, ComputeDataType > ComputeDataTypeBuf
Definition blockwise_gemm_pipeline_xdlops_base.hpp:57
static __device__ auto CalculateBThreadOriginDataIndex()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:147
static constexpr BMmaTileDesc b_block_desc_n0_n1_n2_k
Definition blockwise_gemm_pipeline_xdlops_base.hpp:360
static constexpr auto I1
Definition blockwise_gemm_pipeline_xdlops_base.hpp:37
__host__ static __device__ constexpr auto GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:266
__host__ static __device__ constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:294
static constexpr index_t AMmaKStride
Definition blockwise_gemm_pipeline_xdlops_base.hpp:60
static __device__ auto CalculateAThreadOriginDataIndex()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:125
static constexpr index_t WaveSize
Definition blockwise_gemm_pipeline_xdlops_base.hpp:46
__host__ static __device__ constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:253
static constexpr index_t B_K1
Definition blockwise_gemm_pipeline_xdlops_base.hpp:51
__host__ __device__ constexpr auto & GetCThreadBuffer()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:111
static constexpr auto I0
Definition blockwise_gemm_pipeline_xdlops_base.hpp:36
static __device__ auto CalculateCThreadOriginDataIndex(Number< m0 >, Number< n0 >, Number< xdlops_i >, Number< blk_i >)
Definition blockwise_gemm_pipeline_xdlops_base.hpp:160
static __device__ auto CalculateCThreadOriginDataIndex8D(Number< m0 >, Number< n0 >, Number< xdlops_i >, Number< blk_i >)
Definition blockwise_gemm_pipeline_xdlops_base.hpp:189
static constexpr index_t KRepeat
Definition blockwise_gemm_pipeline_xdlops_base.hpp:64
static constexpr AMmaTileDesc a_block_desc_m0_m1_m2_k
Definition blockwise_gemm_pipeline_xdlops_base.hpp:359
static constexpr index_t A_K1
Definition blockwise_gemm_pipeline_xdlops_base.hpp:50
static constexpr index_t BMmaKStride
Definition blockwise_gemm_pipeline_xdlops_base.hpp:61
__host__ static __device__ constexpr auto MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_G_M_N &c_grid_desc_g_m_n)
Definition blockwise_gemm_pipeline_xdlops_base.hpp:341
static constexpr index_t KPerThread
Definition blockwise_gemm_pipeline_xdlops_base.hpp:63
__host__ static __device__ constexpr auto GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:307
__host__ static __device__ constexpr auto MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_M_N &c_grid_desc_m_n)
Definition blockwise_gemm_pipeline_xdlops_base.hpp:324
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Base BlockwiseGemmXdlops_pipeline_base< BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack > Base
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:102
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::PrefetchStages static constexpr index_t PrefetchStages
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:151
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Run __device__ void Run(const AGridDesc &a_grid_desc, const ABlockDesc &a_block_desc, ABlockTransfer &a_blockwise_copy, const AGridBuffer &a_grid_buf, ABlockBuffer &a_block_buf, const ABlockTransferStep &a_block_copy_step, const BGridDesc &b_grid_desc, const BBlockDesc &b_block_desc, BBlockTransfer &b_blockwise_copy, const BGridBuffer &b_grid_buf, BBlockBuffer &b_block_buf, const BBlockTransferStep &b_block_copy_step, CThreadBuffer &c_thread_buf, index_t num_loop) const
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:215
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::c_thread_desc_ static constexpr auto c_thread_desc_
Definition blockwise_gemm_pipeline_xdlops_base.hpp:378
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::xdlops_gemm static constexpr auto xdlops_gemm
Definition blockwise_gemm_pipeline_xdlops_base.hpp:54
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_block_desc_n0_n1_n2_k static constexpr BMmaTileDesc b_block_desc_n0_n1_n2_k
Definition blockwise_gemm_pipeline_xdlops_base.hpp:360
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BlockLoopTailNum __host__ static __device__ constexpr TailNumber BlockLoopTailNum(index_t num_loop)
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:164
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_thread_copy_ BThreadCopy b_thread_copy_
Definition blockwise_gemm_pipeline_xdlops_base.hpp:402
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BlockHasHotloop __host__ static __device__ constexpr bool BlockHasHotloop(index_t num_loop)
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:159
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::WaveSize static constexpr index_t WaveSize
Definition blockwise_gemm_pipeline_xdlops_base.hpp:46
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ComputeDataTypeBuf typename Base::ComputeDataTypeBuf ComputeDataTypeBuf
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:144
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::I0 static constexpr auto I0
Definition blockwise_gemm_pipeline_xdlops_base.hpp:36
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GlobalBufferNum static constexpr index_t GlobalBufferNum
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:157
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::FullMemBandPrefetchStages static constexpr index_t FullMemBandPrefetchStages
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:148
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_thread_desc_ static constexpr auto a_thread_desc_
Definition blockwise_gemm_pipeline_xdlops_base.hpp:366
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::WgpPerCU static constexpr index_t WgpPerCU
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:146
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_block_desc_m0_m1_m2_k static constexpr AMmaTileDesc a_block_desc_m0_m1_m2_k
Definition blockwise_gemm_pipeline_xdlops_base.hpp:359
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_thread_desc_ static constexpr auto b_thread_desc_
Definition blockwise_gemm_pipeline_xdlops_base.hpp:372
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_thread_copy_ AThreadCopy a_thread_copy_
Definition blockwise_gemm_pipeline_xdlops_base.hpp:401
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::PrefillStages static constexpr index_t PrefillStages
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:156
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_thread_desc_ static constexpr auto b_thread_desc_
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:1222
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::FullMemBandPrefetchStages static constexpr index_t FullMemBandPrefetchStages
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:642
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::KRepeat static constexpr index_t KRepeat
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:638
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::PrefillStages static constexpr index_t PrefillStages
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:650
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Base BlockwiseGemmXdlops_pipeline_base< BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack > Base
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:592
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BThreadCopy ThreadwiseTensorSliceTransfer_v4< BDataType, ComputeDataTypeBuf, decltype(b_block_desc_n0_n1_n2_k), decltype(b_thread_desc_), Sequence< 1, 1, 1, KPerInnerLoop >, Sequence< 0, 1, 2, 3 >, 3, B_K1, B_K1 > BThreadCopy
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:1239
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::c_thread_desc_ static constexpr auto c_thread_desc_
Definition blockwise_gemm_pipeline_xdlops_base.hpp:378
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::xdlops_gemm static constexpr auto xdlops_gemm
Definition blockwise_gemm_pipeline_xdlops_base.hpp:54
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_block_desc_n0_n1_n2_k static constexpr BMmaTileDesc b_block_desc_n0_n1_n2_k
Definition blockwise_gemm_pipeline_xdlops_base.hpp:360
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_thread_desc_ static constexpr auto a_thread_desc_
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:1215
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ComputeDataTypeBuf typename Base::ComputeDataTypeBuf ComputeDataTypeBuf
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:634
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BlockLoopTailNum __host__ static __device__ constexpr TailNumber BlockLoopTailNum(index_t num_loop)
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:658
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::WaveSize static constexpr index_t WaveSize
Definition blockwise_gemm_pipeline_xdlops_base.hpp:46
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::NumMacClusters static constexpr index_t NumMacClusters
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:636
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::B_K1 static constexpr index_t B_K1
Definition blockwise_gemm_pipeline_xdlops_base.hpp:51
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::I0 static constexpr auto I0
Definition blockwise_gemm_pipeline_xdlops_base.hpp:36
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::WgpPerCU static constexpr index_t WgpPerCU
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:640
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GlobalBufferNum static constexpr index_t GlobalBufferNum
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:651
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_thread_copy_ BThreadCopy b_thread_copy_
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:1250
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_thread_copy_ AThreadCopy a_thread_copy_
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:1249
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::AThreadCopy ThreadwiseTensorSliceTransfer_v4< ADataType, ComputeDataTypeBuf, decltype(a_block_desc_m0_m1_m2_k), decltype(a_thread_desc_), Sequence< 1, 1, 1, KPerInnerLoop >, Sequence< 0, 1, 2, 3 >, 3, A_K1, A_K1 > AThreadCopy
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:1229
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_block_desc_m0_m1_m2_k static constexpr AMmaTileDesc a_block_desc_m0_m1_m2_k
Definition blockwise_gemm_pipeline_xdlops_base.hpp:359
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Run __device__ void Run(const AGridDesc &a_grid_desc, const ABlockDesc &a_block_desc, ABlockTransfer &a_blockwise_copy, const AGridBuffer &a_grid_buf, ABlockBuffer &a_block_buf, const ABlockTransferStep &a_block_copy_step, const BGridDesc &b_grid_desc, const BBlockDesc &b_block_desc, BBlockTransfer &b_blockwise_copy, const BGridBuffer &b_grid_buf, BBlockBuffer &b_block_buf, const BBlockTransferStep &b_block_copy_step, CThreadBuffer &c_thread_buf, const BScaleGridDesc &b_scale_grid_desc, const BScaleThreadDesc &b_scale_thread_desc, BScaleThreadTransfer &b_scale_thread_copy, const BScaleGridBuffer &b_scale_grid_buf, const BScaleThreadTransferStep &b_scale_thread_copy_step, index_t num_loop, index_t num_loop_per_scale) const
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:714
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::A_K1 static constexpr index_t A_K1
Definition blockwise_gemm_pipeline_xdlops_base.hpp:50
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BlockHasHotloop __host__ static __device__ constexpr bool BlockHasHotloop(index_t num_loop)
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:653
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::KPerThread static constexpr index_t KPerThread
Definition blockwise_gemm_pipeline_xdlops_base.hpp:63
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::KPerInnerLoop static constexpr index_t KPerInnerLoop
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:637
ck::BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::PrefetchStages static constexpr index_t PrefetchStages
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:645
Definition blockwise_gemm_pipeline_xdlops_v2_b_scale.hpp:37
Definition utility/sequence.hpp:43
Definition threadwise_tensor_slice_transfer.hpp:1260
Definition functional2.hpp:33
Definition dtype_vector.hpp:10