amd_buffer_addressing.hpp File Reference

amd_buffer_addressing.hpp File Reference#

Composable Kernel: amd_buffer_addressing.hpp File Reference
amd_buffer_addressing.hpp File Reference

Go to the source code of this file.

Classes

struct  ck_tile::buffer_resource
struct  ck_tile::impl::buffer_load_trait< 16, T >
struct  ck_tile::impl::buffer_load_trait< 8, T >
struct  ck_tile::impl::buffer_load_trait< 4, T >
struct  ck_tile::impl::buffer_load_trait< 2, T >
struct  ck_tile::impl::buffer_load_trait< 1, T >
struct  ck_tile::buffer_load< 16, pre_nop >
struct  ck_tile::buffer_load< 8, pre_nop >
struct  ck_tile::buffer_load< 4, pre_nop >
struct  ck_tile::buffer_load< 2, pre_nop >
struct  ck_tile::buffer_load< 1, pre_nop >
struct  ck_tile::buffer_load_if< 16, pre_nop >
struct  ck_tile::buffer_load_if< 8, pre_nop >
struct  ck_tile::buffer_load_if< 4, pre_nop >
struct  ck_tile::buffer_load_if< 2, pre_nop >
struct  ck_tile::buffer_load_if< 1, pre_nop >
struct  ck_tile::buffer_store< 16 >
struct  ck_tile::buffer_store< 8 >
struct  ck_tile::buffer_store< 4 >
struct  ck_tile::buffer_store< 2 >
struct  ck_tile::buffer_store< 1 >
struct  ck_tile::buffer_store_if< 16 >
struct  ck_tile::buffer_store_if< 8 >
struct  ck_tile::buffer_store_if< 4 >
struct  ck_tile::buffer_store_if< 2 >
struct  ck_tile::buffer_store_if< 1 >
struct  ck_tile::buffer_atomic_add_if< bf16_t, 2, pre_nop >
struct  ck_tile::buffer_atomic_add< bf16_t, 2, pre_nop >
struct  ck_tile::impl::smem_load_trait< 16, T >
struct  ck_tile::impl::smem_load_trait< 8, T >
struct  ck_tile::impl::smem_load_trait< 4, T >
struct  ck_tile::impl::smem_load_trait< 2, T >
struct  ck_tile::impl::smem_load_trait< 1, T >
struct  ck_tile::smem_load< 16 >
struct  ck_tile::smem_load< 8 >
struct  ck_tile::smem_load< 4 >
struct  ck_tile::smem_load< 2 >
struct  ck_tile::smem_load< 1 >

Namespaces

namespace  ck_tile
namespace  ck_tile::impl

Macros

#define LIKELY(x)
#define HAS_RAW_BUFFER_BUILTINS
#define CK_TILE_ASYNC_LOAD_WITH_INSTR(instr)
#define BUFFER_LOAD_USE_INLINEASM   0

Typedefs

using as3_uint32_ptr = uint32_t *

Enumerations

enum struct  ck_tile::amd_buffer_coherence_enum {
  ck_tile::coherence_default = 0 ,
  ck_tile::glc = 1 ,
  ck_tile::slc = 2 ,
  ck_tile::glc_slc = 3 ,
  ck_tile::WAVE_NT0 = 0 ,
  ck_tile::WAVE_NT1 = 2 ,
  ck_tile::GROUP_NT0 = 1 ,
  ck_tile::GROUP_NT1 = 3 ,
  ck_tile::DEVICE_NT0 = 8 ,
  ck_tile::DEVICE_NT1 = 10 ,
  ck_tile::SYSTEM_NT0 = 9 ,
  ck_tile::SYSTEM_NT1 = 11
}

Functions

__device__ uint32_t ck_tile::amd_wave_read_first_lane (uint16_t v)
__device__ uint32_t ck_tile::amd_wave_read_first_lane (uint8_t v)
__device__ uint32_t ck_tile::amd_wave_read_first_lane (uint32_t value)
__device__ int32_t ck_tile::amd_wave_read_first_lane (int32_t value)
template<typename Object, std::enable_if_t< std::is_trivially_copyable_v< Object >, int > = 0>
__device__ auto ck_tile::amd_wave_read_first_lane (const Object &obj)
template<typename ForceSGPR = std::false_type>
CK_TILE_DEVICE int32x4_t ck_tile::make_wave_buffer_resource (const void *ptr, uint32_t size=0xffffffff, ForceSGPR={})
CK_TILE_DEVICE void ck_tile::buffer_load_fence (index_t cnt=0)
CK_TILE_DEVICE void ck_tile::lds_load_fence (index_t cnt=0)
template<index_t N>
CK_TILE_DEVICE void ck_tile::impl::insert_dummy_dep_per_dword (array< float, N > &b)
template<>
CK_TILE_DEVICE void ck_tile::impl::insert_dummy_dep_per_dword< 2 > (array< float, 2 > &b)
template<>
CK_TILE_DEVICE void ck_tile::impl::insert_dummy_dep_per_dword< 3 > (array< float, 3 > &b)
template<>
CK_TILE_DEVICE void ck_tile::impl::insert_dummy_dep_per_dword< 4 > (array< float, 4 > &b)
template<>
CK_TILE_DEVICE void ck_tile::impl::insert_dummy_dep_per_dword< 8 > (array< float, 8 > &b)
template<>
CK_TILE_DEVICE void ck_tile::impl::insert_dummy_dep_per_dword< 16 > (array< float, 16 > &b)
template<>
CK_TILE_DEVICE void ck_tile::impl::insert_dummy_dep_per_dword< 32 > (array< float, 32 > &b)
CK_TILE_DEVICE void ck_tile::impl::insert_dummy_dep ()
template<typename T>
CK_TILE_DEVICE void ck_tile::impl::insert_dummy_dep (T &buffer)
template<typename Tx, typename... Ty>
CK_TILE_DEVICE void ck_tile::impl::insert_dummy_dep (Tx &bx, Ty &... by)
template<typename... T>
CK_TILE_DEVICE void ck_tile::buffer_load_fence (index_t cnt=0, T &... o)
CK_TILE_DEVICE void ck_tile::buffer_store_fence (index_t cnt=0)
CK_TILE_DEVICE auto ck_tile::async_load_fence_raw (index_t cnt=0)
CK_TILE_DEVICE_EXTERN int8_t ck_tile::llvm_amdgcn_raw_buffer_load_i8 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8")
CK_TILE_DEVICE_EXTERN int8x2_t ck_tile::llvm_amdgcn_raw_buffer_load_i8x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8")
CK_TILE_DEVICE_EXTERN int8x4_t ck_tile::llvm_amdgcn_raw_buffer_load_i8x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8")
CK_TILE_DEVICE_EXTERN int16_t ck_tile::llvm_amdgcn_raw_buffer_load_i16 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16")
CK_TILE_DEVICE_EXTERN int16x2_t ck_tile::llvm_amdgcn_raw_buffer_load_i16x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16")
CK_TILE_DEVICE_EXTERN int16x4_t ck_tile::llvm_amdgcn_raw_buffer_load_i16x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16")
CK_TILE_DEVICE_EXTERN int32_t ck_tile::llvm_amdgcn_raw_buffer_load_i32 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32")
CK_TILE_DEVICE_EXTERN int32x2_t ck_tile::llvm_amdgcn_raw_buffer_load_i32x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32")
CK_TILE_DEVICE_EXTERN int32x4_t ck_tile::llvm_amdgcn_raw_buffer_load_i32x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32")
CK_TILE_DEVICE_EXTERN _Float16 ck_tile::llvm_amdgcn_raw_buffer_load_fp16 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16")
CK_TILE_DEVICE_EXTERN fp16x2_t ck_tile::llvm_amdgcn_raw_buffer_load_fp16x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16")
CK_TILE_DEVICE_EXTERN fp16x4_t ck_tile::llvm_amdgcn_raw_buffer_load_fp16x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16")
CK_TILE_DEVICE_EXTERN float ck_tile::llvm_amdgcn_raw_buffer_load_fp32 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32")
CK_TILE_DEVICE_EXTERN fp32x2_t ck_tile::llvm_amdgcn_raw_buffer_load_fp32x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32")
CK_TILE_DEVICE_EXTERN fp32x4_t ck_tile::llvm_amdgcn_raw_buffer_load_fp32x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32")
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i8 (int8_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8")
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i8x2 (int8x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8")
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i8x4 (int8x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8")
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i16 (int16_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16")
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i16x2 (int16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16")
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i16x4 (int16x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16")
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i32 (int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32")
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_ui16 (uint16_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16")
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_ui16x2 (uint16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16")
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_ui16x4 (uint16x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16")
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i32x2 (int32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32")
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_i32x4 (int32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32")
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_fp16 (_Float16 vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16")
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_fp16x2 (fp16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16")
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_fp16x4 (fp16x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16")
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_fp32 (float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32")
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_fp32x2 (fp32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32")
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_store_fp32x4 (fp32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32")
CK_TILE_DEVICE_EXTERN fp16x2_t ck_tile::llvm_amdgcn_raw_buffer_atomic_add_fp16x2 (fp16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16")
CK_TILE_DEVICE_EXTERN bf16x2_t ck_tile::llvm_amdgcn_raw_buffer_atomic_add_bf16x2 (bf16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16")
CK_TILE_DEVICE_EXTERN int32_t ck_tile::llvm_amdgcn_raw_buffer_atomic_add_i32 (int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32")
CK_TILE_DEVICE_EXTERN float ck_tile::llvm_amdgcn_raw_buffer_atomic_add_fp32 (float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32")
CK_TILE_DEVICE_EXTERN double ck_tile::llvm_amdgcn_raw_buffer_atomic_max_fp64 (double vdata, int32x4_t rsrc, int voffset, int soffset, int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64")
CK_TILE_DEVICE_EXTERN void ck_tile::llvm_amdgcn_raw_buffer_load_lds (int32x4_t rsrc, as3_uint32_ptr lds_ptr, index_t size, index_t voffset, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds")
template<unsigned num_dwords, bool pre_nop = false>
CK_TILE_DEVICE void ck_tile::async_buffer_load_dwordxn_v (void *smem, int32x4_t rsrc, index_t voffset, index_t, index_t ioffset, index_t=0, bool_constant< pre_nop >={})
CK_TILE_DEVICE void ck_tile::async_buffer_load_fence (index_t cnt=0)
template<index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE thread_buffer< int8_t, N > ck_tile::amd_buffer_load_impl_with_bytes (int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE thread_buffer< T, N > ck_tile::amd_buffer_load_impl (int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE void ck_tile::amd_buffer_load_raw_impl (thread_buffer< T, N > &dst, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, index_t src_linear_addr_offset, index_t flag=0, bool_constant< pre_nop >={})
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool pre_nop = false>
CK_TILE_DEVICE void ck_tile::amd_async_buffer_load_impl (CK_TILE_LDS_ADDR T *smem, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, index_t src_immediate_addr_offset=0, bool_constant< pre_nop >={})
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE void ck_tile::amd_async_buffer_load (CK_TILE_LDS_ADDR T *smem, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, index_t src_immediate_addr_offset=0, index_t flag=0, bool_constant< oob_conditional_check >={})
template<index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE void ck_tile::amd_buffer_store_impl_with_bytes (const thread_buffer< int8_t, N > src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE void ck_tile::amd_buffer_store_impl (const thread_buffer< T, N > src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE void ck_tile::amd_buffer_store_raw_impl (const thread_buffer< T, N > &dst_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset, index_t dst_linear_addr_offset, index_t is_valid_element=1)
template<typename T, index_t N>
CK_TILE_DEVICE void ck_tile::amd_buffer_atomic_add_impl (const thread_buffer< T, N > &src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
template<typename T, index_t N>
CK_TILE_DEVICE void ck_tile::amd_buffer_atomic_max_impl (const thread_buffer< T, N > src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE thread_buffer< T, N > ck_tile::amd_buffer_load_invalid_element_return_zero (const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size)
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE thread_buffer< T, N > ck_tile::amd_buffer_load_invalid_element_return_customized_value (const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value)
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE void ck_tile::amd_buffer_load_raw (thread_buffer< T, N > &dst, const T *p_src_wave, index_t src_thread_element_offset, index_t src_linear_element_offset, index_t src_element_space_size, index_t is_valid_element=0, bool_constant< pre_nop >={})
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE void ck_tile::amd_buffer_load_raw (thread_buffer< T, N > &dst, const int32x4_t src_wave_buffer_resource, index_t src_thread_element_offset, index_t src_linear_element_offset, index_t is_valid_element=0, bool_constant< pre_nop >={})
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool pre_nop = false>
CK_TILE_DEVICE void ck_tile::amd_async_buffer_load_with_oob_raw (T *smem, const T *p_src_wave, index_t src_thread_element_offset, index_t src_linear_element_offset, index_t src_element_space_size, bool_constant< pre_nop >={})
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool pre_nop = false>
CK_TILE_DEVICE void ck_tile::amd_async_buffer_load_with_oob_raw (T *smem, const int32x4_t src_wave_buffer_resource, index_t src_thread_element_offset, index_t src_linear_element_offset, bool_constant< pre_nop >={})
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = false>
CK_TILE_DEVICE void ck_tile::amd_async_buffer_load_with_oob (CK_TILE_LDS_ADDR T *smem, const int32x4_t src_wave_buffer_resource, index_t src_thread_element_offset, index_t src_linear_element_offset, bool is_valid_element, bool_constant< oob_conditional_check >={})
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE void ck_tile::amd_buffer_store (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true>
CK_TILE_DEVICE void ck_tile::amd_buffer_store_raw (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const index_t dst_linear_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
template<typename T, index_t N>
CK_TILE_DEVICE void ck_tile::amd_buffer_atomic_add (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
template<typename T, index_t N, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE void ck_tile::amd_buffer_atomic_add_raw (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const index_t dst_linear_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size, bool_constant< pre_nop >={})
template<typename T, index_t N>
CK_TILE_DEVICE void ck_tile::amd_buffer_atomic_max (const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)

Macro Definition Documentation

◆ BUFFER_LOAD_USE_INLINEASM

#define BUFFER_LOAD_USE_INLINEASM   0

◆ CK_TILE_ASYNC_LOAD_WITH_INSTR

#define CK_TILE_ASYNC_LOAD_WITH_INSTR ( instr)
Value:
if constexpr(pre_nop) \
asm volatile("s_nop 4\n" instr " %1, %2, 0 offen offset:%3 lds" \
: "=r"(smem) /*dummy dependency for smem*/ \
: "v"(voffset), "s"(rsrc), "n"(ioffset) \
: "memory"); \
else \
asm volatile(instr " %1, %2, 0 offen offset:%3 lds" \
: "=r"(smem) /*dummy dependency for smem*/ \
: "v"(voffset), "s"(rsrc), "n"(ioffset) \
: "memory");

◆ HAS_RAW_BUFFER_BUILTINS

#define HAS_RAW_BUFFER_BUILTINS
Value:
__has_builtin(__builtin_amdgcn_raw_buffer_load_b32) && \
__has_builtin(__builtin_amdgcn_make_buffer_rsrc) && \
__has_builtin(__builtin_amdgcn_raw_buffer_store_b32)

◆ LIKELY

#define LIKELY ( x)
Value:
(__builtin_expect(!!(x), 1))

Typedef Documentation

◆ as3_uint32_ptr