vllm.v1.attention.backends.mla.rocm_aiter_mla ¶

_copy_page_indices_kernel ¶

_copy_page_indices_kernel(
    page_indices,
    block_table,
    block_table_stride,
    cu_num_blocks,
    BLOCK_SIZE: constexpr,
)

Copy block table rows into a flat page_indices buffer using indptr. Avoids blocking boolean mask indexing (tensor[mask]) which has data-dependent output size and forces sync. This is the same kernel as introduced in backends/flashinfer.py.

Source code in vllm/v1/attention/backends/mla/rocm_aiter_mla.py

@triton.jit
def _copy_page_indices_kernel(
    page_indices,
    block_table,
    block_table_stride,
    cu_num_blocks,
    BLOCK_SIZE: tl.constexpr,
):
    """Copy block table rows into a flat page_indices buffer using indptr.
    Avoids blocking boolean mask indexing (tensor[mask]) which has
    data-dependent output size and forces sync.
    This is the same kernel as introduced in backends/flashinfer.py.
    """
    req_idx = tl.program_id(0)
    row_ptr = block_table + req_idx * block_table_stride
    start_idx = tl.load(cu_num_blocks + req_idx)
    end_idx = tl.load(cu_num_blocks + req_idx + 1)
    num_blocks = end_idx - start_idx

    offset = tl.arange(0, BLOCK_SIZE)
    for i in tl.range(0, num_blocks, BLOCK_SIZE):
        block_ids = tl.load(row_ptr + i + offset, mask=i + offset < num_blocks)
        tl.store(
            page_indices + start_idx + i + offset,
            block_ids,
            mask=i + offset < num_blocks,
        )