Vortex
======

Installation
------------

Clone the repository with its submodules, install the bundled SGLang, then
install ``vortex_torch`` itself:

.. code-block:: bash

   git clone -b v1 --recursive https://github.com/Infini-AI-Lab/vortex_torch.git
   cd vortex_torch
   cd third_party/sglang
   bash install.sh
   cd ../../
   pip install -e .

Quick Example
-------------

Define a custom sparse-attention flow by subclassing ``vFlow`` and registering
it under a name:

.. code-block:: python

   import torch
   from typing import Dict

   # Vortex primitives; exact import paths are assumed from the modules
   # listed under API Reference (vortex_torch.flow / .indexer / .cache).
   from vortex_torch.flow import vFlow, register, ContextBase
   from vortex_torch.indexer import GeMV, topK
   from vortex_torch.cache import CMean

   @register("custom_sparse_attention")
   class CustomSparseAttention(vFlow):
       def __init__(self):
           super().__init__()
           # Indexer-side ops
           self.gemv = GeMV()
           self.output_func = topK()
           # Cache-side ops
           self.reduction = CMean(dim=1)

       def forward_indexer(
           self,
           q: torch.Tensor,                 # viewed as [1, H_q, D]
           o: torch.Tensor,
           cache: Dict[str, torch.Tensor],  # viewed as [S, r, c] depending on create_cache()
           ctx: ContextBase,
       ):
           q_mean = q.mean(dim=1, keepdim=True)
           score = self.gemv(q_mean, cache["centroids"], ctx=ctx)
           self.output_func(score, o, ctx=ctx)

       def forward_cache(
           self,
           cache: Dict[str, torch.Tensor],  # viewed as [B, r, c] depending on create_cache()
           loc: torch.Tensor,
           ctx: ContextBase,
       ):
           # Computation is triggered only when a page is finished.
           self.reduction(cache["k"], cache["centroids"], loc=loc, ctx=ctx)

       def create_cache(self, page_size: int, head_dim: int):
           return {
               "centroids": (1, head_dim),
           }

Then launch an SGLang engine with Vortex sparsity enabled, pointing it at the
registered flow:

.. code-block:: python

   import sglang as sgl

   llm = sgl.Engine(
       model_path="Qwen/Qwen3-0.6B",
       disable_cuda_graph=False,
       page_size=16,
       vortex_topk_val=30,
       disable_overlap_schedule=True,      # mandatory
       attention_backend="flashinfer",     # mandatory
       enable_vortex_sparsity=True,        # otherwise full attention is computed
       vortex_page_reserved_bos=1,
       vortex_page_reserved_eos=1,
       vortex_layers_skip=list(range(1)),  # full attention for layer 0
       vortex_module_path="path/to/custom_sparse_attention.py",  # if not specified, Vortex searches vortex_torch.flow.algorithms
       vortex_module_name="custom_sparse_attention",
       vortex_max_seq_lens=8192,
       mem_fraction_static=0.6,
   )

A minimal end-to-end generation sketch follows the API reference below.

API Reference
-------------

.. autosummary::
   :toctree: api
   :recursive:

   vortex_torch.indexer
   vortex_torch.cache
   vortex_torch.flow
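Running Generation
------------------

A minimal sketch of driving the engine configured in the Quick Example end to
end. ``generate`` and ``shutdown`` are standard ``sglang.Engine`` methods; the
prompt and sampling parameters below are illustrative assumptions, not values
from the Vortex documentation.

.. code-block:: python

   # Assumes `llm` is the sgl.Engine instance built in the Quick Example.
   # The prompt and sampling parameters are illustrative only.
   prompts = ["Explain sparse attention in one paragraph."]
   sampling_params = {"temperature": 0.7, "max_new_tokens": 128}

   # Offline batch generation; each output is a dict with a "text" field.
   outputs = llm.generate(prompts, sampling_params)
   for prompt, output in zip(prompts, outputs):
       print(prompt, "->", output["text"])

   # Release engine resources when done.
   llm.shutdown()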