Kernels
danieldk (HF Staff) committed
Commit 6c0af68 · verified · 1 Parent(s): 1731a09

Build uploaded using `kernels`.

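For reference, builds uploaded this way are meant to be fetched back through the `kernels` package rather than compiled locally. A minimal sketch of loading one of these builds from the Hub, assuming the repository id `kernels-community/flash-attn3` (the exact repo id is not stated in this commit):

```python
# Sketch only: fetch a pre-built flash-attn3 kernel with the `kernels` package.
# The repository id is an assumption; substitute the repo this commit belongs to.
import torch
from kernels import get_kernel

# Downloads and loads the variant matching the local torch/CUDA/ABI,
# e.g. build/torch29-cxx11-cu128-x86_64-linux from this commit.
flash_attn3 = get_kernel("kernels-community/flash-attn3")

q = torch.randn(1, 128, 8, 128, dtype=torch.bfloat16, device="cuda")
k = torch.randn(1, 128, 8, 128, dtype=torch.bfloat16, device="cuda")
v = torch.randn(1, 128, 8, 128, dtype=torch.bfloat16, device="cuda")

out = flash_attn3.flash_attn_func(q, k, v, causal=True)
```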
build/torch29-cxx11-cu128-x86_64-linux/{_flash_attn3_1d39a44.abi3.so → _flash_attn3_7e34105_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:8ae6550e9ab9efd6ab826e06037a9140a0f0e5e804da695dff4cb532480faae7
- size 779759280
+ oid sha256:64416ac848e1977a192ae0a8d031036b7f97812be5b7d9d7f5a55a341892c13b
+ size 804210888
build/torch29-cxx11-cu128-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
  import torch
- from . import _flash_attn3_1d39a44
- ops = torch.ops._flash_attn3_1d39a44
+ from . import _flash_attn3_7e34105_dirty
+ ops = torch.ops._flash_attn3_7e34105_dirty

  def add_op_namespace_prefix(op_name: str):
      """
      Prefix op by namespace.
      """
-     return f"_flash_attn3_1d39a44::{op_name}"
+     return f"_flash_attn3_7e34105_dirty::{op_name}"
build/torch29-cxx11-cu128-x86_64-linux/flash_attn_interface.py CHANGED
@@ -1,50 +1,79 @@
1
  # Copyright (c) 2023, Tri Dao.
2
 
3
- from typing import Optional, Union
4
 
5
  import torch
6
  import torch.nn as nn
7
 
8
  from ._ops import ops as flash_attn_3_cuda
 
9
 
10
  def maybe_contiguous(x):
11
  return x.contiguous() if x is not None and x.stride(-1) != 1 else x
12
 
13
14
  def _flash_attn_forward(
15
- q,
16
- k,
17
- v,
18
- k_new,
19
- v_new,
20
- qv,
21
- out,
22
- cu_seqlens_q,
23
- cu_seqlens_k,
24
- cu_seqlens_k_new,
25
- seqused_q,
26
- seqused_k,
27
- max_seqlen_q,
28
- max_seqlen_k,
29
- page_table,
30
- kv_batch_idx,
31
- leftpad_k,
32
- rotary_cos,
33
- rotary_sin,
34
- seqlens_rotary,
35
- q_descale,
36
- k_descale,
37
- v_descale,
38
- softmax_scale,
39
- causal,
40
- window_size=(-1, -1),
41
- attention_chunk=0,
42
- softcap=0.0,
43
- rotary_interleaved=True,
44
- scheduler_metadata=None,
45
- num_splits=1,
46
- pack_gqa=None,
47
- sm_margin=0):
 
 
48
  q, k, k_new, v_new = [maybe_contiguous(x) for x in (q, k, k_new, v_new)]
49
  v = v.contiguous() if v.stride(-1) != 1 and v.stride(-3) != 1 else v
50
  cu_seqlens_q, cu_seqlens_k, cu_seqlens_k_new = [
@@ -56,14 +85,14 @@ def _flash_attn_forward(
56
  ]
57
  rotary_cos, rotary_sin = [maybe_contiguous(x) for x in (rotary_cos, rotary_sin)]
58
  seqlens_rotary = maybe_contiguous(seqlens_rotary)
59
- out, softmax_lse, *rest = flash_attn_3_cuda.fwd(
60
  q,
61
  k,
62
  v,
63
  k_new,
64
  v_new,
65
  qv,
66
- out,
67
  cu_seqlens_q,
68
  cu_seqlens_k,
69
  cu_seqlens_k_new,
@@ -82,8 +111,8 @@ def _flash_attn_forward(
82
  v_descale,
83
  softmax_scale,
84
  causal,
85
- window_size[0],
86
- window_size[1],
87
  attention_chunk,
88
  softcap,
89
  rotary_interleaved,
@@ -92,59 +121,314 @@ def _flash_attn_forward(
92
  pack_gqa,
93
  sm_margin,
94
  )
95
- return out, softmax_lse, *rest
96
 
97
 
98
  def _flash_attn_backward(
99
  dout,
100
  q,
101
  k,
102
  v,
103
  out,
104
  softmax_lse,
105
  cu_seqlens_q,
106
  cu_seqlens_k,
107
  sequed_q,
108
  sequed_k,
109
  max_seqlen_q,
110
  max_seqlen_k,
111
- dq,
112
- dk,
113
- dv,
114
  softmax_scale,
115
- causal,
116
- window_size=(-1, -1),
117
- softcap=0.0,
118
- deterministic=False,
119
- sm_margin=0,
120
- ):
121
- # dq, dk, dv are allocated by us so they should already be contiguous
122
- dout, q, k, v, out = [maybe_contiguous(x) for x in (dout, q, k, v, out)]
123
- dq, dk, dv, softmax_d, *rest = flash_attn_3_cuda.bwd(
124
  dout,
125
  q,
126
  k,
127
  v,
128
  out,
129
  softmax_lse,
130
  dq,
131
  dk,
132
  dv,
133
- cu_seqlens_q,
134
- cu_seqlens_k,
135
- sequed_q,
136
- sequed_k,
137
- max_seqlen_q,
138
- max_seqlen_k,
139
- softmax_scale,
140
- causal,
141
- window_size[0],
142
- window_size[1],
143
- softcap,
144
- deterministic,
145
- sm_margin,
146
  )
147
- return dq, dk, dv, softmax_d
148
 
149
 
150
  class FlashAttnQKVPackedFunc(torch.autograd.Function):
@@ -161,6 +445,7 @@ class FlashAttnQKVPackedFunc(torch.autograd.Function):
161
  deterministic=False,
162
  num_heads_q=None,
163
  sm_margin=0,
 
164
  ):
165
  if softmax_scale is None:
166
  softmax_scale = qkv.shape[-1] ** (-0.5)
@@ -188,7 +473,8 @@ class FlashAttnQKVPackedFunc(torch.autograd.Function):
188
  q_descale, k_descale, v_descale,
189
  softmax_scale,
190
  causal=causal,
191
- window_size=window_size,
 
192
  attention_chunk=attention_chunk,
193
  softcap=softcap,
194
  sm_margin=sm_margin,
@@ -203,8 +489,7 @@ class FlashAttnQKVPackedFunc(torch.autograd.Function):
203
  ctx.deterministic = deterministic
204
  ctx.ndim = qkv.dim()
205
  ctx.sm_margin = sm_margin
206
- # return out, softmax_lse
207
- return out
208
 
209
  @staticmethod
210
  def backward(ctx, dout, *args):
@@ -235,13 +520,14 @@ class FlashAttnQKVPackedFunc(torch.autograd.Function):
235
  dv,
236
  ctx.softmax_scale,
237
  ctx.causal,
238
- ctx.window_size,
 
239
  ctx.softcap,
240
  ctx.deterministic,
241
  ctx.sm_margin,
242
  )
243
  dqkv = dqkv[..., : dout.shape[-1]] # We could have padded the head dimension
244
- return dqkv, None, None, None, None, None, None, None, None, None, None, None
245
 
246
 
247
  class FlashAttnFunc(torch.autograd.Function):
@@ -263,6 +549,7 @@ class FlashAttnFunc(torch.autograd.Function):
263
  pack_gqa=None,
264
  deterministic=False,
265
  sm_margin=0,
 
266
  ):
267
  if softmax_scale is None:
268
  softmax_scale = (q.shape[-1] + (qv.shape[-1] if qv is not None else 0)) ** (-0.5)
@@ -282,7 +569,8 @@ class FlashAttnFunc(torch.autograd.Function):
282
  q_descale, k_descale, v_descale,
283
  softmax_scale,
284
  causal=causal,
285
- window_size=window_size,
 
286
  attention_chunk=attention_chunk,
287
  softcap=softcap,
288
  num_splits=num_splits,
@@ -298,7 +586,7 @@ class FlashAttnFunc(torch.autograd.Function):
298
  ctx.softcap = softcap
299
  ctx.deterministic = deterministic
300
  ctx.sm_margin = sm_margin
301
- return out, softmax_lse
302
 
303
  @staticmethod
304
  def backward(ctx, dout, *args):
@@ -320,7 +608,8 @@ class FlashAttnFunc(torch.autograd.Function):
320
  dv,
321
  ctx.softmax_scale,
322
  ctx.causal,
323
- ctx.window_size,
 
324
  ctx.softcap,
325
  ctx.deterministic,
326
  ctx.sm_margin,
@@ -356,6 +645,7 @@ class FlashAttnVarlenFunc(torch.autograd.Function):
356
  pack_gqa=None,
357
  deterministic=False,
358
  sm_margin=0,
 
359
  ):
360
  if softmax_scale is None:
361
  softmax_scale = (q.shape[-1] + (qv.shape[-1] if qv is not None else 0)) ** (-0.5)
@@ -379,7 +669,8 @@ class FlashAttnVarlenFunc(torch.autograd.Function):
379
  q_descale, k_descale, v_descale,
380
  softmax_scale,
381
  causal=causal,
382
- window_size=window_size,
 
383
  attention_chunk=attention_chunk,
384
  softcap=softcap,
385
  num_splits=num_splits,
@@ -397,7 +688,7 @@ class FlashAttnVarlenFunc(torch.autograd.Function):
397
  ctx.softcap = softcap
398
  ctx.deterministic = deterministic
399
  ctx.sm_margin = sm_margin
400
- return out, softmax_lse
401
 
402
  @staticmethod
403
  def backward(ctx, dout, *args):
@@ -422,7 +713,8 @@ class FlashAttnVarlenFunc(torch.autograd.Function):
422
  dv,
423
  ctx.softmax_scale,
424
  ctx.causal,
425
- ctx.window_size,
 
426
  ctx.softcap,
427
  ctx.deterministic,
428
  ctx.sm_margin,
@@ -444,6 +736,7 @@ def flash_attn_qkvpacked_func(
444
  deterministic=False,
445
  num_heads_q=None,
446
  sm_margin=0,
 
447
  ):
448
  """dropout_p should be set to 0.0 during evaluation
449
  If Q, K, V are already stacked into 1 tensor, this function will be faster than
@@ -490,6 +783,7 @@ def flash_attn_qkvpacked_func(
490
  deterministic,
491
  num_heads_q,
492
  sm_margin,
 
493
  )
494
 
495
 
@@ -508,6 +802,7 @@ def flash_attn_func(
508
  pack_gqa=None,
509
  deterministic=False,
510
  sm_margin=0,
 
511
  ):
512
  """dropout_p should be set to 0.0 during evaluation
513
  Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads
@@ -569,6 +864,7 @@ def flash_attn_func(
569
  pack_gqa,
570
  deterministic,
571
  sm_margin,
 
572
  )
573
 
574
 
@@ -593,6 +889,7 @@ def flash_attn_varlen_func(
593
  pack_gqa=None,
594
  deterministic=False,
595
  sm_margin=0,
 
596
  ):
597
  return FlashAttnVarlenFunc.apply(
598
  q,
@@ -615,6 +912,7 @@ def flash_attn_varlen_func(
615
  pack_gqa,
616
  deterministic,
617
  sm_margin,
 
618
  )
619
 
620
 
@@ -700,7 +998,7 @@ def flash_attn_with_kvcache(
700
  q: (batch_size, seqlen, nheads, headdim)
701
  k_cache: (batch_size_cache, seqlen_cache, nheads_k, headdim) if there's no page_table,
702
  or (num_blocks, page_block_size, nheads_k, headdim) if there's a page_table (i.e. paged KV cache)
703
- page_block_size must be a multiple of 256.
704
  v_cache: (batch_size_cache, seqlen_cache, nheads_k, headdim_v) if there's no page_table,
705
  or (num_blocks, page_block_size, nheads_k, headdim_v) if there's a page_table (i.e. paged KV cache)
706
  k [optional]: (batch_size, seqlen_new, nheads_k, headdim). If not None, we concatenate
@@ -745,7 +1043,7 @@ def flash_attn_with_kvcache(
745
  softmax_scale = (q.shape[-1] + (qv.shape[-1] if qv is not None else 0)) ** (-0.5)
746
  if cache_seqlens is not None and isinstance(cache_seqlens, int):
747
  cache_seqlens = torch.full(
748
- (k_cache.shape[0],), cache_seqlens, dtype=torch.int32, device=k_cache.device
749
  )
750
  cache_seqlens = maybe_contiguous(cache_seqlens)
751
  out, softmax_lse, *rest = _flash_attn_forward(
@@ -772,7 +1070,8 @@ def flash_attn_with_kvcache(
772
  q_descale, k_descale, v_descale,
773
  softmax_scale,
774
  causal=causal,
775
- window_size=window_size,
 
776
  attention_chunk=attention_chunk,
777
  softcap=softcap,
778
  rotary_interleaved=rotary_interleaved,
 
1
  # Copyright (c) 2023, Tri Dao.
2
 
3
+ from typing import Optional, Union, List, Tuple
4
 
5
  import torch
6
  import torch.nn as nn
7
 
8
  from ._ops import ops as flash_attn_3_cuda
9
+ from ._ops import add_op_namespace_prefix
10
 
11
  def maybe_contiguous(x):
12
  return x.contiguous() if x is not None and x.stride(-1) != 1 else x
13
 
14
 
15
+ def round_multiple(x, m):
16
+ return (x + m - 1) // m * m
17
+
18
+
19
+ def round_up_headdim(head_size: int) -> int:
20
+ from .flash_attn_config import CONFIG
21
+
22
+ if not CONFIG["build_flags"]["FLASHATTENTION_DISABLE_HDIM64"]:
23
+ if head_size <= 64:
24
+ return 64
25
+ if not CONFIG["build_flags"]["FLASHATTENTION_DISABLE_HDIM96"]:
26
+ if head_size <= 96:
27
+ return 96
28
+ if not CONFIG["build_flags"]["FLASHATTENTION_DISABLE_HDIM128"]:
29
+ if head_size <= 128:
30
+ return 128
31
+ if not CONFIG["build_flags"]["FLASHATTENTION_DISABLE_HDIM192"]:
32
+ if head_size <= 192:
33
+ return 192
34
+ if not CONFIG["build_flags"]["FLASHATTENTION_DISABLE_HDIM256"]:
35
+ if head_size <= 256:
36
+ return 256
37
+ return 256
38
+
39
+
40
+ @torch.library.custom_op(add_op_namespace_prefix("_flash_attn_forward"), mutates_args=(), device_types="cuda")
41
  def _flash_attn_forward(
42
+ q: torch.Tensor,
43
+ k: torch.Tensor,
44
+ v: torch.Tensor,
45
+ k_new: Optional[torch.Tensor] = None,
46
+ v_new: Optional[torch.Tensor] = None,
47
+ qv: Optional[torch.Tensor] = None,
48
+ out_: Optional[torch.Tensor] = None,
49
+ cu_seqlens_q: Optional[torch.Tensor] = None,
50
+ cu_seqlens_k: Optional[torch.Tensor] = None,
51
+ cu_seqlens_k_new: Optional[torch.Tensor] = None,
52
+ seqused_q: Optional[torch.Tensor] = None,
53
+ seqused_k: Optional[torch.Tensor] = None,
54
+ max_seqlen_q: Optional[int] = None,
55
+ max_seqlen_k: Optional[int] = None,
56
+ page_table: Optional[torch.Tensor] = None,
57
+ kv_batch_idx: Optional[torch.Tensor] = None,
58
+ leftpad_k: Optional[torch.Tensor] = None,
59
+ rotary_cos: Optional[torch.Tensor] = None,
60
+ rotary_sin: Optional[torch.Tensor] = None,
61
+ seqlens_rotary: Optional[torch.Tensor] = None,
62
+ q_descale: Optional[torch.Tensor] = None,
63
+ k_descale: Optional[torch.Tensor] = None,
64
+ v_descale: Optional[torch.Tensor] = None,
65
+ softmax_scale: Optional[float] = None,
66
+ causal: bool = False,
67
+ window_size_left: int = -1,
68
+ window_size_right: int = -1,
69
+ attention_chunk: int = 0,
70
+ softcap: float = 0.0,
71
+ rotary_interleaved: bool = True,
72
+ scheduler_metadata: Optional[torch.Tensor] = None,
73
+ num_splits: int = 1,
74
+ pack_gqa: Optional[bool] = None,
75
+ sm_margin: int = 0,
76
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
77
  q, k, k_new, v_new = [maybe_contiguous(x) for x in (q, k, k_new, v_new)]
78
  v = v.contiguous() if v.stride(-1) != 1 and v.stride(-3) != 1 else v
79
  cu_seqlens_q, cu_seqlens_k, cu_seqlens_k_new = [
 
85
  ]
86
  rotary_cos, rotary_sin = [maybe_contiguous(x) for x in (rotary_cos, rotary_sin)]
87
  seqlens_rotary = maybe_contiguous(seqlens_rotary)
88
+ out, softmax_lse, out_accum, softmax_lse_accum = flash_attn_3_cuda.fwd(
89
  q,
90
  k,
91
  v,
92
  k_new,
93
  v_new,
94
  qv,
95
+ out_,
96
  cu_seqlens_q,
97
  cu_seqlens_k,
98
  cu_seqlens_k_new,
 
111
  v_descale,
112
  softmax_scale,
113
  causal,
114
+ window_size_left,
115
+ window_size_right,
116
  attention_chunk,
117
  softcap,
118
  rotary_interleaved,
 
121
  pack_gqa,
122
  sm_margin,
123
  )
 
124
 
125
+ if out_accum is None:
126
+ out_accum = torch.tensor([], device=out.device)
127
+
128
+ if softmax_lse_accum is None:
129
+ softmax_lse_accum = torch.tensor([], device=out.device)
130
+
131
+ return out, softmax_lse, out_accum, softmax_lse_accum
132
+
133
+
134
+ @torch.library.register_fake(add_op_namespace_prefix("_flash_attn_forward"))
135
+ def _flash_attn_forward_fake(
136
+ q: torch.Tensor,
137
+ k: torch.Tensor,
138
+ v: torch.Tensor,
139
+ k_new: Optional[torch.Tensor] = None,
140
+ v_new: Optional[torch.Tensor] = None,
141
+ qv: Optional[torch.Tensor] = None,
142
+ out_: Optional[torch.Tensor] = None,
143
+ cu_seqlens_q: Optional[torch.Tensor] = None,
144
+ cu_seqlens_k: Optional[torch.Tensor] = None,
145
+ cu_seqlens_k_new: Optional[torch.Tensor] = None,
146
+ seqused_q: Optional[torch.Tensor] = None,
147
+ seqused_k: Optional[torch.Tensor] = None,
148
+ max_seqlen_q: Optional[int] = None,
149
+ max_seqlen_k: Optional[int] = None,
150
+ page_table: Optional[torch.Tensor] = None,
151
+ kv_batch_idx: Optional[torch.Tensor] = None,
152
+ leftpad_k: Optional[torch.Tensor] = None,
153
+ rotary_cos: Optional[torch.Tensor] = None,
154
+ rotary_sin: Optional[torch.Tensor] = None,
155
+ seqlens_rotary: Optional[torch.Tensor] = None,
156
+ q_descale: Optional[torch.Tensor] = None,
157
+ k_descale: Optional[torch.Tensor] = None,
158
+ v_descale: Optional[torch.Tensor] = None,
159
+ softmax_scale: Optional[float] = None,
160
+ causal: bool = False,
161
+ window_size_left: int = -1,
162
+ window_size_right: int = -1,
163
+ attention_chunk: int = 0,
164
+ softcap: float = 0.0,
165
+ rotary_interleaved: bool = True,
166
+ scheduler_metadata: Optional[torch.Tensor] = None,
167
+ num_splits: int = 1,
168
+ pack_gqa: Optional[bool] = None,
169
+ sm_margin: int = 0,
170
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
171
+ """
172
+ Symbolic fake implementation of flash attention forward.
173
+ Returns tensors with the correct shapes and dtypes without actual computation.
174
+ """
175
+
176
+ # Determine if we're in varlen mode
177
+ is_varlen_q = cu_seqlens_q is not None
178
 
179
+ # Get dimensions from query tensor
180
+ if is_varlen_q:
181
+ # varlen mode: q is (total_q, num_heads, head_size)
182
+ total_q, num_heads, head_size = q.shape
183
+ batch_size = cu_seqlens_q.shape[0] - 1
184
+
185
+ if max_seqlen_q is None:
186
+ raise ValueError("max_seqlen_q must be provided if cu_seqlens_q is provided")
187
+ seqlen_q = max_seqlen_q
188
+ else:
189
+ # batch mode: q is (batch_size, seqlen_q, num_heads, head_size)
190
+ batch_size, seqlen_q, num_heads, head_size = q.shape
191
+ total_q = batch_size * q.shape[1]
192
+ # Get value head dimension
193
+ head_size_v = v.shape[-1]
194
+
195
+ # Determine output dtype (FP8 inputs produce BF16 outputs)
196
+ q_type = q.dtype
197
+ if q_type == torch.float8_e4m3fn:
198
+ out_dtype = torch.bfloat16
199
+ else:
200
+ out_dtype = q_type
201
+
202
+ # Create output tensor
203
+ if out_ is not None:
204
+ # If out_ is provided, _flash_attn_forward becomes non-functional
205
+ raise TypeError("Tracing (torch.compile/torch.export) with pre-allocated output tensor is not supported.")
206
+
207
+ if is_varlen_q:
208
+ out = torch.empty((total_q, num_heads, head_size_v), dtype=out_dtype, device=q.device)
209
+ else:
210
+ out = torch.empty((batch_size, seqlen_q, num_heads, head_size_v), dtype=out_dtype, device=q.device)
211
+
212
+ # Create softmax_lse tensor
213
+ if is_varlen_q:
214
+ softmax_lse = torch.empty((num_heads, total_q), dtype=torch.float32, device=q.device)
215
+ else:
216
+ softmax_lse = torch.empty((batch_size, num_heads, seqlen_q), dtype=torch.float32, device=q.device)
217
+
218
+ # TODO(guilhermeleobas): Implement "get_num_splits"
219
+ # There's an heuristic to compute num_splits when "num_splits <= 0"
220
+ # assert that num_splits is > 0 for now
221
+ if num_splits <= 0:
222
+ raise ValueError(f"tracing (torch.compile/torch.export) with num_splits <= 0 not supported. Got {num_splits=}")
223
+
224
+ if num_splits > 1:
225
+ if is_varlen_q:
226
+ out_accum = torch.empty((num_splits, num_heads, total_q, head_size_v), dtype=torch.float32, device=q.device)
227
+ softmax_lse_accum = torch.empty((num_splits, num_heads, total_q), dtype=torch.float32, device=q.device)
228
+ else:
229
+ out_accum = torch.empty((num_splits, batch_size, num_heads, seqlen_q, head_size_v), dtype=torch.float32, device=q.device)
230
+ softmax_lse_accum = torch.empty((num_splits, batch_size, num_heads, seqlen_q), dtype=torch.float32, device=q.device)
231
+ else:
232
+ # Tensors are not set when num_splits < 1
233
+ out_accum = torch.tensor([], device=out.device)
234
+ softmax_lse_accum = torch.tensor([], device=out.device)
235
+
236
+ return out, softmax_lse, out_accum, softmax_lse_accum
237
+
238
+
239
+ @torch.library.custom_op(add_op_namespace_prefix("_flash_attn_backward"), mutates_args=("dq", "dk", "dv"), device_types="cuda")
240
  def _flash_attn_backward(
241
+ dout: torch.Tensor,
242
+ q: torch.Tensor,
243
+ k: torch.Tensor,
244
+ v: torch.Tensor,
245
+ out: torch.Tensor,
246
+ softmax_lse: torch.Tensor,
247
+ cu_seqlens_q: Optional[torch.Tensor] = None,
248
+ cu_seqlens_k: Optional[torch.Tensor] = None,
249
+ sequed_q: Optional[torch.Tensor] = None,
250
+ sequed_k: Optional[torch.Tensor] = None,
251
+ max_seqlen_q: Optional[int] = None,
252
+ max_seqlen_k: Optional[int] = None,
253
+ dq: Optional[torch.Tensor] = None,
254
+ dk: Optional[torch.Tensor] = None,
255
+ dv: Optional[torch.Tensor] = None,
256
+ softmax_scale: Optional[float] = None,
257
+ is_causal: bool = False,
258
+ window_size_left: int = -1,
259
+ window_size_right: int = -1,
260
+ softcap: float = 0.0,
261
+ deterministic: bool = False,
262
+ sm_margin: int = 0,
263
+ ) -> torch.Tensor:
264
+ # dq, dk, dv are allocated by us so they should already be contiguous
265
+ dout, q, k, v, out = [maybe_contiguous(x) for x in (dout, q, k, v, out)]
266
+ softmax_d, *rest = flash_attn_3_cuda.bwd(
267
  dout,
268
  q,
269
  k,
270
  v,
271
  out,
272
  softmax_lse,
273
+ dq,
274
+ dk,
275
+ dv,
276
  cu_seqlens_q,
277
  cu_seqlens_k,
278
  sequed_q,
279
  sequed_k,
280
  max_seqlen_q,
281
  max_seqlen_k,
282
  softmax_scale,
283
+ is_causal,
284
+ window_size_left,
285
+ window_size_right,
286
+ softcap,
287
+ deterministic,
288
+ sm_margin,
289
+ )
290
+ return softmax_d
291
+
292
+
293
+ @torch.library.register_fake(add_op_namespace_prefix("_flash_attn_backward"))
294
+ def _flash_attn_backward_fake(
295
+ dout: torch.Tensor,
296
+ q: torch.Tensor,
297
+ k: torch.Tensor,
298
+ v: torch.Tensor,
299
+ out: torch.Tensor,
300
+ softmax_lse: torch.Tensor,
301
+ cu_seqlens_q: Optional[torch.Tensor] = None,
302
+ cu_seqlens_k: Optional[torch.Tensor] = None,
303
+ sequed_q: Optional[torch.Tensor] = None,
304
+ sequed_k: Optional[torch.Tensor] = None,
305
+ max_seqlen_q: Optional[int] = None,
306
+ max_seqlen_k: Optional[int] = None,
307
+ dq: Optional[torch.Tensor] = None,
308
+ dk: Optional[torch.Tensor] = None,
309
+ dv: Optional[torch.Tensor] = None,
310
+ softmax_scale: Optional[float] = None,
311
+ is_causal: bool = False,
312
+ window_size_left: int = -1,
313
+ window_size_right: int = -1,
314
+ softcap: float = 0.0,
315
+ deterministic: bool = False,
316
+ sm_margin: int = 0,
317
+ ) -> torch.Tensor:
318
+
319
+ is_varlen_q = cu_seqlens_q is not None
320
+ is_varlen_k = cu_seqlens_q is not None
321
+ is_varlen = is_varlen_q or is_varlen_k or sequed_q is not None or sequed_k is not None
322
+
323
+ if not is_varlen_q:
324
+ batch_size = q.size(0)
325
+ seqlen_q = q.size(1)
326
+ seqlen_k = k.size(1)
327
+ total_q = batch_size * q.size(1)
328
+ else:
329
+ batch_size = cu_seqlens_q.size(0) - 1
330
+ total_q = q.size(0)
331
+ seqlen_q = max_seqlen_q
332
+ seqlen_k = max_seqlen_k
333
+
334
+ if window_size_left >= seqlen_k - 1:
335
+ window_size_left = -1
336
+
337
+ if window_size_right >= seqlen_q - 1:
338
+ window_size_right = -1
339
+
340
+ if is_causal:
341
+ window_size_right = 0
342
+
343
+ is_causal = window_size_left < 0 and window_size_right == 0
344
+
345
+ head_size = q.size(-1)
346
+ head_size_v = v.size(-1)
347
+ head_size_rounded = round_up_headdim(max(head_size, head_size_v))
348
+
349
+ # Hopper gpus uses cuda compute capabilities 9.0
350
+ cap = torch.cuda.get_device_capability(q.device)
351
+ arch = cap[0] * 10 + cap[1]
352
+
353
+ is_local = (window_size_left >= 0 or window_size_right >= 0) and not is_causal
354
+
355
+ if head_size_rounded <= 64:
356
+ kBlockM_sm90 = 96 if (is_causal and softcap > 0.0) else 128
357
+ elif head_size_rounded <= 96:
358
+ kBlockM_sm90 = 64
359
+ elif head_size_rounded <= 128:
360
+ kBlockM_sm90 = 64 if (is_causal or is_local or softcap > 0.0) else 80
361
+ else:
362
+ kBlockM_sm90 = 64
363
+
364
+ kBlockM_sm80 = 128 if head_size_rounded <= 64 else 64
365
+ kBlockM_sm86 = 64 if head_size_rounded <= 192 else 32
366
+
367
+ if arch >= 90:
368
+ kBlockM = kBlockM_sm90
369
+ elif arch == 86 or arch == 89:
370
+ kBlockM = kBlockM_sm86
371
+ else:
372
+ kBlockM = kBlockM_sm80
373
+
374
+ num_heads = q.shape[-2]
375
+ seqlen_q_rounded = round_multiple(seqlen_q, kBlockM)
376
+
377
+ total_q_padded_rounded = round_multiple(total_q + batch_size * kBlockM, kBlockM)
378
+
379
+ dq = torch.empty_like(q) if dq is None else dq
380
+ dk = torch.empty_like(k) if dk is None else dk
381
+ dv = torch.empty_like(v) if dv is None else dv
382
+
383
+ if not is_varlen:
384
+ softmax_d = torch.empty((batch_size, num_heads, seqlen_q_rounded), dtype=torch.float32, device=q.device)
385
+ else:
386
+ softmax_d = torch.empty((num_heads, total_q_padded_rounded), dtype=torch.float32, device=q.device)
387
+
388
+ return softmax_d
389
+
390
+
391
+ def setup_context(ctx, inputs, output):
392
+ q, k, v = inputs[:3]
393
+ out, softmax_lse, _, _ = output
394
+ ctx.save_for_backward(q, k, v, out, softmax_lse)
395
+ ctx.softmax_scale = inputs[-11]
396
+ ctx.causal = inputs[-10]
397
+ ctx.window_size = [inputs[-9], inputs[-8]]
398
+ ctx.attention_chunk = inputs[-7]
399
+ ctx.softcap = inputs[-6]
400
+ ctx.sm_margin = inputs[-1]
401
+
402
+
403
+ def _backward(ctx, dout, *grads):
404
+ q, k, v, out, softmax_lse = ctx.saved_tensors
405
+ dq, dk, dv = torch.empty_like(q), torch.empty_like(k), torch.empty_like(v)
406
+ _flash_attn_backward(
407
  dout,
408
  q,
409
  k,
410
  v,
411
  out,
412
  softmax_lse,
413
+ None, None, # cu_seqlens_q, cu_seqlens_k,
414
+ None, None, # sequed_q, sequed_k,
415
+ None, None, # max_seqlen_q, max_seqlen_k,
416
  dq,
417
  dk,
418
  dv,
419
+ ctx.softmax_scale,
420
+ ctx.causal,
421
+ ctx.window_size[0],
422
+ ctx.window_size[1],
423
+ ctx.softcap,
424
+ False, # deterministic
425
+ ctx.sm_margin,
426
  )
427
+ return dq, dk, dv, *((None,) * 21)
428
+
429
+
430
+ _flash_attn_forward.register_autograd(_backward, setup_context=setup_context)
431
+
432
 
433
 
434
  class FlashAttnQKVPackedFunc(torch.autograd.Function):
 
445
  deterministic=False,
446
  num_heads_q=None,
447
  sm_margin=0,
448
+ return_softmax=False,
449
  ):
450
  if softmax_scale is None:
451
  softmax_scale = qkv.shape[-1] ** (-0.5)
 
473
  q_descale, k_descale, v_descale,
474
  softmax_scale,
475
  causal=causal,
476
+ window_size_left=window_size[0],
477
+ window_size_right=window_size[1],
478
  attention_chunk=attention_chunk,
479
  softcap=softcap,
480
  sm_margin=sm_margin,
 
489
  ctx.deterministic = deterministic
490
  ctx.ndim = qkv.dim()
491
  ctx.sm_margin = sm_margin
492
+ return (out, softmax_lse) if return_softmax else out
 
493
 
494
  @staticmethod
495
  def backward(ctx, dout, *args):
 
520
  dv,
521
  ctx.softmax_scale,
522
  ctx.causal,
523
+ ctx.window_size[0],
524
+ ctx.window_size[1],
525
  ctx.softcap,
526
  ctx.deterministic,
527
  ctx.sm_margin,
528
  )
529
  dqkv = dqkv[..., : dout.shape[-1]] # We could have padded the head dimension
530
+ return dqkv, None, None, None, None, None, None, None, None, None, None, None, None
531
 
532
 
533
  class FlashAttnFunc(torch.autograd.Function):
 
549
  pack_gqa=None,
550
  deterministic=False,
551
  sm_margin=0,
552
+ return_softmax=False,
553
  ):
554
  if softmax_scale is None:
555
  softmax_scale = (q.shape[-1] + (qv.shape[-1] if qv is not None else 0)) ** (-0.5)
 
569
  q_descale, k_descale, v_descale,
570
  softmax_scale,
571
  causal=causal,
572
+ window_size_left=window_size[0],
573
+ window_size_right=window_size[1],
574
  attention_chunk=attention_chunk,
575
  softcap=softcap,
576
  num_splits=num_splits,
 
586
  ctx.softcap = softcap
587
  ctx.deterministic = deterministic
588
  ctx.sm_margin = sm_margin
589
+ return (out, softmax_lse) if return_softmax else out
590
 
591
  @staticmethod
592
  def backward(ctx, dout, *args):
 
608
  dv,
609
  ctx.softmax_scale,
610
  ctx.causal,
611
+ ctx.window_size[0],
612
+ ctx.window_size[1],
613
  ctx.softcap,
614
  ctx.deterministic,
615
  ctx.sm_margin,
 
645
  pack_gqa=None,
646
  deterministic=False,
647
  sm_margin=0,
648
+ return_softmax=False,
649
  ):
650
  if softmax_scale is None:
651
  softmax_scale = (q.shape[-1] + (qv.shape[-1] if qv is not None else 0)) ** (-0.5)
 
669
  q_descale, k_descale, v_descale,
670
  softmax_scale,
671
  causal=causal,
672
+ window_size_left=window_size[0],
673
+ window_size_right=window_size[1],
674
  attention_chunk=attention_chunk,
675
  softcap=softcap,
676
  num_splits=num_splits,
 
688
  ctx.softcap = softcap
689
  ctx.deterministic = deterministic
690
  ctx.sm_margin = sm_margin
691
+ return (out, softmax_lse) if return_softmax else out
692
 
693
  @staticmethod
694
  def backward(ctx, dout, *args):
 
713
  dv,
714
  ctx.softmax_scale,
715
  ctx.causal,
716
+ ctx.window_size[0],
717
+ ctx.window_size[1],
718
  ctx.softcap,
719
  ctx.deterministic,
720
  ctx.sm_margin,
 
736
  deterministic=False,
737
  num_heads_q=None,
738
  sm_margin=0,
739
+ return_attn_probs=False,
740
  ):
741
  """dropout_p should be set to 0.0 during evaluation
742
  If Q, K, V are already stacked into 1 tensor, this function will be faster than
 
783
  deterministic,
784
  num_heads_q,
785
  sm_margin,
786
+ return_attn_probs,
787
  )
788
 
789
 
 
802
  pack_gqa=None,
803
  deterministic=False,
804
  sm_margin=0,
805
+ return_attn_probs=False,
806
  ):
807
  """dropout_p should be set to 0.0 during evaluation
808
  Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads
 
864
  pack_gqa,
865
  deterministic,
866
  sm_margin,
867
+ return_attn_probs,
868
  )
869
 
870
 
 
889
  pack_gqa=None,
890
  deterministic=False,
891
  sm_margin=0,
892
+ return_attn_probs=False,
893
  ):
894
  return FlashAttnVarlenFunc.apply(
895
  q,
 
912
  pack_gqa,
913
  deterministic,
914
  sm_margin,
915
+ return_attn_probs,
916
  )
917
 
918
 
 
998
  q: (batch_size, seqlen, nheads, headdim)
999
  k_cache: (batch_size_cache, seqlen_cache, nheads_k, headdim) if there's no page_table,
1000
  or (num_blocks, page_block_size, nheads_k, headdim) if there's a page_table (i.e. paged KV cache)
1001
+ page_block_size can be arbitrary (e.g, 1, 2, 3, 64, etc.).
1002
  v_cache: (batch_size_cache, seqlen_cache, nheads_k, headdim_v) if there's no page_table,
1003
  or (num_blocks, page_block_size, nheads_k, headdim_v) if there's a page_table (i.e. paged KV cache)
1004
  k [optional]: (batch_size, seqlen_new, nheads_k, headdim). If not None, we concatenate
 
1043
  softmax_scale = (q.shape[-1] + (qv.shape[-1] if qv is not None else 0)) ** (-0.5)
1044
  if cache_seqlens is not None and isinstance(cache_seqlens, int):
1045
  cache_seqlens = torch.full(
1046
+ (q.shape[0],), cache_seqlens, dtype=torch.int32, device=k_cache.device
1047
  )
1048
  cache_seqlens = maybe_contiguous(cache_seqlens)
1049
  out, softmax_lse, *rest = _flash_attn_forward(
 
1070
  q_descale, k_descale, v_descale,
1071
  softmax_scale,
1072
  causal=causal,
1073
+ window_size_left=window_size[0],
1074
+ window_size_right=window_size[1],
1075
  attention_chunk=attention_chunk,
1076
  softcap=softcap,
1077
  rotary_interleaved=rotary_interleaved,
build/torch29-cxx11-cu128-x86_64-linux/metadata.json ADDED
@@ -0,0 +1 @@
+ {"python-depends":[]}
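The interface change above also alters what the public entry points return: `flash_attn_func` and `flash_attn_varlen_func` now return only the attention output by default, handing back the softmax log-sum-exp alongside it only when `return_attn_probs=True` is passed (the QKV-packed variant gains the same flag). A short usage sketch of the new behaviour (shapes are illustrative, and the import path assumes the module is loaded as `flash_attn_interface`):

```python
import torch
from flash_attn_interface import flash_attn_func  # import path is an assumption

q = torch.randn(2, 512, 16, 128, dtype=torch.bfloat16, device="cuda")
k = torch.randn(2, 512, 16, 128, dtype=torch.bfloat16, device="cuda")
v = torch.randn(2, 512, 16, 128, dtype=torch.bfloat16, device="cuda")

out = flash_attn_func(q, k, v, causal=True)             # new default: output only
out, lse = flash_attn_func(q, k, v, causal=True,
                           return_attn_probs=True)      # also return the softmax LSE
```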
build/torch29-cxx11-cu130-x86_64-linux/{_flash_attn3_1d39a44.abi3.so → _flash_attn3_7e34105_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:be71ff231157cea9508af9835b6ed814f5010ed159b073c74ef660ad5d3885f3
- size 795878440
+ oid sha256:8b71000e0cda6d5dd2f967eb990bdc6d41db679a868bb9f2f96fdfc6b0caa461
+ size 823719112
build/torch29-cxx11-cu130-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
  import torch
- from . import _flash_attn3_1d39a44
- ops = torch.ops._flash_attn3_1d39a44
+ from . import _flash_attn3_7e34105_dirty
+ ops = torch.ops._flash_attn3_7e34105_dirty

  def add_op_namespace_prefix(op_name: str):
      """
      Prefix op by namespace.
      """
-     return f"_flash_attn3_1d39a44::{op_name}"
+     return f"_flash_attn3_7e34105_dirty::{op_name}"
build/torch29-cxx11-cu130-x86_64-linux/flash_attn_interface.py CHANGED
@@ -1,50 +1,79 @@
1
  # Copyright (c) 2023, Tri Dao.
2
 
3
- from typing import Optional, Union
4
 
5
  import torch
6
  import torch.nn as nn
7
 
8
  from ._ops import ops as flash_attn_3_cuda
 
9
 
10
  def maybe_contiguous(x):
11
  return x.contiguous() if x is not None and x.stride(-1) != 1 else x
12
 
13
14
  def _flash_attn_forward(
15
- q,
16
- k,
17
- v,
18
- k_new,
19
- v_new,
20
- qv,
21
- out,
22
- cu_seqlens_q,
23
- cu_seqlens_k,
24
- cu_seqlens_k_new,
25
- seqused_q,
26
- seqused_k,
27
- max_seqlen_q,
28
- max_seqlen_k,
29
- page_table,
30
- kv_batch_idx,
31
- leftpad_k,
32
- rotary_cos,
33
- rotary_sin,
34
- seqlens_rotary,
35
- q_descale,
36
- k_descale,
37
- v_descale,
38
- softmax_scale,
39
- causal,
40
- window_size=(-1, -1),
41
- attention_chunk=0,
42
- softcap=0.0,
43
- rotary_interleaved=True,
44
- scheduler_metadata=None,
45
- num_splits=1,
46
- pack_gqa=None,
47
- sm_margin=0):
 
 
48
  q, k, k_new, v_new = [maybe_contiguous(x) for x in (q, k, k_new, v_new)]
49
  v = v.contiguous() if v.stride(-1) != 1 and v.stride(-3) != 1 else v
50
  cu_seqlens_q, cu_seqlens_k, cu_seqlens_k_new = [
@@ -56,14 +85,14 @@ def _flash_attn_forward(
56
  ]
57
  rotary_cos, rotary_sin = [maybe_contiguous(x) for x in (rotary_cos, rotary_sin)]
58
  seqlens_rotary = maybe_contiguous(seqlens_rotary)
59
- out, softmax_lse, *rest = flash_attn_3_cuda.fwd(
60
  q,
61
  k,
62
  v,
63
  k_new,
64
  v_new,
65
  qv,
66
- out,
67
  cu_seqlens_q,
68
  cu_seqlens_k,
69
  cu_seqlens_k_new,
@@ -82,8 +111,8 @@ def _flash_attn_forward(
82
  v_descale,
83
  softmax_scale,
84
  causal,
85
- window_size[0],
86
- window_size[1],
87
  attention_chunk,
88
  softcap,
89
  rotary_interleaved,
@@ -92,59 +121,314 @@ def _flash_attn_forward(
92
  pack_gqa,
93
  sm_margin,
94
  )
95
- return out, softmax_lse, *rest
96
 
97
 
98
  def _flash_attn_backward(
99
  dout,
100
  q,
101
  k,
102
  v,
103
  out,
104
  softmax_lse,
105
  cu_seqlens_q,
106
  cu_seqlens_k,
107
  sequed_q,
108
  sequed_k,
109
  max_seqlen_q,
110
  max_seqlen_k,
111
- dq,
112
- dk,
113
- dv,
114
  softmax_scale,
115
- causal,
116
- window_size=(-1, -1),
117
- softcap=0.0,
118
- deterministic=False,
119
- sm_margin=0,
120
- ):
121
- # dq, dk, dv are allocated by us so they should already be contiguous
122
- dout, q, k, v, out = [maybe_contiguous(x) for x in (dout, q, k, v, out)]
123
- dq, dk, dv, softmax_d, *rest = flash_attn_3_cuda.bwd(
124
  dout,
125
  q,
126
  k,
127
  v,
128
  out,
129
  softmax_lse,
130
  dq,
131
  dk,
132
  dv,
133
- cu_seqlens_q,
134
- cu_seqlens_k,
135
- sequed_q,
136
- sequed_k,
137
- max_seqlen_q,
138
- max_seqlen_k,
139
- softmax_scale,
140
- causal,
141
- window_size[0],
142
- window_size[1],
143
- softcap,
144
- deterministic,
145
- sm_margin,
146
  )
147
- return dq, dk, dv, softmax_d
148
 
149
 
150
  class FlashAttnQKVPackedFunc(torch.autograd.Function):
@@ -161,6 +445,7 @@ class FlashAttnQKVPackedFunc(torch.autograd.Function):
161
  deterministic=False,
162
  num_heads_q=None,
163
  sm_margin=0,
 
164
  ):
165
  if softmax_scale is None:
166
  softmax_scale = qkv.shape[-1] ** (-0.5)
@@ -188,7 +473,8 @@ class FlashAttnQKVPackedFunc(torch.autograd.Function):
188
  q_descale, k_descale, v_descale,
189
  softmax_scale,
190
  causal=causal,
191
- window_size=window_size,
 
192
  attention_chunk=attention_chunk,
193
  softcap=softcap,
194
  sm_margin=sm_margin,
@@ -203,8 +489,7 @@ class FlashAttnQKVPackedFunc(torch.autograd.Function):
203
  ctx.deterministic = deterministic
204
  ctx.ndim = qkv.dim()
205
  ctx.sm_margin = sm_margin
206
- # return out, softmax_lse
207
- return out
208
 
209
  @staticmethod
210
  def backward(ctx, dout, *args):
@@ -235,13 +520,14 @@ class FlashAttnQKVPackedFunc(torch.autograd.Function):
235
  dv,
236
  ctx.softmax_scale,
237
  ctx.causal,
238
- ctx.window_size,
 
239
  ctx.softcap,
240
  ctx.deterministic,
241
  ctx.sm_margin,
242
  )
243
  dqkv = dqkv[..., : dout.shape[-1]] # We could have padded the head dimension
244
- return dqkv, None, None, None, None, None, None, None, None, None, None, None
245
 
246
 
247
  class FlashAttnFunc(torch.autograd.Function):
@@ -263,6 +549,7 @@ class FlashAttnFunc(torch.autograd.Function):
263
  pack_gqa=None,
264
  deterministic=False,
265
  sm_margin=0,
 
266
  ):
267
  if softmax_scale is None:
268
  softmax_scale = (q.shape[-1] + (qv.shape[-1] if qv is not None else 0)) ** (-0.5)
@@ -282,7 +569,8 @@ class FlashAttnFunc(torch.autograd.Function):
282
  q_descale, k_descale, v_descale,
283
  softmax_scale,
284
  causal=causal,
285
- window_size=window_size,
 
286
  attention_chunk=attention_chunk,
287
  softcap=softcap,
288
  num_splits=num_splits,
@@ -298,7 +586,7 @@ class FlashAttnFunc(torch.autograd.Function):
298
  ctx.softcap = softcap
299
  ctx.deterministic = deterministic
300
  ctx.sm_margin = sm_margin
301
- return out, softmax_lse
302
 
303
  @staticmethod
304
  def backward(ctx, dout, *args):
@@ -320,7 +608,8 @@ class FlashAttnFunc(torch.autograd.Function):
320
  dv,
321
  ctx.softmax_scale,
322
  ctx.causal,
323
- ctx.window_size,
 
324
  ctx.softcap,
325
  ctx.deterministic,
326
  ctx.sm_margin,
@@ -356,6 +645,7 @@ class FlashAttnVarlenFunc(torch.autograd.Function):
356
  pack_gqa=None,
357
  deterministic=False,
358
  sm_margin=0,
 
359
  ):
360
  if softmax_scale is None:
361
  softmax_scale = (q.shape[-1] + (qv.shape[-1] if qv is not None else 0)) ** (-0.5)
@@ -379,7 +669,8 @@ class FlashAttnVarlenFunc(torch.autograd.Function):
379
  q_descale, k_descale, v_descale,
380
  softmax_scale,
381
  causal=causal,
382
- window_size=window_size,
 
383
  attention_chunk=attention_chunk,
384
  softcap=softcap,
385
  num_splits=num_splits,
@@ -397,7 +688,7 @@ class FlashAttnVarlenFunc(torch.autograd.Function):
397
  ctx.softcap = softcap
398
  ctx.deterministic = deterministic
399
  ctx.sm_margin = sm_margin
400
- return out, softmax_lse
401
 
402
  @staticmethod
403
  def backward(ctx, dout, *args):
@@ -422,7 +713,8 @@ class FlashAttnVarlenFunc(torch.autograd.Function):
422
  dv,
423
  ctx.softmax_scale,
424
  ctx.causal,
425
- ctx.window_size,
 
426
  ctx.softcap,
427
  ctx.deterministic,
428
  ctx.sm_margin,
@@ -444,6 +736,7 @@ def flash_attn_qkvpacked_func(
444
  deterministic=False,
445
  num_heads_q=None,
446
  sm_margin=0,
 
447
  ):
448
  """dropout_p should be set to 0.0 during evaluation
449
  If Q, K, V are already stacked into 1 tensor, this function will be faster than
@@ -490,6 +783,7 @@ def flash_attn_qkvpacked_func(
490
  deterministic,
491
  num_heads_q,
492
  sm_margin,
 
493
  )
494
 
495
 
@@ -508,6 +802,7 @@ def flash_attn_func(
508
  pack_gqa=None,
509
  deterministic=False,
510
  sm_margin=0,
 
511
  ):
512
  """dropout_p should be set to 0.0 during evaluation
513
  Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads
@@ -569,6 +864,7 @@ def flash_attn_func(
569
  pack_gqa,
570
  deterministic,
571
  sm_margin,
 
572
  )
573
 
574
 
@@ -593,6 +889,7 @@ def flash_attn_varlen_func(
593
  pack_gqa=None,
594
  deterministic=False,
595
  sm_margin=0,
 
596
  ):
597
  return FlashAttnVarlenFunc.apply(
598
  q,
@@ -615,6 +912,7 @@ def flash_attn_varlen_func(
615
  pack_gqa,
616
  deterministic,
617
  sm_margin,
 
618
  )
619
 
620
 
@@ -700,7 +998,7 @@ def flash_attn_with_kvcache(
700
  q: (batch_size, seqlen, nheads, headdim)
701
  k_cache: (batch_size_cache, seqlen_cache, nheads_k, headdim) if there's no page_table,
702
  or (num_blocks, page_block_size, nheads_k, headdim) if there's a page_table (i.e. paged KV cache)
703
- page_block_size must be a multiple of 256.
704
  v_cache: (batch_size_cache, seqlen_cache, nheads_k, headdim_v) if there's no page_table,
705
  or (num_blocks, page_block_size, nheads_k, headdim_v) if there's a page_table (i.e. paged KV cache)
706
  k [optional]: (batch_size, seqlen_new, nheads_k, headdim). If not None, we concatenate
@@ -745,7 +1043,7 @@ def flash_attn_with_kvcache(
745
  softmax_scale = (q.shape[-1] + (qv.shape[-1] if qv is not None else 0)) ** (-0.5)
746
  if cache_seqlens is not None and isinstance(cache_seqlens, int):
747
  cache_seqlens = torch.full(
748
- (k_cache.shape[0],), cache_seqlens, dtype=torch.int32, device=k_cache.device
749
  )
750
  cache_seqlens = maybe_contiguous(cache_seqlens)
751
  out, softmax_lse, *rest = _flash_attn_forward(
@@ -772,7 +1070,8 @@ def flash_attn_with_kvcache(
772
  q_descale, k_descale, v_descale,
773
  softmax_scale,
774
  causal=causal,
775
- window_size=window_size,
 
776
  attention_chunk=attention_chunk,
777
  softcap=softcap,
778
  rotary_interleaved=rotary_interleaved,
 
1
  # Copyright (c) 2023, Tri Dao.
2
 
3
+ from typing import Optional, Union, List, Tuple
4
 
5
  import torch
6
  import torch.nn as nn
7
 
8
  from ._ops import ops as flash_attn_3_cuda
9
+ from ._ops import add_op_namespace_prefix
10
 
11
  def maybe_contiguous(x):
12
  return x.contiguous() if x is not None and x.stride(-1) != 1 else x
13
 
14
 
15
+ def round_multiple(x, m):
16
+ return (x + m - 1) // m * m
17
+
18
+
19
+ def round_up_headdim(head_size: int) -> int:
20
+ from .flash_attn_config import CONFIG
21
+
22
+ if not CONFIG["build_flags"]["FLASHATTENTION_DISABLE_HDIM64"]:
23
+ if head_size <= 64:
24
+ return 64
25
+ if not CONFIG["build_flags"]["FLASHATTENTION_DISABLE_HDIM96"]:
26
+ if head_size <= 96:
27
+ return 96
28
+ if not CONFIG["build_flags"]["FLASHATTENTION_DISABLE_HDIM128"]:
29
+ if head_size <= 128:
30
+ return 128
31
+ if not CONFIG["build_flags"]["FLASHATTENTION_DISABLE_HDIM192"]:
32
+ if head_size <= 192:
33
+ return 192
34
+ if not CONFIG["build_flags"]["FLASHATTENTION_DISABLE_HDIM256"]:
35
+ if head_size <= 256:
36
+ return 256
37
+ return 256
38
+
39
+
40
+ @torch.library.custom_op(add_op_namespace_prefix("_flash_attn_forward"), mutates_args=(), device_types="cuda")
41
  def _flash_attn_forward(
42
+ q: torch.Tensor,
43
+ k: torch.Tensor,
44
+ v: torch.Tensor,
45
+ k_new: Optional[torch.Tensor] = None,
46
+ v_new: Optional[torch.Tensor] = None,
47
+ qv: Optional[torch.Tensor] = None,
48
+ out_: Optional[torch.Tensor] = None,
49
+ cu_seqlens_q: Optional[torch.Tensor] = None,
50
+ cu_seqlens_k: Optional[torch.Tensor] = None,
51
+ cu_seqlens_k_new: Optional[torch.Tensor] = None,
52
+ seqused_q: Optional[torch.Tensor] = None,
53
+ seqused_k: Optional[torch.Tensor] = None,
54
+ max_seqlen_q: Optional[int] = None,
55
+ max_seqlen_k: Optional[int] = None,
56
+ page_table: Optional[torch.Tensor] = None,
57
+ kv_batch_idx: Optional[torch.Tensor] = None,
58
+ leftpad_k: Optional[torch.Tensor] = None,
59
+ rotary_cos: Optional[torch.Tensor] = None,
60
+ rotary_sin: Optional[torch.Tensor] = None,
61
+ seqlens_rotary: Optional[torch.Tensor] = None,
62
+ q_descale: Optional[torch.Tensor] = None,
63
+ k_descale: Optional[torch.Tensor] = None,
64
+ v_descale: Optional[torch.Tensor] = None,
65
+ softmax_scale: Optional[float] = None,
66
+ causal: bool = False,
67
+ window_size_left: int = -1,
68
+ window_size_right: int = -1,
69
+ attention_chunk: int = 0,
70
+ softcap: float = 0.0,
71
+ rotary_interleaved: bool = True,
72
+ scheduler_metadata: Optional[torch.Tensor] = None,
73
+ num_splits: int = 1,
74
+ pack_gqa: Optional[bool] = None,
75
+ sm_margin: int = 0,
76
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
77
  q, k, k_new, v_new = [maybe_contiguous(x) for x in (q, k, k_new, v_new)]
78
  v = v.contiguous() if v.stride(-1) != 1 and v.stride(-3) != 1 else v
79
  cu_seqlens_q, cu_seqlens_k, cu_seqlens_k_new = [
 
85
  ]
86
  rotary_cos, rotary_sin = [maybe_contiguous(x) for x in (rotary_cos, rotary_sin)]
87
  seqlens_rotary = maybe_contiguous(seqlens_rotary)
88
+ out, softmax_lse, out_accum, softmax_lse_accum = flash_attn_3_cuda.fwd(
89
  q,
90
  k,
91
  v,
92
  k_new,
93
  v_new,
94
  qv,
95
+ out_,
96
  cu_seqlens_q,
97
  cu_seqlens_k,
98
  cu_seqlens_k_new,
 
111
  v_descale,
112
  softmax_scale,
113
  causal,
114
+ window_size_left,
115
+ window_size_right,
116
  attention_chunk,
117
  softcap,
118
  rotary_interleaved,
 
121
  pack_gqa,
122
  sm_margin,
123
  )
 
124
 
125
+ if out_accum is None:
126
+ out_accum = torch.tensor([], device=out.device)
127
+
128
+ if softmax_lse_accum is None:
129
+ softmax_lse_accum = torch.tensor([], device=out.device)
130
+
131
+ return out, softmax_lse, out_accum, softmax_lse_accum
132
+
133
+
134
+ @torch.library.register_fake(add_op_namespace_prefix("_flash_attn_forward"))
135
+ def _flash_attn_forward_fake(
136
+ q: torch.Tensor,
137
+ k: torch.Tensor,
138
+ v: torch.Tensor,
139
+ k_new: Optional[torch.Tensor] = None,
140
+ v_new: Optional[torch.Tensor] = None,
141
+ qv: Optional[torch.Tensor] = None,
142
+ out_: Optional[torch.Tensor] = None,
143
+ cu_seqlens_q: Optional[torch.Tensor] = None,
144
+ cu_seqlens_k: Optional[torch.Tensor] = None,
145
+ cu_seqlens_k_new: Optional[torch.Tensor] = None,
146
+ seqused_q: Optional[torch.Tensor] = None,
147
+ seqused_k: Optional[torch.Tensor] = None,
148
+ max_seqlen_q: Optional[int] = None,
149
+ max_seqlen_k: Optional[int] = None,
150
+ page_table: Optional[torch.Tensor] = None,
151
+ kv_batch_idx: Optional[torch.Tensor] = None,
152
+ leftpad_k: Optional[torch.Tensor] = None,
153
+ rotary_cos: Optional[torch.Tensor] = None,
154
+ rotary_sin: Optional[torch.Tensor] = None,
155
+ seqlens_rotary: Optional[torch.Tensor] = None,
156
+ q_descale: Optional[torch.Tensor] = None,
157
+ k_descale: Optional[torch.Tensor] = None,
158
+ v_descale: Optional[torch.Tensor] = None,
159
+ softmax_scale: Optional[float] = None,
160
+ causal: bool = False,
161
+ window_size_left: int = -1,
162
+ window_size_right: int = -1,
163
+ attention_chunk: int = 0,
164
+ softcap: float = 0.0,
165
+ rotary_interleaved: bool = True,
166
+ scheduler_metadata: Optional[torch.Tensor] = None,
167
+ num_splits: int = 1,
168
+ pack_gqa: Optional[bool] = None,
169
+ sm_margin: int = 0,
170
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
171
+ """
172
+ Symbolic fake implementation of flash attention forward.
173
+ Returns tensors with the correct shapes and dtypes without actual computation.
174
+ """
175
+
176
+ # Determine if we're in varlen mode
177
+ is_varlen_q = cu_seqlens_q is not None
178
 
179
+ # Get dimensions from query tensor
180
+ if is_varlen_q:
181
+ # varlen mode: q is (total_q, num_heads, head_size)
182
+ total_q, num_heads, head_size = q.shape
183
+ batch_size = cu_seqlens_q.shape[0] - 1
184
+
185
+ if max_seqlen_q is None:
186
+ raise ValueError("max_seqlen_q must be provided if cu_seqlens_q is provided")
187
+ seqlen_q = max_seqlen_q
188
+ else:
189
+ # batch mode: q is (batch_size, seqlen_q, num_heads, head_size)
190
+ batch_size, seqlen_q, num_heads, head_size = q.shape
191
+ total_q = batch_size * q.shape[1]
192
+ # Get value head dimension
193
+ head_size_v = v.shape[-1]
194
+
195
+ # Determine output dtype (FP8 inputs produce BF16 outputs)
196
+ q_type = q.dtype
197
+ if q_type == torch.float8_e4m3fn:
198
+ out_dtype = torch.bfloat16
199
+ else:
200
+ out_dtype = q_type
201
+
202
+ # Create output tensor
203
+ if out_ is not None:
204
+ # If out_ is provided, _flash_attn_forward becomes non-functional
205
+ raise TypeError("Tracing (torch.compile/torch.export) with pre-allocated output tensor is not supported.")
206
+
207
+ if is_varlen_q:
208
+ out = torch.empty((total_q, num_heads, head_size_v), dtype=out_dtype, device=q.device)
209
+ else:
210
+ out = torch.empty((batch_size, seqlen_q, num_heads, head_size_v), dtype=out_dtype, device=q.device)
211
+
212
+ # Create softmax_lse tensor
213
+ if is_varlen_q:
214
+ softmax_lse = torch.empty((num_heads, total_q), dtype=torch.float32, device=q.device)
215
+ else:
216
+ softmax_lse = torch.empty((batch_size, num_heads, seqlen_q), dtype=torch.float32, device=q.device)
217
+
218
+ # TODO(guilhermeleobas): Implement "get_num_splits"
219
+ # There's an heuristic to compute num_splits when "num_splits <= 0"
220
+ # assert that num_splits is > 0 for now
221
+ if num_splits <= 0:
222
+ raise ValueError(f"tracing (torch.compile/torch.export) with num_splits <= 0 not supported. Got {num_splits=}")
223
+
224
+ if num_splits > 1:
225
+ if is_varlen_q:
226
+ out_accum = torch.empty((num_splits, num_heads, total_q, head_size_v), dtype=torch.float32, device=q.device)
227
+ softmax_lse_accum = torch.empty((num_splits, num_heads, total_q), dtype=torch.float32, device=q.device)
228
+ else:
229
+ out_accum = torch.empty((num_splits, batch_size, num_heads, seqlen_q, head_size_v), dtype=torch.float32, device=q.device)
230
+ softmax_lse_accum = torch.empty((num_splits, batch_size, num_heads, seqlen_q), dtype=torch.float32, device=q.device)
231
+ else:
232
+ # Tensors are not set when num_splits < 1
233
+ out_accum = torch.tensor([], device=out.device)
234
+ softmax_lse_accum = torch.tensor([], device=out.device)
235
+
236
+ return out, softmax_lse, out_accum, softmax_lse_accum
237
+
238
+
239
+ @torch.library.custom_op(add_op_namespace_prefix("_flash_attn_backward"), mutates_args=("dq", "dk", "dv"), device_types="cuda")
240
  def _flash_attn_backward(
241
+ dout: torch.Tensor,
242
+ q: torch.Tensor,
243
+ k: torch.Tensor,
244
+ v: torch.Tensor,
245
+ out: torch.Tensor,
246
+ softmax_lse: torch.Tensor,
247
+ cu_seqlens_q: Optional[torch.Tensor] = None,
248
+ cu_seqlens_k: Optional[torch.Tensor] = None,
249
+ sequed_q: Optional[torch.Tensor] = None,
250
+ sequed_k: Optional[torch.Tensor] = None,
251
+ max_seqlen_q: Optional[int] = None,
252
+ max_seqlen_k: Optional[int] = None,
253
+ dq: Optional[torch.Tensor] = None,
254
+ dk: Optional[torch.Tensor] = None,
255
+ dv: Optional[torch.Tensor] = None,
256
+ softmax_scale: Optional[float] = None,
257
+ is_causal: bool = False,
258
+ window_size_left: int = -1,
259
+ window_size_right: int = -1,
260
+ softcap: float = 0.0,
261
+ deterministic: bool = False,
262
+ sm_margin: int = 0,
263
+ ) -> torch.Tensor:
264
+ # dq, dk, dv are allocated by us so they should already be contiguous
265
+ dout, q, k, v, out = [maybe_contiguous(x) for x in (dout, q, k, v, out)]
266
+ softmax_d, *rest = flash_attn_3_cuda.bwd(
267
  dout,
268
  q,
269
  k,
270
  v,
271
  out,
272
  softmax_lse,
273
+ dq,
274
+ dk,
275
+ dv,
276
  cu_seqlens_q,
277
  cu_seqlens_k,
278
  sequed_q,
279
  sequed_k,
280
  max_seqlen_q,
281
  max_seqlen_k,
282
  softmax_scale,
283
+ is_causal,
284
+ window_size_left,
285
+ window_size_right,
286
+ softcap,
287
+ deterministic,
288
+ sm_margin,
289
+ )
290
+ return softmax_d
291
+
292
+
293
+ @torch.library.register_fake(add_op_namespace_prefix("_flash_attn_backward"))
294
+ def _flash_attn_backward_fake(
295
+ dout: torch.Tensor,
296
+ q: torch.Tensor,
297
+ k: torch.Tensor,
298
+ v: torch.Tensor,
299
+ out: torch.Tensor,
300
+ softmax_lse: torch.Tensor,
301
+ cu_seqlens_q: Optional[torch.Tensor] = None,
302
+ cu_seqlens_k: Optional[torch.Tensor] = None,
303
+ sequed_q: Optional[torch.Tensor] = None,
304
+ sequed_k: Optional[torch.Tensor] = None,
305
+ max_seqlen_q: Optional[int] = None,
306
+ max_seqlen_k: Optional[int] = None,
307
+ dq: Optional[torch.Tensor] = None,
308
+ dk: Optional[torch.Tensor] = None,
309
+ dv: Optional[torch.Tensor] = None,
310
+ softmax_scale: Optional[float] = None,
311
+ is_causal: bool = False,
312
+ window_size_left: int = -1,
313
+ window_size_right: int = -1,
314
+ softcap: float = 0.0,
315
+ deterministic: bool = False,
316
+ sm_margin: int = 0,
317
+ ) -> torch.Tensor:
318
+
319
+ is_varlen_q = cu_seqlens_q is not None
320
+ is_varlen_k = cu_seqlens_q is not None
321
+ is_varlen = is_varlen_q or is_varlen_k or sequed_q is not None or sequed_k is not None
322
+
323
+ if not is_varlen_q:
324
+ batch_size = q.size(0)
325
+ seqlen_q = q.size(1)
326
+ seqlen_k = k.size(1)
327
+ total_q = batch_size * q.size(1)
328
+ else:
329
+ batch_size = cu_seqlens_q.size(0) - 1
330
+ total_q = q.size(0)
331
+ seqlen_q = max_seqlen_q
332
+ seqlen_k = max_seqlen_k
333
+
334
+ if window_size_left >= seqlen_k - 1:
335
+ window_size_left = -1
336
+
337
+ if window_size_right >= seqlen_q - 1:
338
+ window_size_right = -1
339
+
340
+ if is_causal:
341
+ window_size_right = 0
342
+
343
+ is_causal = window_size_left < 0 and window_size_right == 0
344
+
345
+ head_size = q.size(-1)
346
+ head_size_v = v.size(-1)
347
+ head_size_rounded = round_up_headdim(max(head_size, head_size_v))
348
+
349
+ # Hopper gpus uses cuda compute capabilities 9.0
350
+ cap = torch.cuda.get_device_capability(q.device)
351
+ arch = cap[0] * 10 + cap[1]
352
+
353
+ is_local = (window_size_left >= 0 or window_size_right >= 0) and not is_causal
354
+
355
+ if head_size_rounded <= 64:
356
+ kBlockM_sm90 = 96 if (is_causal and softcap > 0.0) else 128
357
+ elif head_size_rounded <= 96:
358
+ kBlockM_sm90 = 64
359
+ elif head_size_rounded <= 128:
360
+ kBlockM_sm90 = 64 if (is_causal or is_local or softcap > 0.0) else 80
361
+ else:
362
+ kBlockM_sm90 = 64
363
+
364
+ kBlockM_sm80 = 128 if head_size_rounded <= 64 else 64
365
+ kBlockM_sm86 = 64 if head_size_rounded <= 192 else 32
366
+
367
+ if arch >= 90:
368
+ kBlockM = kBlockM_sm90
369
+ elif arch == 86 or arch == 89:
370
+ kBlockM = kBlockM_sm86
371
+ else:
372
+ kBlockM = kBlockM_sm80
373
+
374
+ num_heads = q.shape[-2]
375
+ seqlen_q_rounded = round_multiple(seqlen_q, kBlockM)
376
+
377
+ total_q_padded_rounded = round_multiple(total_q + batch_size * kBlockM, kBlockM)
378
+
379
+ dq = torch.empty_like(q) if dq is None else dq
380
+ dk = torch.empty_like(k) if dk is None else dk
381
+ dv = torch.empty_like(v) if dv is None else dv
382
+
383
+ if not is_varlen:
384
+ softmax_d = torch.empty((batch_size, num_heads, seqlen_q_rounded), dtype=torch.float32, device=q.device)
385
+ else:
386
+ softmax_d = torch.empty((num_heads, total_q_padded_rounded), dtype=torch.float32, device=q.device)
387
+
388
+ return softmax_d
389
+
390
+
391
+ def setup_context(ctx, inputs, output):
392
+ q, k, v = inputs[:3]
393
+ out, softmax_lse, _, _ = output
394
+ ctx.save_for_backward(q, k, v, out, softmax_lse)
395
+ ctx.softmax_scale = inputs[-11]
396
+ ctx.causal = inputs[-10]
397
+ ctx.window_size = [inputs[-9], inputs[-8]]
398
+ ctx.attention_chunk = inputs[-7]
399
+ ctx.softcap = inputs[-6]
400
+ ctx.sm_margin = inputs[-1]
401
+
402
+
403
+ def _backward(ctx, dout, *grads):
404
+ q, k, v, out, softmax_lse = ctx.saved_tensors
405
+ dq, dk, dv = torch.empty_like(q), torch.empty_like(k), torch.empty_like(v)
406
+ _flash_attn_backward(
407
  dout,
408
  q,
409
  k,
410
  v,
411
  out,
412
  softmax_lse,
413
+ None, None, # cu_seqlens_q, cu_seqlens_k,
414
+ None, None, # sequed_q, sequed_k,
415
+ None, None, # max_seqlen_q, max_seqlen_k,
416
  dq,
417
  dk,
418
  dv,
419
+ ctx.softmax_scale,
420
+ ctx.causal,
421
+ ctx.window_size[0],
422
+ ctx.window_size[1],
423
+ ctx.softcap,
424
+ False, # deterministic
425
+ ctx.sm_margin,
426
  )
427
+ return dq, dk, dv, *((None,) * 21)
428
+
429
+
430
+ _flash_attn_forward.register_autograd(_backward, setup_context=setup_context)
431
+
432
 
433
 
434
  class FlashAttnQKVPackedFunc(torch.autograd.Function):
 
445
  deterministic=False,
446
  num_heads_q=None,
447
  sm_margin=0,
448
+ return_softmax=False,
449
  ):
450
  if softmax_scale is None:
451
  softmax_scale = qkv.shape[-1] ** (-0.5)
 
473
  q_descale, k_descale, v_descale,
474
  softmax_scale,
475
  causal=causal,
476
+ window_size_left=window_size[0],
477
+ window_size_right=window_size[1],
478
  attention_chunk=attention_chunk,
479
  softcap=softcap,
480
  sm_margin=sm_margin,
 
489
  ctx.deterministic = deterministic
490
  ctx.ndim = qkv.dim()
491
  ctx.sm_margin = sm_margin
492
+ return (out, softmax_lse) if return_softmax else out
 
493
 
494
  @staticmethod
495
  def backward(ctx, dout, *args):
 
520
  dv,
521
  ctx.softmax_scale,
522
  ctx.causal,
523
+ ctx.window_size[0],
524
+ ctx.window_size[1],
525
  ctx.softcap,
526
  ctx.deterministic,
527
  ctx.sm_margin,
528
  )
529
  dqkv = dqkv[..., : dout.shape[-1]] # We could have padded the head dimension
530
+ return dqkv, None, None, None, None, None, None, None, None, None, None, None, None
531
 
532
 
533
  class FlashAttnFunc(torch.autograd.Function):
 
549
  pack_gqa=None,
550
  deterministic=False,
551
  sm_margin=0,
552
+ return_softmax=False,
553
  ):
554
  if softmax_scale is None:
555
  softmax_scale = (q.shape[-1] + (qv.shape[-1] if qv is not None else 0)) ** (-0.5)
 
569
  q_descale, k_descale, v_descale,
570
  softmax_scale,
571
  causal=causal,
572
+ window_size_left=window_size[0],
573
+ window_size_right=window_size[1],
574
  attention_chunk=attention_chunk,
575
  softcap=softcap,
576
  num_splits=num_splits,
 
586
  ctx.softcap = softcap
587
  ctx.deterministic = deterministic
588
  ctx.sm_margin = sm_margin
589
+ return (out, softmax_lse) if return_softmax else out
590
 
591
  @staticmethod
592
  def backward(ctx, dout, *args):
 
608
  dv,
609
  ctx.softmax_scale,
610
  ctx.causal,
611
+ ctx.window_size[0],
612
+ ctx.window_size[1],
613
  ctx.softcap,
614
  ctx.deterministic,
615
  ctx.sm_margin,
 
645
  pack_gqa=None,
646
  deterministic=False,
647
  sm_margin=0,
648
+ return_softmax=False,
649
  ):
650
  if softmax_scale is None:
651
  softmax_scale = (q.shape[-1] + (qv.shape[-1] if qv is not None else 0)) ** (-0.5)
 
669
  q_descale, k_descale, v_descale,
670
  softmax_scale,
671
  causal=causal,
672
+ window_size_left=window_size[0],
673
+ window_size_right=window_size[1],
674
  attention_chunk=attention_chunk,
675
  softcap=softcap,
676
  num_splits=num_splits,
 
688
  ctx.softcap = softcap
689
  ctx.deterministic = deterministic
690
  ctx.sm_margin = sm_margin
691
+ return (out, softmax_lse) if return_softmax else out
692
 
693
  @staticmethod
694
  def backward(ctx, dout, *args):
 
713
  dv,
714
  ctx.softmax_scale,
715
  ctx.causal,
716
+ ctx.window_size[0],
717
+ ctx.window_size[1],
718
  ctx.softcap,
719
  ctx.deterministic,
720
  ctx.sm_margin,
 
736
  deterministic=False,
737
  num_heads_q=None,
738
  sm_margin=0,
739
+ return_attn_probs=False,
740
  ):
741
  """dropout_p should be set to 0.0 during evaluation
742
  If Q, K, V are already stacked into 1 tensor, this function will be faster than
 
783
  deterministic,
784
  num_heads_q,
785
  sm_margin,
786
+ return_attn_probs,
787
  )
788
 
789
 
 
802
  pack_gqa=None,
803
  deterministic=False,
804
  sm_margin=0,
805
+ return_attn_probs=False,
806
  ):
807
  """dropout_p should be set to 0.0 during evaluation
808
  Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads
 
864
  pack_gqa,
865
  deterministic,
866
  sm_margin,
867
+ return_attn_probs,
868
  )
869
 
870
 
 
889
  pack_gqa=None,
890
  deterministic=False,
891
  sm_margin=0,
892
+ return_attn_probs=False,
893
  ):
894
  return FlashAttnVarlenFunc.apply(
895
  q,
 
912
  pack_gqa,
913
  deterministic,
914
  sm_margin,
915
+ return_attn_probs,
916
  )
917
 
918
 
 
998
  q: (batch_size, seqlen, nheads, headdim)
999
  k_cache: (batch_size_cache, seqlen_cache, nheads_k, headdim) if there's no page_table,
1000
  or (num_blocks, page_block_size, nheads_k, headdim) if there's a page_table (i.e. paged KV cache)
1001
+ page_block_size can be arbitrary (e.g, 1, 2, 3, 64, etc.).
1002
  v_cache: (batch_size_cache, seqlen_cache, nheads_k, headdim_v) if there's no page_table,
1003
  or (num_blocks, page_block_size, nheads_k, headdim_v) if there's a page_table (i.e. paged KV cache)
1004
  k [optional]: (batch_size, seqlen_new, nheads_k, headdim). If not None, we concatenate
 
1043
  softmax_scale = (q.shape[-1] + (qv.shape[-1] if qv is not None else 0)) ** (-0.5)
1044
  if cache_seqlens is not None and isinstance(cache_seqlens, int):
1045
  cache_seqlens = torch.full(
1046
+ (q.shape[0],), cache_seqlens, dtype=torch.int32, device=k_cache.device
1047
  )
1048
  cache_seqlens = maybe_contiguous(cache_seqlens)
1049
  out, softmax_lse, *rest = _flash_attn_forward(
 
1070
  q_descale, k_descale, v_descale,
1071
  softmax_scale,
1072
  causal=causal,
1073
+ window_size_left=window_size[0],
1074
+ window_size_right=window_size[1],
1075
  attention_chunk=attention_chunk,
1076
  softcap=softcap,
1077
  rotary_interleaved=rotary_interleaved,
build/torch29-cxx11-cu130-x86_64-linux/metadata.json ADDED
@@ -0,0 +1 @@
+ {"python-depends":[]}
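Taken together, the changes register `_flash_attn_forward` and `_flash_attn_backward` through `torch.library.custom_op` and pair them with `register_fake` implementations, so output shapes and dtypes can be inferred without launching the CUDA kernels; the intended payoff is traceability under `torch.compile` / `torch.export`. A minimal sketch of exercising that path, assuming the interface above is importable as `flash_attn_interface` (the import path, and compiling without graph breaks, are assumptions rather than something this commit demonstrates):

```python
import torch
from flash_attn_interface import flash_attn_func  # import path is an assumption

def attend(q, k, v):
    # Calls into the custom op registered above; during tracing, the
    # _flash_attn_forward_fake registration supplies output shapes/dtypes.
    return flash_attn_func(q, k, v, causal=True)

compiled_attend = torch.compile(attend)

q = torch.randn(2, 256, 8, 128, dtype=torch.bfloat16, device="cuda", requires_grad=True)
k = torch.randn(2, 256, 8, 128, dtype=torch.bfloat16, device="cuda", requires_grad=True)
v = torch.randn(2, 256, 8, 128, dtype=torch.bfloat16, device="cuda", requires_grad=True)

out = compiled_attend(q, k, v)
out.sum().backward()  # backward routes through the registered _flash_attn_backward op
```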