Kernels
danieldk (HF Staff) committed
Commit 2a51f9d · 1 Parent(s): e786cb7

Sources moved to GitHub.

README.md CHANGED
@@ -6,5 +6,6 @@ tags:
 
 ## causal-conv1d
 
- Causal depthwise conv1d kernel by Tri Dao. Source: https://github.com/Dao-AILab/causal-conv1d/
+ Causal [depthwise conv1d kernel](https://github.com/Dao-AILab/causal-conv1d/) by Tri Dao.
 
+ Kernel source: https://github.com/huggingface/kernels-community/tree/main/causal-conv1d
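For orientation now that the sources live on GitHub and only the built artifacts stay on the Hub, here is a minimal sketch of loading and calling the kernel with the `kernels` package. The repository id, the `pip install kernels` step, and the assumption that the loaded module exposes a `causal_conv1d_fwd` whose argument order mirrors the C++ binding deleted below are assumptions, not statements made by this commit.

```python
# A minimal usage sketch, not part of this commit. It assumes the built kernel
# is published on the Hub as "kernels-community/causal-conv1d", that the
# `kernels` package is installed, and that the loaded module exposes
# causal_conv1d_fwd with the same argument order as the C++ binding deleted
# below; the real Python-facing wrapper may differ.
import torch
from kernels import get_kernel

causal_conv1d = get_kernel("kernels-community/causal-conv1d")  # assumed repo id

batch, dim, seqlen, width = 2, 64, 128, 4
x = torch.randn(batch, dim, seqlen, device="cuda", dtype=torch.float16)
weight = torch.randn(dim, width, device="cuda", dtype=torch.float16)
bias = torch.randn(dim, device="cuda", dtype=torch.float16)
out = torch.empty_like(x)

# (x, weight, bias, seq_idx, initial_states, out, final_states_out, silu_activation)
causal_conv1d.causal_conv1d_fwd(x, weight, bias, None, None, out, None, True)
```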
build.toml DELETED
@@ -1,49 +0,0 @@
- [general]
- name = "causal_conv1d"
- universal = false
- 
- [torch]
- src = [
-   "torch-ext/pytorch_shim.h",
-   "torch-ext/torch_binding.cpp",
-   "torch-ext/torch_binding.h"
- ]
- 
- [kernel.causal_conv1d]
- backend = "cuda"
- src = [
-   "causal-conv1d/causal_conv1d_bwd.cu",
-   "causal-conv1d/causal_conv1d_common.h",
-   "causal-conv1d/causal_conv1d.cpp",
-   "causal-conv1d/causal_conv1d_fwd.cu",
-   "causal-conv1d/causal_conv1d.h",
-   "causal-conv1d/causal_conv1d_update.cu",
-   "causal-conv1d/static_switch.h",
- ]
- include = [ "causal-conv1d" ]
- depends = [ "torch" ]
- 
- #[kernel.causal_conv1d_rocm]
- #backend = "rocm"
- #rocm-archs = [
- #  "gfx906",
- #  "gfx908",
- #  "gfx90a",
- #  "gfx940",
- #  "gfx941",
- #  "gfx942",
- #  "gfx1030",
- #  "gfx1100",
- #  "gfx1101",
- #]
- #src = [
- #  "causal-conv1d/causal_conv1d_bwd.cu",
- #  "causal-conv1d/causal_conv1d_common.h",
- #  "causal-conv1d/causal_conv1d.cpp",
- #  "causal-conv1d/causal_conv1d_fwd.cu",
- #  "causal-conv1d/causal_conv1d.h",
- #  "causal-conv1d/causal_conv1d_update.cu",
- #  "causal-conv1d/static_switch.h",
- #]
- #include = [ "causal-conv1d" ]
- #depends = [ "torch" ]
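Before the deleted CUDA/C++ sources below, a pure-PyTorch reference of the operation they implement may help orient readers. This is a sketch written from the shapes and checks visible in this diff (x of shape (batch, dim, seqlen), weight of shape (dim, width), optional bias, optional SiLU, width between 2 and 4), not the project's own reference code.

```python
# Pure-PyTorch reference for the operation the deleted sources implement.
# A sketch for orientation, not the project's own reference code.
import torch
import torch.nn.functional as F

def causal_conv1d_ref(x, weight, bias=None, silu_activation=False):
    batch, dim, seqlen = x.shape
    width = weight.shape[1]
    # Depthwise (groups=dim) conv with left context only: pad by width-1 and
    # drop the trailing positions so out[..., l] sees x[..., l-width+1 : l+1].
    out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)
    out = out[..., :seqlen]
    return F.silu(out) if silu_activation else out

x = torch.randn(2, 16, 32)
w = torch.randn(16, 4)
b = torch.randn(16)
y = causal_conv1d_ref(x, w, b, silu_activation=True)
assert y.shape == x.shape
```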
causal-conv1d/causal_conv1d.cpp DELETED
@@ -1,486 +0,0 @@
1
- /******************************************************************************
2
- * Copyright (c) 2024, Tri Dao.
3
- ******************************************************************************/
4
-
5
- #include <torch/all.h>
6
- #if TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 6)
7
- #include <c10/core/DeviceGuard.h>
8
- #else
9
- #include <c10/cuda/CUDAGuard.h>
10
- #endif
11
-
12
- #include <c10/cuda/CUDAStream.h>
13
- #include <vector>
14
-
15
- #include "causal_conv1d.h"
16
-
17
- #define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")
18
-
19
- #define DISPATCH_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, NAME, ...) \
20
- if (ITYPE == at::ScalarType::Half) { \
21
- using input_t = at::Half; \
22
- __VA_ARGS__(); \
23
- } else if (ITYPE == at::ScalarType::BFloat16) { \
24
- using input_t = at::BFloat16; \
25
- __VA_ARGS__(); \
26
- } else if (ITYPE == at::ScalarType::Float) { \
27
- using input_t = float; \
28
- __VA_ARGS__(); \
29
- } else { \
30
- AT_ERROR(#NAME, " not implemented for input type '", toString(ITYPE), "'"); \
31
- }
32
-
33
- #define DISPATCH_WTYPE_FLOAT_AND_HALF_AND_BF16(WTYPE, NAME, ...) \
34
- if (WTYPE == at::ScalarType::Half) { \
35
- using weight_t = at::Half; \
36
- __VA_ARGS__(); \
37
- } else if (WTYPE == at::ScalarType::BFloat16) { \
38
- using weight_t = at::BFloat16; \
39
- __VA_ARGS__(); \
40
- } else if (WTYPE == at::ScalarType::Float) { \
41
- using weight_t = float; \
42
- __VA_ARGS__(); \
43
- } else { \
44
- AT_ERROR(#NAME, " not implemented for weight type '", toString(WTYPE), "'"); \
45
- }
46
-
47
- template<typename input_t, typename weight_t>
48
- void causal_conv1d_fwd_cuda(ConvParamsBase &params, cudaStream_t stream);
49
- template <typename input_t, typename weight_t>
50
- void causal_conv1d_channellast_fwd_cuda(ConvParamsBase &params, cudaStream_t stream);
51
-
52
- template<typename input_t, typename weight_t>
53
- void causal_conv1d_bwd_cuda(ConvParamsBwd &params, cudaStream_t stream);
54
- template<typename input_t, typename weight_t>
55
- void causal_conv1d_channellast_bwd_cuda(ConvParamsBwd &params, cudaStream_t stream);
56
-
57
- template<typename input_t, typename weight_t>
58
- void causal_conv1d_update_cuda(ConvParamsBase &params, cudaStream_t stream);
59
-
60
- void set_conv_params_fwd(ConvParamsBase &params,
61
- // sizes
62
- const size_t batch,
63
- const size_t dim,
64
- const size_t seqlen,
65
- const size_t width,
66
- // device pointers
67
- const at::Tensor x,
68
- const at::Tensor weight,
69
- const at::Tensor out,
70
- void* bias_ptr,
71
- bool silu_activation) {
72
-
73
- // Reset the parameters
74
- memset(&params, 0, sizeof(params));
75
-
76
- params.batch = batch;
77
- params.dim = dim;
78
- params.seqlen = seqlen;
79
- params.width = width;
80
-
81
- params.silu_activation = silu_activation;
82
-
83
- // Set the pointers and strides.
84
- params.x_ptr = x.data_ptr();
85
- params.weight_ptr = weight.data_ptr();
86
- params.bias_ptr = bias_ptr;
87
- params.out_ptr = out.data_ptr();
88
- // All stride are in elements, not bytes.
89
- params.x_batch_stride = x.stride(0);
90
- params.x_c_stride = x.stride(1);
91
- params.x_l_stride = x.stride(-1);
92
- params.weight_c_stride = weight.stride(0);
93
- params.weight_width_stride = weight.stride(1);
94
- params.out_batch_stride = out.stride(0);
95
- params.out_c_stride = out.stride(1);
96
- params.out_l_stride = out.stride(-1);
97
- }
98
-
99
-
100
- void set_conv_params_bwd(ConvParamsBwd &params,
101
- // sizes
102
- const size_t batch,
103
- const size_t dim,
104
- const size_t seqlen,
105
- const size_t width,
106
- // device pointers
107
- const at::Tensor x,
108
- const at::Tensor weight,
109
- void* bias_ptr,
110
- const at::Tensor dout,
111
- const at::Tensor dx,
112
- const at::Tensor dweight,
113
- void* dbias_ptr,
114
- bool silu_activation) {
115
- // Pass in "dout" instead of "out", we're not gonna use "out" at all.
116
- set_conv_params_fwd(params, batch, dim, seqlen, width,
117
- x, weight, dout, bias_ptr, silu_activation);
118
-
119
- // Set the pointers and strides.
120
- params.dout_ptr = dout.data_ptr();
121
- params.dx_ptr = dx.data_ptr();
122
- params.dweight_ptr = dweight.data_ptr();
123
- params.dbias_ptr = dbias_ptr;
124
- // All stride are in elements, not bytes.
125
- params.dout_batch_stride = dout.stride(0);
126
- params.dout_c_stride = dout.stride(1);
127
- params.dout_l_stride = dout.stride(2);
128
- params.dweight_c_stride = dweight.stride(0);
129
- params.dweight_width_stride = dweight.stride(1);
130
- params.dx_batch_stride = dx.stride(0);
131
- params.dx_c_stride = dx.stride(1);
132
- params.dx_l_stride = dx.stride(2);
133
- }
134
-
135
- void
136
- causal_conv1d_fwd(const at::Tensor &x,
137
- const at::Tensor &weight,
138
- const c10::optional<at::Tensor> &bias_,
139
- const c10::optional<at::Tensor> &seq_idx_,
140
- const c10::optional<at::Tensor> &initial_states_,
141
- at::Tensor &out,
142
- c10::optional<at::Tensor> &final_states_out_,
143
- bool silu_activation) {
144
- auto input_type = x.scalar_type();
145
- auto weight_type = weight.scalar_type();
146
- TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16);
147
- TORCH_CHECK(weight_type == at::ScalarType::Float || weight_type == at::ScalarType::Half || weight_type == at::ScalarType::BFloat16);
148
-
149
- TORCH_CHECK(x.is_cuda());
150
- TORCH_CHECK(weight.is_cuda());
151
-
152
- const auto sizes = x.sizes();
153
- const int batch_size = sizes[0];
154
- const int dim = sizes[1];
155
- const int seqlen = sizes[2];
156
- const int width = weight.size(-1);
157
-
158
- CHECK_SHAPE(x, batch_size, dim, seqlen);
159
- CHECK_SHAPE(weight, dim, width);
160
-
161
- TORCH_CHECK(x.stride(2) == 1 || x.stride(1) == 1);
162
- const bool is_channel_last = x.stride(1) == 1 && x.stride(2) > 1;
163
-
164
- if (is_channel_last) {
165
- TORCH_CHECK(dim % 8 == 0, "causal_conv1d only supports channel dimension divisible by 8 for now");
166
- TORCH_CHECK(x.stride(2) % 8 == 0 and x.stride(0) % 8 == 0, "causal_conv1d with channel last layout requires strides (x.stride(0) and x.stride(2)) to be multiples of 8");
167
- }
168
- TORCH_CHECK(width >= 2 && width <= 4, "causal_conv1d only supports width between 2 and 4");
169
-
170
- if (bias_.has_value()) {
171
- auto bias = bias_.value();
172
- TORCH_CHECK(bias.scalar_type() == weight_type);
173
- TORCH_CHECK(bias.is_cuda());
174
- TORCH_CHECK(bias.stride(-1) == 1);
175
- CHECK_SHAPE(bias, dim);
176
- }
177
-
178
- if (seq_idx_.has_value()) {
179
- TORCH_CHECK(is_channel_last, "seq_idx is only supported for channel last layout");
180
- auto seq_idx = seq_idx_.value();
181
- TORCH_CHECK(seq_idx.scalar_type() == torch::kInt32);
182
- TORCH_CHECK(seq_idx.is_cuda());
183
- TORCH_CHECK(seq_idx.is_contiguous());
184
- CHECK_SHAPE(seq_idx, batch_size, seqlen);
185
- }
186
-
187
- ConvParamsBase params;
188
- set_conv_params_fwd(params, batch_size, dim, seqlen, width, x, weight, out,
189
- bias_.has_value() ? bias_.value().data_ptr() : nullptr,
190
- silu_activation);
191
-
192
- if (seq_idx_.has_value()) {
193
- params.seq_idx_ptr = seq_idx_.value().data_ptr();
194
- } else {
195
- params.seq_idx_ptr = nullptr;
196
- }
197
-
198
- if (initial_states_.has_value()) {
199
- TORCH_CHECK(is_channel_last, "initial_states is only supported for channel last layout");
200
- auto initial_states = initial_states_.value();
201
- TORCH_CHECK(initial_states.scalar_type() == input_type);
202
- TORCH_CHECK(initial_states.is_cuda());
203
- CHECK_SHAPE(initial_states, batch_size, dim, width - 1);
204
- TORCH_CHECK(initial_states.stride(1) == 1);
205
- params.initial_states_ptr = initial_states.data_ptr();
206
- params.initial_states_batch_stride = initial_states.stride(0);
207
- params.initial_states_c_stride = initial_states.stride(1);
208
- params.initial_states_l_stride = initial_states.stride(2);
209
- } else {
210
- params.initial_states_ptr = nullptr;
211
- }
212
-
213
- if (final_states_out_.has_value()) {
214
- TORCH_CHECK(is_channel_last, "final_states is only supported for channel last layout");
215
- auto final_states = final_states_out_.value();
216
- TORCH_CHECK(final_states.scalar_type() == input_type);
217
- TORCH_CHECK(final_states.is_cuda());
218
- CHECK_SHAPE(final_states, batch_size, dim, width - 1);
219
- TORCH_CHECK(final_states.stride(1) == 1);
220
- params.final_states_ptr = final_states.data_ptr();
221
- params.final_states_batch_stride = final_states.stride(0);
222
- params.final_states_c_stride = final_states.stride(1);
223
- params.final_states_l_stride = final_states.stride(2);
224
- } else {
225
- params.final_states_ptr = nullptr;
226
- }
227
-
228
- // Otherwise the kernel will be launched from cuda:0 device
229
- #if TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 6)
230
- c10::DeviceGuard device_guard(x.device());
231
- #else
232
- at::cuda::CUDAGuard device_guard{x.device()};
233
- #endif
234
- auto stream = at::cuda::getCurrentCUDAStream().stream();
235
- DISPATCH_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_fwd", [&] {
236
- DISPATCH_WTYPE_FLOAT_AND_HALF_AND_BF16(weight.scalar_type(), "causal_conv1d_fwd", [&] {
237
- if (!is_channel_last) {
238
- causal_conv1d_fwd_cuda<input_t, weight_t>(params, stream);
239
- } else {
240
- causal_conv1d_channellast_fwd_cuda<input_t, weight_t>(params, stream);
241
- }
242
- });
243
- });
244
- }
245
-
246
- void
247
- causal_conv1d_bwd(const at::Tensor &x,
248
- const at::Tensor &weight,
249
- const c10::optional<at::Tensor> &bias_,
250
- at::Tensor &dout,
251
- const c10::optional<at::Tensor> &seq_idx_,
252
- const c10::optional<at::Tensor> &initial_states_,
253
- const c10::optional<at::Tensor> &dfinal_states_,
254
- at::Tensor &dx,
255
- at::Tensor &dweight,
256
- c10::optional<at::Tensor> &dbias_,
257
- c10::optional<at::Tensor> &dinitial_states_,
258
- bool silu_activation) {
259
- auto input_type = x.scalar_type();
260
- auto weight_type = weight.scalar_type();
261
- TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16);
262
- TORCH_CHECK(weight_type == at::ScalarType::Float || weight_type == at::ScalarType::Half || weight_type == at::ScalarType::BFloat16);
263
-
264
- TORCH_CHECK(x.is_cuda());
265
- TORCH_CHECK(weight.is_cuda());
266
- TORCH_CHECK(dout.is_cuda());
267
- TORCH_CHECK(bias_.has_value() == dbias_.has_value());
268
-
269
- const auto sizes = x.sizes();
270
- const int batch_size = sizes[0];
271
- const int dim = sizes[1];
272
- const int seqlen = sizes[2];
273
- const int width = weight.size(-1);
274
-
275
- TORCH_CHECK(width >= 2 && width <= 4, "causal_conv1d only supports width between 2 and 4");
276
-
277
- CHECK_SHAPE(x, batch_size, dim, seqlen);
278
- CHECK_SHAPE(weight, dim, width);
279
- CHECK_SHAPE(dout, batch_size, dim, seqlen);
280
-
281
- TORCH_CHECK(x.stride(2) == 1 || x.stride(1) == 1);
282
- const bool is_channel_last = x.stride(1) == 1 && x.stride(2) > 1;
283
- if (!is_channel_last && dout.stride(2) != 1) { dout = dout.contiguous(); }
284
- if (is_channel_last && dout.stride(1) != 1) { dout = dout.transpose(-1, -2).contiguous().transpose(-1, -2); }
285
-
286
- if (is_channel_last) {
287
- TORCH_CHECK(dim % 8 == 0, "causal_conv1d only supports channel dimension divisible by 8 for now");
288
- TORCH_CHECK(x.stride(2) % 8 == 0 and x.stride(0) % 8 == 0, "causal_conv1d with channel last layout requires strides (x.stride(0) and x.stride(2)) to be multiples of 8");
289
- TORCH_CHECK(dout.stride(2) % 8 == 0 and dout.stride(0) % 8 == 0, "causal_conv1d with channel last layout requires strides (dout.stride(0) and dout.stride(2)) to be multiples of 8");
290
- }
291
-
292
- if (bias_.has_value()) {
293
- auto bias = bias_.value();
294
- TORCH_CHECK(bias.scalar_type() == weight_type);
295
- TORCH_CHECK(bias.is_cuda());
296
- TORCH_CHECK(bias.stride(-1) == 1);
297
- CHECK_SHAPE(bias, dim);
298
- }
299
-
300
- if (seq_idx_.has_value()) {
301
- TORCH_CHECK(is_channel_last, "seq_idx only supported for channel last layout");
302
- auto seq_idx = seq_idx_.value();
303
- TORCH_CHECK(seq_idx.scalar_type() == torch::kInt32);
304
- TORCH_CHECK(seq_idx.is_cuda());
305
- TORCH_CHECK(seq_idx.is_contiguous());
306
- CHECK_SHAPE(seq_idx, batch_size, seqlen);
307
- }
308
-
309
- TORCH_CHECK(dx.scalar_type() == input_type);
310
- TORCH_CHECK(dx.is_cuda());
311
- CHECK_SHAPE(dx, batch_size, dim, seqlen);
312
- if (!is_channel_last) { TORCH_CHECK(dx.stride(2) == 1); }
313
- if (is_channel_last) { TORCH_CHECK(dx.stride(1) == 1); }
314
-
315
- // Otherwise the kernel will be launched from cuda:0 device
316
- #if TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 6)
317
- c10::Device device = x.device();
318
- c10::DeviceGuard device_guard(device);
319
- #else
320
- at::cuda::CUDAGuard device_guard{x.device()};
321
- #endif
322
- ConvParamsBwd params;
323
- set_conv_params_bwd(params, batch_size, dim, seqlen, width,
324
- x, weight, bias_.has_value() ? bias_.value().data_ptr() : nullptr,
325
- dout, dx, dweight, bias_.has_value() ? dbias_.value().data_ptr() : nullptr,
326
- silu_activation);
327
-
328
- if (seq_idx_.has_value()) {
329
- params.seq_idx_ptr = seq_idx_.value().data_ptr();
330
- } else {
331
- params.seq_idx_ptr = nullptr;
332
- }
333
-
334
- if (initial_states_.has_value()) {
335
- TORCH_CHECK(is_channel_last, "initial_states is only supported for channel last layout");
336
- auto initial_states = initial_states_.value();
337
- TORCH_CHECK(initial_states.scalar_type() == input_type);
338
- TORCH_CHECK(initial_states.is_cuda());
339
- CHECK_SHAPE(initial_states, batch_size, dim, width - 1);
340
- TORCH_CHECK(initial_states.stride(1) == 1);
341
- params.initial_states_ptr = initial_states.data_ptr();
342
- params.initial_states_batch_stride = initial_states.stride(0);
343
- params.initial_states_c_stride = initial_states.stride(1);
344
- params.initial_states_l_stride = initial_states.stride(2);
345
- } else {
346
- params.initial_states_ptr = nullptr;
347
- }
348
-
349
- if (dfinal_states_.has_value()) {
350
- TORCH_CHECK(is_channel_last, "dfinal_states is only supported for channel last layout");
351
- auto dfinal_states = dfinal_states_.value();
352
- TORCH_CHECK(dfinal_states.scalar_type() == input_type);
353
- TORCH_CHECK(dfinal_states.is_cuda());
354
- CHECK_SHAPE(dfinal_states, batch_size, dim, width - 1);
355
- params.dfinal_states_ptr = dfinal_states.data_ptr();
356
- params.dfinal_states_batch_stride = dfinal_states.stride(0);
357
- params.dfinal_states_c_stride = dfinal_states.stride(1);
358
- params.dfinal_states_l_stride = dfinal_states.stride(2);
359
- } else {
360
- params.dfinal_states_ptr = nullptr;
361
- }
362
-
363
- if (dinitial_states_.has_value()) {
364
- at::Tensor dinitial_states = dinitial_states_.value();
365
- TORCH_CHECK(dinitial_states.stride(1) == 1);
366
- params.dinitial_states_ptr = dinitial_states.data_ptr();
367
- params.dinitial_states_batch_stride = dinitial_states.stride(0);
368
- params.dinitial_states_c_stride = dinitial_states.stride(1);
369
- params.dinitial_states_l_stride = dinitial_states.stride(2);
370
- } else {
371
- params.dinitial_states_ptr = nullptr;
372
- }
373
-
374
- auto stream = at::cuda::getCurrentCUDAStream().stream();
375
- DISPATCH_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_bwd", [&] {
376
- DISPATCH_WTYPE_FLOAT_AND_HALF_AND_BF16(weight.scalar_type(), "causal_conv1d_bwd", [&] {
377
- if (!is_channel_last) {
378
- causal_conv1d_bwd_cuda<input_t, weight_t>(params, stream);
379
- } else {
380
- causal_conv1d_channellast_bwd_cuda<input_t, weight_t>(params, stream);
381
- }
382
- });
383
- });
384
- }
385
-
386
- void
387
- causal_conv1d_update(const at::Tensor &x,
388
- const at::Tensor &conv_state,
389
- const at::Tensor &weight,
390
- const c10::optional<at::Tensor> &bias_,
391
- at::Tensor &out,
392
- bool silu_activation,
393
- const c10::optional<at::Tensor> &cache_seqlens_,
394
- const c10::optional<at::Tensor> &conv_state_indices_
395
- ) {
396
- auto input_type = x.scalar_type();
397
- auto weight_type = weight.scalar_type();
398
- TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16);
399
- TORCH_CHECK(weight_type == at::ScalarType::Float || weight_type == at::ScalarType::Half || weight_type == at::ScalarType::BFloat16);
400
- TORCH_CHECK(conv_state.scalar_type() == input_type);
401
-
402
- TORCH_CHECK(x.is_cuda());
403
- TORCH_CHECK(conv_state.is_cuda());
404
- TORCH_CHECK(weight.is_cuda());
405
-
406
- const auto sizes = x.sizes();
407
- const int batch_size = sizes[0];
408
- const int dim = sizes[1];
409
- const int seqlen = sizes[2];
410
- const int width = weight.size(-1);
411
- const int conv_state_len = conv_state.size(2);
412
- TORCH_CHECK(conv_state_len >= width - 1);
413
-
414
- CHECK_SHAPE(x, batch_size, dim, seqlen);
415
- CHECK_SHAPE(weight, dim, width);
416
-
417
- TORCH_CHECK(width >= 2 && width <= 4, "causal_conv1d only supports width between 2 and 4");
418
-
419
- if (bias_.has_value()) {
420
- auto bias = bias_.value();
421
- TORCH_CHECK(bias.scalar_type() == weight_type);
422
- TORCH_CHECK(bias.is_cuda());
423
- TORCH_CHECK(bias.stride(-1) == 1);
424
- CHECK_SHAPE(bias, dim);
425
- }
426
-
427
- ConvParamsBase params;
428
- set_conv_params_fwd(params, batch_size, dim, seqlen, width, x, weight, out,
429
- bias_.has_value() ? bias_.value().data_ptr() : nullptr,
430
- silu_activation);
431
- params.conv_state_ptr = conv_state.data_ptr();
432
- params.conv_state_len = conv_state_len;
433
- // All stride are in elements, not bytes.
434
- params.conv_state_batch_stride = conv_state.stride(0);
435
- params.conv_state_c_stride = conv_state.stride(1);
436
- params.conv_state_l_stride = conv_state.stride(2);
437
-
438
- if (conv_state_indices_.has_value()) {
439
- auto conv_state_indices = conv_state_indices_.value();
440
- TORCH_CHECK(conv_state_indices.scalar_type() == torch::kInt32)
441
- TORCH_CHECK(conv_state_indices.is_cuda());
442
- TORCH_CHECK(conv_state_indices.stride(0) == 1)
443
- CHECK_SHAPE(conv_state_indices, batch_size);
444
-
445
- int conv_state_entries = conv_state.size(0);
446
- CHECK_SHAPE(conv_state, conv_state_entries, dim, conv_state_len);
447
-
448
- params.conv_state_indices_ptr = conv_state_indices.data_ptr<int32_t>();
449
- } else {
450
- CHECK_SHAPE(conv_state, batch_size, dim, conv_state_len);
451
- params.conv_state_indices_ptr = nullptr;
452
- }
453
-
454
- if (cache_seqlens_.has_value()) {
455
- auto cache_seqlens = cache_seqlens_.value();
456
- TORCH_CHECK(cache_seqlens.scalar_type() == torch::kInt32);
457
- TORCH_CHECK(cache_seqlens.is_cuda());
458
- TORCH_CHECK(cache_seqlens.stride(-1) == 1);
459
- CHECK_SHAPE(cache_seqlens, batch_size);
460
- params.cache_seqlens = cache_seqlens.data_ptr<int32_t>();
461
- } else {
462
- params.cache_seqlens = nullptr;
463
- }
464
-
465
- // Otherwise the kernel will be launched from cuda:0 device
466
- #if TORCH_VERSION_MAJOR > 2 || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 6)
467
- c10::Device device = x.device();
468
- c10::DeviceGuard device_guard(device);
469
- #else
470
- at::cuda::CUDAGuard device_guard{x.device()};
471
- #endif
472
- auto stream = at::cuda::getCurrentCUDAStream().stream();
473
- DISPATCH_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_update", [&] {
474
- DISPATCH_WTYPE_FLOAT_AND_HALF_AND_BF16(weight.scalar_type(), "causal_conv1d_update", [&] {
475
- causal_conv1d_update_cuda<input_t, weight_t>(params, stream);
476
- });
477
- });
478
- }
479
-
480
- /*
481
- PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
482
- m.def("causal_conv1d_fwd", &causal_conv1d_fwd, "Causal conv1d forward");
483
- m.def("causal_conv1d_bwd", &causal_conv1d_bwd, "Causal conv1d backward");
484
- m.def("causal_conv1d_update", &causal_conv1d_update, "Causal conv1d update");
485
- }
486
- */
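The stride checks in causal_conv1d_fwd and causal_conv1d_bwd above accept two memory layouts for x. Below is a small sketch of how each is usually constructed on the PyTorch side; it illustrates the checks and is not code from this repository (the kernel itself also requires CUDA tensors).

```python
# Illustration of the two layouts accepted by the stride checks above; run on
# CPU here only to show the strides, the kernel itself requires CUDA tensors.
import torch

batch, dim, seqlen = 4, 64, 256  # channel-last additionally needs dim % 8 == 0

# Layout 1: contiguous (batch, dim, seqlen), so x.stride(-1) == 1.
x_cf = torch.randn(batch, dim, seqlen)
assert x_cf.stride(2) == 1

# Layout 2: "channel last": allocate as (batch, seqlen, dim) and transpose, so
# x.stride(1) == 1. seq_idx and initial/final states are only supported here.
x_cl = torch.randn(batch, seqlen, dim).transpose(1, 2)
assert x_cl.stride(1) == 1 and x_cl.stride(2) == dim and x_cl.stride(0) % 8 == 0
```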
causal-conv1d/causal_conv1d.h DELETED
@@ -1,81 +0,0 @@
- /******************************************************************************
-  * Copyright (c) 2024, Tri Dao.
-  ******************************************************************************/
- 
- #pragma once
- 
- ////////////////////////////////////////////////////////////////////////////////////////////////////
- 
- struct ConvParamsBase {
-     using index_t = uint32_t;
- 
-     int batch, dim, seqlen, width;
-     bool silu_activation;
- 
-     index_t x_batch_stride;
-     index_t x_c_stride;
-     index_t x_l_stride;
-     index_t weight_c_stride;
-     index_t weight_width_stride;
-     index_t out_batch_stride;
-     index_t out_c_stride;
-     index_t out_l_stride;
- 
-     int conv_state_len;
-     index_t conv_state_batch_stride;
-     index_t conv_state_c_stride;
-     index_t conv_state_l_stride;
- 
-     // Common data pointers.
-     void *__restrict__ x_ptr;
-     void *__restrict__ weight_ptr;
-     void *__restrict__ bias_ptr;
-     void *__restrict__ out_ptr;
- 
-     void *__restrict__ conv_state_ptr;
-     int32_t *__restrict__ cache_seqlens;
- 
-     // Only used if the elements of the batch are gathered from a larger buffer,
-     // which may happen for continuous batching.
-     int32_t *__restrict__ conv_state_indices_ptr;
- 
-     void *__restrict__ seq_idx_ptr;
- 
-     // No __restrict__ since initial_states could be the same as final_states.
-     void * initial_states_ptr;
-     index_t initial_states_batch_stride;
-     index_t initial_states_l_stride;
-     index_t initial_states_c_stride;
- 
-     void * final_states_ptr;
-     index_t final_states_batch_stride;
-     index_t final_states_l_stride;
-     index_t final_states_c_stride;
- };
- 
- struct ConvParamsBwd: public ConvParamsBase {
-     index_t dx_batch_stride;
-     index_t dx_c_stride;
-     index_t dx_l_stride;
-     index_t dweight_c_stride;
-     index_t dweight_width_stride;
-     index_t dout_batch_stride;
-     index_t dout_c_stride;
-     index_t dout_l_stride;
- 
-     // Common data pointers.
-     void *__restrict__ dx_ptr;
-     void *__restrict__ dweight_ptr;
-     void *__restrict__ dbias_ptr;
-     void *__restrict__ dout_ptr;
- 
-     void * dinitial_states_ptr;
-     index_t dinitial_states_batch_stride;
-     index_t dinitial_states_l_stride;
-     index_t dinitial_states_c_stride;
- 
-     void * dfinal_states_ptr;
-     index_t dfinal_states_batch_stride;
-     index_t dfinal_states_l_stride;
-     index_t dfinal_states_c_stride;
- };
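The conv_state, cache_seqlens, and conv_state_indices fields above exist for the single-token update path used during incremental decoding. Below is a simplified pure-PyTorch sketch of that rolling-state update (fixed state length of width - 1, no circular cache_seqlens indexing, no batch gathering); it illustrates the idea, not the kernel's actual logic.

```python
# Simplified sketch of the decode-time update: keep the last width-1 inputs per
# channel, shift in the new token, and apply the same depthwise causal conv.
import torch
import torch.nn.functional as F

def causal_conv1d_update_ref(x_new, conv_state, weight, bias=None, silu=False):
    # x_new: (batch, dim) single timestep; conv_state: (batch, dim, width - 1)
    window = torch.cat([conv_state, x_new.unsqueeze(-1)], dim=-1)  # (batch, dim, width)
    out = (window * weight.unsqueeze(0)).sum(-1)                   # (batch, dim)
    if bias is not None:
        out = out + bias
    new_state = window[..., 1:]  # drop the oldest timestep
    return (F.silu(out) if silu else out), new_state

state = torch.zeros(2, 8, 3)   # width = 4, so the state holds width - 1 = 3 steps
w = torch.randn(8, 4)
for _ in range(5):             # decode five tokens one at a time
    tok = torch.randn(2, 8)
    y, state = causal_conv1d_update_ref(tok, state, w, silu=True)
```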
causal-conv1d/causal_conv1d_bwd.cu DELETED
@@ -1,627 +0,0 @@
1
- /******************************************************************************
2
- * Copyright (c) 2024, Tri Dao.
3
- ******************************************************************************/
4
-
5
- #include <c10/util/BFloat16.h>
6
- #include <c10/util/Half.h>
7
- #include <c10/cuda/CUDAException.h> // For C10_CUDA_CHECK and C10_CUDA_KERNEL_LAUNCH_CHECK
8
-
9
- #ifndef USE_ROCM
10
- #include <cub/block/block_load.cuh>
11
- #include <cub/block/block_store.cuh>
12
- #include <cub/block/block_reduce.cuh>
13
- #else
14
- #include <hipcub/hipcub.hpp>
15
- namespace cub = hipcub;
16
- #endif
17
-
18
- #include "causal_conv1d.h"
19
- #include "causal_conv1d_common.h"
20
- #include "static_switch.h"
21
-
22
- template<int kNThreads_, int kWidth_, bool kSiluAct_, bool kIsVecLoad_, typename input_t_, typename weight_t_>
23
- struct Causal_conv1d_bwd_kernel_traits {
24
- using input_t = input_t_;
25
- using weight_t = weight_t_;
26
- static constexpr int kNThreads = kNThreads_;
27
- static constexpr int kWidth = kWidth_;
28
- static constexpr bool kSiluAct = kSiluAct_;
29
- static constexpr int kNBytes = sizeof(input_t);
30
- static_assert(kNBytes == 2 || kNBytes == 4);
31
- static constexpr int kNElts = kNBytes == 4 ? 4 : 8;
32
- static_assert(kWidth <= kNElts);
33
- // It's possible that we need to do 2 rounds of exchange if input_t is 16 bits
34
- // (since then we'd have 8 values of float, and each round we can exchange 4 floats).
35
- static constexpr int kNExchangeRounds = sizeof(float) / sizeof(input_t);
36
- static constexpr bool kIsVecLoad = kIsVecLoad_;
37
- using vec_t = typename BytesToType<kNBytes * kNElts>::Type;
38
- using BlockLoadT = cub::BlockLoad<input_t, kNThreads, kNElts, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
39
- using BlockLoadVecT = cub::BlockLoad<vec_t, kNThreads, 1, cub::BLOCK_LOAD_DIRECT>;
40
- using BlockStoreT = cub::BlockStore<input_t, kNThreads, kNElts, cub::BLOCK_STORE_WARP_TRANSPOSE>;
41
- using BlockStoreVecT = cub::BlockStore<vec_t, kNThreads, 1, cub::BLOCK_STORE_DIRECT>;
42
- using BlockReduceFloatT = cub::BlockReduce<float, kNThreads>;
43
- static constexpr int kSmemIOSize = kIsVecLoad
44
- ? 0
45
- : custom_max({sizeof(typename BlockLoadT::TempStorage), sizeof(typename BlockStoreT::TempStorage)});
46
- static constexpr int kSmemExchangeSize = kNThreads * kNBytes * kNElts * (!kSiluAct ? 1 : kNExchangeRounds + 1);
47
- static constexpr int kSmemSize = custom_max({kSmemExchangeSize,
48
- int(sizeof(typename BlockReduceFloatT::TempStorage))}) + (kIsVecLoad ? 0 : kSmemIOSize);
49
- };
50
-
51
- template<typename Ktraits>
52
- __global__ __launch_bounds__(Ktraits::kNThreads)
53
- void causal_conv1d_bwd_kernel(ConvParamsBwd params) {
54
- constexpr int kWidth = Ktraits::kWidth;
55
- constexpr int kNThreads = Ktraits::kNThreads;
56
- constexpr bool kSiluAct = Ktraits::kSiluAct;
57
- static constexpr int kNElts = Ktraits::kNElts;
58
- constexpr int kNExchangeRounds = Ktraits::kNExchangeRounds;
59
- static constexpr bool kIsVecLoad = Ktraits::kIsVecLoad;
60
- using input_t = typename Ktraits::input_t;
61
- using vec_t = typename Ktraits::vec_t;
62
- using weight_t = typename Ktraits::weight_t;
63
-
64
- // Shared memory.
65
- extern __shared__ char smem_[];
66
- auto& smem_load = reinterpret_cast<typename Ktraits::BlockLoadT::TempStorage&>(smem_);
67
- auto& smem_load_vec = reinterpret_cast<typename Ktraits::BlockLoadVecT::TempStorage&>(smem_);
68
- auto& smem_store = reinterpret_cast<typename Ktraits::BlockStoreT::TempStorage&>(smem_);
69
- auto& smem_store_vec = reinterpret_cast<typename Ktraits::BlockStoreVecT::TempStorage&>(smem_);
70
- vec_t *smem_exchange = reinterpret_cast<vec_t *>(smem_ + Ktraits::kSmemIOSize);
71
- vec_t *smem_exchange_x = reinterpret_cast<vec_t *>(smem_ + Ktraits::kSmemIOSize) + kNThreads * kNExchangeRounds;
72
- auto& smem_reduce_float = *reinterpret_cast<typename Ktraits::BlockReduceFloatT::TempStorage*>(smem_ + Ktraits::kSmemIOSize);
73
-
74
- const int tidx = threadIdx.x;
75
- const int batch_id = blockIdx.x;
76
- const int dim_id = blockIdx.y;
77
- input_t *x = reinterpret_cast<input_t *>(params.x_ptr) + batch_id * params.x_batch_stride
78
- + dim_id * params.x_c_stride;
79
- weight_t *weight = reinterpret_cast<weight_t *>(params.weight_ptr) + dim_id * params.weight_c_stride;
80
- input_t *dout = reinterpret_cast<input_t *>(params.dout_ptr) + batch_id * params.dout_batch_stride
81
- + dim_id * params.dout_c_stride;
82
- input_t *dx = reinterpret_cast<input_t *>(params.dx_ptr) + batch_id * params.dx_batch_stride
83
- + dim_id * params.dx_c_stride;
84
- float *dweight = reinterpret_cast<float *>(params.dweight_ptr) + dim_id * params.dweight_c_stride;
85
- float bias_val = params.bias_ptr == nullptr ? 0.f : float(reinterpret_cast<weight_t *>(params.bias_ptr)[dim_id]);
86
-
87
- // Thread kNThreads - 1 will load the first elements of the next chunk so we initialize those to 0.
88
- if (tidx == 0) {
89
- if constexpr (!kSiluAct) {
90
- input_t zeros[kNElts] = {0};
91
- smem_exchange[0] = reinterpret_cast<vec_t *>(zeros)[0];
92
- } else {
93
- float zeros[kNElts] = {0};
94
- #pragma unroll
95
- for (int r = 0; r < kNExchangeRounds; ++r) {
96
- smem_exchange[r * kNThreads] = reinterpret_cast<vec_t *>(zeros)[r];
97
- }
98
- }
99
- }
100
-
101
- float weight_vals[kWidth];
102
- #pragma unroll
103
- for (int i = 0; i < kWidth; ++i) { weight_vals[i] = weight[i * params.weight_width_stride]; }
104
-
105
- float dweight_vals[kWidth] = {0};
106
- float dbias_val = 0;
107
-
108
- constexpr int kChunkSize = kNThreads * kNElts;
109
- const int n_chunks = (params.seqlen + kChunkSize - 1) / kChunkSize;
110
- x += (n_chunks - 1) * kChunkSize;
111
- dout += (n_chunks - 1) * kChunkSize;
112
- dx += (n_chunks - 1) * kChunkSize;
113
- for (int chunk = n_chunks - 1; chunk >= 0; --chunk) {
114
- input_t x_vals_load[2 * kNElts] = {0};
115
- input_t dout_vals_load[2 * kNElts] = {0};
116
- if constexpr(kIsVecLoad) {
117
- typename Ktraits::BlockLoadVecT(smem_load_vec).Load(reinterpret_cast<vec_t*>(x), *reinterpret_cast<vec_t (*)[1]>(&x_vals_load[kNElts]), (params.seqlen - chunk * kChunkSize) / kNElts);
118
- typename Ktraits::BlockLoadVecT(smem_load_vec).Load(reinterpret_cast<vec_t*>(dout), *reinterpret_cast<vec_t (*)[1]>(&dout_vals_load[0]), (params.seqlen - chunk * kChunkSize) / kNElts);
119
- } else {
120
- __syncthreads();
121
- typename Ktraits::BlockLoadT(smem_load).Load(x, *reinterpret_cast<input_t (*)[kNElts]>(&x_vals_load[kNElts]), params.seqlen - chunk * kChunkSize);
122
- __syncthreads();
123
- typename Ktraits::BlockLoadT(smem_load).Load(dout, *reinterpret_cast<input_t (*)[kNElts]>(&dout_vals_load[0]), params.seqlen - chunk * kChunkSize);
124
- }
125
- float dout_vals[2 * kNElts], x_vals[2 * kNElts];
126
- if constexpr (!kSiluAct) {
127
- __syncthreads();
128
- // Thread 0 don't write yet, so that thread kNThreads - 1 can read
129
- // the first elements of the next chunk.
130
- if (tidx > 0) { smem_exchange[tidx] = reinterpret_cast<vec_t *>(dout_vals_load)[0]; }
131
- __syncthreads();
132
- reinterpret_cast<vec_t *>(dout_vals_load)[1] = smem_exchange[tidx < kNThreads - 1 ? tidx + 1 : 0];
133
- __syncthreads();
134
- // Now thread 0 can write the first elements of the current chunk.
135
- if (tidx == 0) { smem_exchange[tidx] = reinterpret_cast<vec_t *>(dout_vals_load)[0]; }
136
- #pragma unroll
137
- for (int i = 0; i < 2 * kNElts; ++i) {
138
- dout_vals[i] = float(dout_vals_load[i]);
139
- x_vals[i] = float(x_vals_load[i]);
140
- }
141
- } else {
142
- if (tidx == 0 && chunk > 0) {
143
- if constexpr(kIsVecLoad) {
144
- reinterpret_cast<vec_t *>(x_vals_load)[0] = reinterpret_cast<vec_t *>(x)[-1];
145
- } else {
146
- #pragma unroll
147
- for (int i = 0; i < kNElts; ++i) {
148
- if (chunk * kChunkSize + i < params.seqlen) { x_vals_load[i] = x[-kNElts + i]; }
149
- }
150
- }
151
- }
152
- __syncthreads();
153
- smem_exchange_x[tidx] = reinterpret_cast<vec_t *>(x_vals_load)[1];
154
- __syncthreads();
155
- if (tidx > 0) { reinterpret_cast<vec_t *>(x_vals_load)[0] = smem_exchange_x[tidx - 1]; }
156
- #pragma unroll
157
- for (int i = 0; i < 2 * kNElts; ++i) { x_vals[i] = float(x_vals_load[i]); }
158
- // Recompute the output
159
- #pragma unroll
160
- for (int i = 0; i < kNElts; ++i) {
161
- float out_val = bias_val;
162
- #pragma unroll
163
- for (int w = 0; w < kWidth; ++w) {
164
- out_val += weight_vals[w] * x_vals[kNElts + i - (kWidth - w - 1)];
165
- }
166
- float out_sigmoid_val = 1.0f / (1.0f + expf(-out_val));
167
- dout_vals[i] = float(dout_vals_load[i]) * out_sigmoid_val
168
- * (1.0f + out_val * (1.0f - out_sigmoid_val));
169
- }
170
- // Exchange the dout_vals. It's possible that we need to do 2 rounds of exchange
171
- // if input_t is 16 bits (since then we'd have 8 values of float)
172
- __syncthreads();
173
- // Thread 0 don't write yet, so that thread kNThreads - 1 can read
174
- // the first elements of the next chunk.
175
- if (tidx > 0) {
176
- #pragma unroll
177
- for (int r = 0; r < kNExchangeRounds; ++r) {
178
- smem_exchange[r * kNThreads + tidx] = reinterpret_cast<vec_t *>(dout_vals)[r];
179
- }
180
- }
181
- __syncthreads();
182
- #pragma unroll
183
- for (int r = 0; r < kNExchangeRounds; ++r) {
184
- reinterpret_cast<vec_t *>(dout_vals)[kNExchangeRounds + r]
185
- = smem_exchange[r * kNThreads + (tidx < kNThreads - 1 ? tidx + 1 : 0)];
186
- }
187
- __syncthreads();
188
- // Now thread 0 can write the first elements of the current chunk.
189
- if (tidx == 0) {
190
- #pragma unroll
191
- for (int r = 0; r < kNExchangeRounds; ++r) {
192
- smem_exchange[r * kNThreads + tidx] = reinterpret_cast<vec_t *>(dout_vals)[r];
193
- }
194
- }
195
- }
196
- dout -= kChunkSize;
197
- x -= kChunkSize;
198
-
199
- #pragma unroll
200
- for (int i = 0; i < kNElts; ++i) { dbias_val += dout_vals[i]; }
201
-
202
- float dx_vals[kNElts] = {0};
203
- #pragma unroll
204
- for (int i = 0; i < kNElts; ++i) {
205
- #pragma unroll
206
- for (int w = 0; w < kWidth; ++w) {
207
- dx_vals[i] += weight_vals[w] * dout_vals[i + kWidth - w - 1];
208
- }
209
- }
210
-
211
- input_t dx_vals_store[kNElts];
212
- #pragma unroll
213
- for (int i = 0; i < kNElts; ++i) { dx_vals_store[i] = dx_vals[i]; }
214
- if constexpr(kIsVecLoad) {
215
- typename Ktraits::BlockStoreVecT(smem_store_vec).Store(reinterpret_cast<vec_t*>(dx), reinterpret_cast<vec_t (&)[1]>(dx_vals_store), (params.seqlen - chunk * kChunkSize) / kNElts);
216
- } else {
217
- typename Ktraits::BlockStoreT(smem_store).Store(dx, dx_vals_store, params.seqlen - chunk * kChunkSize);
218
- }
219
- dx -= kChunkSize;
220
-
221
- #pragma unroll
222
- for (int w = 0; w < kWidth; ++w) {
223
- #pragma unroll
224
- for (int i = 0; i < kNElts; ++i) {
225
- dweight_vals[w] += x_vals[kNElts + i] * dout_vals[i + kWidth - w - 1];
226
- }
227
- }
228
- }
229
-
230
- #pragma unroll
231
- for (int w = 0; w < kWidth; ++w) {
232
- __syncthreads();
233
- dweight_vals[w] = typename Ktraits::BlockReduceFloatT(smem_reduce_float).Sum(dweight_vals[w]);
234
- if (tidx == 0) {
235
- atomicAdd(&reinterpret_cast<float *>(dweight)[w * params.dweight_width_stride], dweight_vals[w]);
236
- }
237
- }
238
- if (params.bias_ptr != nullptr) {
239
- __syncthreads();
240
- dbias_val = typename Ktraits::BlockReduceFloatT(smem_reduce_float).Sum(dbias_val);
241
- if (tidx == 0) {
242
- atomicAdd(&reinterpret_cast<float *>(params.dbias_ptr)[dim_id], dbias_val);
243
- }
244
- }
245
- }
246
-
247
- template<int kNThreads, int kWidth, typename input_t, typename weight_t>
248
- void causal_conv1d_bwd_launch(ConvParamsBwd &params, cudaStream_t stream) {
249
- static constexpr int kNElts = sizeof(input_t) == 4 ? 4 : 8;
250
- BOOL_SWITCH(params.seqlen % kNElts == 0, kIsVecLoad, [&] {
251
- BOOL_SWITCH(params.silu_activation, kSiluAct, [&] {
252
- using Ktraits = Causal_conv1d_bwd_kernel_traits<kNThreads, kWidth, kSiluAct, kIsVecLoad, input_t, weight_t>;
253
- constexpr int kSmemSize = Ktraits::kSmemSize;
254
- dim3 grid(params.batch, params.dim);
255
- auto kernel = &causal_conv1d_bwd_kernel<Ktraits>;
256
-
257
- if (kSmemSize >= 48 * 1024) {
258
- #ifndef USE_ROCM
259
- C10_CUDA_CHECK(cudaFuncSetAttribute(
260
- kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
261
- #else
262
- // There is a slight signature discrepancy in HIP and CUDA "FuncSetAttribute" function.
263
- C10_CUDA_CHECK(cudaFuncSetAttribute(
264
- (void *) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
265
- std::cerr << "Warning (causal_conv1d bwd launch): attempting to set maxDynamicSharedMemorySize on an AMD GPU which is currently a non-op (in ROCm versions <= 6.1). This might lead to undefined behavior. \n" << std::endl;
266
- #endif
267
- }
268
-
269
-
270
- kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params);
271
- C10_CUDA_KERNEL_LAUNCH_CHECK();
272
- });
273
- });
274
- }
275
-
276
- template<typename input_t, typename weight_t>
277
- void causal_conv1d_bwd_cuda(ConvParamsBwd &params, cudaStream_t stream) {
278
- if (params.width == 2) {
279
- causal_conv1d_bwd_launch<128, 2, input_t, weight_t>(params, stream);
280
- } else if (params.width == 3) {
281
- causal_conv1d_bwd_launch<128, 3, input_t, weight_t>(params, stream);
282
- } else if (params.width == 4) {
283
- causal_conv1d_bwd_launch<128, 4, input_t, weight_t>(params, stream);
284
- }
285
- }
286
-
287
- template<int kNThreads_, int kWidth_, int kChunkSizeL_, bool kSiluAct_, bool kIsVecLoad_, typename input_t_, typename weight_t_>
288
- struct Causal_conv1d_channellast_bwd_kernel_traits {
289
- // The cache line is 128 bytes, and we try to read 16 bytes per thread.
290
- // So we have 8 threads per "row", so 32 or 64 elements in the channel dimension.
291
- // That leaves 4 columns per warp, and so 16 columns per block (assuming each block has 128
292
- // threads). Each each load is 16 x 32|64 elements in the L x C dimensions.
293
- using input_t = input_t_;
294
- using weight_t = weight_t_;
295
- static constexpr bool kSiluAct = kSiluAct_;
296
- static constexpr int kNThreads = kNThreads_;
297
- static_assert(kNThreads % 32 == 0);
298
- static constexpr int kNWarps = kNThreads / 32;
299
- static constexpr int kWidth = kWidth_;
300
- static constexpr int kChunkSizeL = kChunkSizeL_;
301
- static constexpr int kNBytes = sizeof(input_t);
302
- static_assert(kNBytes == 2 || kNBytes == 4);
303
- static constexpr int kNElts = kNBytes == 4 ? 4 : 8;
304
- static constexpr int kNEltsPerRow = 128 / kNBytes;
305
- static constexpr int kNThreadsPerRow = kNEltsPerRow / kNElts; // Always 8 for now
306
- static_assert(kNThreadsPerRow * kNBytes * kNElts == 128);
307
- static constexpr int kNColsPerWarp = 32 / kNThreadsPerRow; // Always 4 for now
308
- static_assert(kNColsPerWarp * kNThreadsPerRow == 32);
309
- static constexpr int kNColsPerLoad = kNColsPerWarp * kNWarps;
310
- static constexpr int kNLoads = kChunkSizeL / kNColsPerLoad;
311
- static_assert(kNLoads * kNColsPerLoad == kChunkSizeL);
312
- static constexpr bool kIsVecLoad = kIsVecLoad_;
313
- using vec_t = typename BytesToType<kNBytes * kNElts>::Type;
314
- // using BlockLoadT = cub::BlockLoad<input_t, kNThreads, kNItems, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
315
- // using BlockStoreT = cub::BlockStore<input_t, kNThreads, kNItems, cub::BLOCK_STORE_WARP_TRANSPOSE>;
316
- // static constexpr int kSmemSize = std::max({sizeof(typename BlockLoadT::TempStorage),
317
- // sizeof(typename BlockStoreT::TempStorage)});
318
- // static constexpr int kSmemSize = kChunkSizeL * kNEltsPerRow * kNBytes;
319
- };
320
-
321
- template<typename Ktraits, bool kHasSeqIdx, bool kHasDfinalStates>
322
- __global__ __launch_bounds__(Ktraits::kNThreads)
323
- void causal_conv1d_channellast_bwd_kernel(ConvParamsBwd params) {
324
- constexpr int kWidth = Ktraits::kWidth;
325
- constexpr int kNThreads = Ktraits::kNThreads;
326
- constexpr bool kSiluAct = Ktraits::kSiluAct;
327
- constexpr int kNElts = Ktraits::kNElts;
328
- constexpr int kNWarp = Ktraits::kNWarps;
329
- constexpr int kNThreadsPerC = Ktraits::kNThreadsPerRow;
330
- constexpr int kLPerLoad = Ktraits::kNColsPerLoad;
331
- constexpr int kChunkSizeL = Ktraits::kChunkSizeL;
332
- constexpr int kChunkSizeC = Ktraits::kNEltsPerRow;
333
- using input_t = typename Ktraits::input_t;
334
- using vec_t = typename Ktraits::vec_t;
335
- using weight_t = typename Ktraits::weight_t;
336
-
337
- // Shared memory.
338
- __shared__ input_t dout_smem[kChunkSizeL + kWidth - 1][kChunkSizeC + kNElts];
339
- __shared__ input_t x_smem[kWidth - 1 + kChunkSizeL + kWidth - 1][kChunkSizeC + kNElts];
340
-
341
- const int batch_id = blockIdx.x;
342
- const int chunk_l_id = blockIdx.y;
343
- const int chunk_c_id = blockIdx.z;
344
- const int tid = threadIdx.x;
345
- const int l_idx = tid / kNThreadsPerC;
346
- const int c_idx = tid % kNThreadsPerC;
347
- input_t *x = reinterpret_cast<input_t *>(params.x_ptr) + batch_id * params.x_batch_stride
348
- + (chunk_l_id * kChunkSizeL + l_idx) * params.x_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts;
349
- weight_t *weight = reinterpret_cast<weight_t *>(params.weight_ptr)
350
- + chunk_c_id * kChunkSizeC * params.weight_c_stride;
351
- input_t *dout = reinterpret_cast<input_t *>(params.dout_ptr) + batch_id * params.dout_batch_stride
352
- + (chunk_l_id * kChunkSizeL + l_idx) * params.dout_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts;
353
- input_t *dx = reinterpret_cast<input_t *>(params.dx_ptr) + batch_id * params.dx_batch_stride
354
- + (chunk_l_id * kChunkSizeL + l_idx) * params.dx_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts;
355
- float *dweight = reinterpret_cast<float *>(params.dweight_ptr)
356
- + chunk_c_id * kChunkSizeC * params.dweight_c_stride;
357
- int *seq_idx = !kHasSeqIdx ? nullptr : reinterpret_cast<int *>(params.seq_idx_ptr)
358
- + batch_id * params.seqlen + chunk_l_id * kChunkSizeL;
359
- input_t *initial_states = params.initial_states_ptr == nullptr || chunk_l_id > 0 ? nullptr
360
- : reinterpret_cast<input_t *>(params.initial_states_ptr) + batch_id * params.initial_states_batch_stride + l_idx * params.initial_states_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts;
361
- input_t *dinitial_states = params.dinitial_states_ptr == nullptr || chunk_l_id > 0 ? nullptr
362
- : reinterpret_cast<input_t *>(params.dinitial_states_ptr) + batch_id * params.dinitial_states_batch_stride + l_idx * params.dinitial_states_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts;
363
- input_t *dfinal_states = params.dfinal_states_ptr == nullptr ? nullptr
364
- : reinterpret_cast<input_t *>(params.dfinal_states_ptr) + batch_id * params.dfinal_states_batch_stride + chunk_c_id * kChunkSizeC;
365
-
366
- #pragma unroll
367
- for (int l = 0; l < Ktraits::kNLoads; ++l) {
368
- input_t dout_vals_load[kNElts] = {0};
369
- input_t x_vals_load[kNElts] = {0};
370
- if (chunk_l_id * kChunkSizeL + l * kLPerLoad + l_idx < params.seqlen
371
- && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
372
- reinterpret_cast<vec_t *>(dout_vals_load)[0] = *reinterpret_cast<vec_t *>(dout + l * kLPerLoad * params.dout_l_stride);
373
- reinterpret_cast<vec_t *>(x_vals_load)[0] = *reinterpret_cast<vec_t *>(x + l * kLPerLoad * params.x_l_stride);
374
- }
375
- reinterpret_cast<vec_t *>(dout_smem[l * kLPerLoad + l_idx])[c_idx] = reinterpret_cast<vec_t *>(dout_vals_load)[0];
376
- reinterpret_cast<vec_t *>(x_smem[kWidth - 1 + l * kLPerLoad + l_idx])[c_idx] = reinterpret_cast<vec_t *>(x_vals_load)[0];
377
- }
378
- // Load the elements from the previous chunk or next chunk that are needed for convolution.
379
- if (l_idx < kWidth - 1) {
380
- input_t dout_vals_load[kNElts] = {0};
381
- input_t x_vals_load[kNElts] = {0};
382
- if ((chunk_l_id + 1) * kChunkSizeL + l_idx < params.seqlen
383
- && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
384
- reinterpret_cast<vec_t *>(dout_vals_load)[0] = *reinterpret_cast<vec_t *>(dout + kChunkSizeL * params.dout_l_stride);
385
- }
386
- if (chunk_l_id * kChunkSizeL + l_idx - (kWidth - 1) >= 0
387
- && chunk_l_id * kChunkSizeL + l_idx - (kWidth - 1) < params.seqlen
388
- && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
389
- reinterpret_cast<vec_t *>(x_vals_load)[0] = *reinterpret_cast<vec_t *>(x - (kWidth - 1) * params.x_l_stride);
390
- } else if (initial_states != nullptr
391
- && chunk_l_id * kChunkSizeL + l_idx - (kWidth - 1) < 0
392
- && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
393
- reinterpret_cast<vec_t *>(x_vals_load)[0] = *reinterpret_cast<vec_t *>(initial_states);
394
- }
395
- reinterpret_cast<vec_t *>(dout_smem[kChunkSizeL + l_idx])[c_idx] = reinterpret_cast<vec_t *>(dout_vals_load)[0];
396
- reinterpret_cast<vec_t *>(x_smem[l_idx])[c_idx] = reinterpret_cast<vec_t *>(x_vals_load)[0];
397
- }
398
- // Need to load (kWdith - 1) extra x's on the right to recompute the (kChunkSizeL + kWidth - 1) outputs
399
- if constexpr (kSiluAct) {
400
- if (l_idx < kWidth - 1) {
401
- input_t x_vals_load[kNElts] = {0};
402
- if ((chunk_l_id + 1) * kChunkSizeL + l_idx < params.seqlen
403
- && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
404
- reinterpret_cast<vec_t *>(x_vals_load)[0] = *reinterpret_cast<vec_t *>(x + kChunkSizeL * params.x_l_stride);
405
- }
406
- reinterpret_cast<vec_t *>(x_smem[kWidth - 1 + kChunkSizeL + l_idx])[c_idx] = reinterpret_cast<vec_t *>(x_vals_load)[0];
407
- }
408
- }
409
-
410
- __syncthreads();
411
-
412
- constexpr int kLPerThread = constexpr_min(kChunkSizeL * kChunkSizeC / kNThreads, kChunkSizeL);
413
- static_assert(kLPerThread * kNThreads == kChunkSizeL * kChunkSizeC);
414
- constexpr int kNThreadsPerRow = kChunkSizeL / kLPerThread;
415
- static_assert(kNThreadsPerRow * kLPerThread == kChunkSizeL);
416
- // kChunkSizeL, kLPerThread, kNThreadsPerRow should be powers of 2 for simplicity
417
- static_assert((kChunkSizeL & (kChunkSizeL - 1)) == 0);
418
- static_assert((kLPerThread & (kLPerThread - 1)) == 0);
419
- static_assert((kNThreadsPerRow & (kNThreadsPerRow - 1)) == 0);
420
- static_assert(kNThreadsPerRow <= 32);
421
-
422
- const int row_idx = tid / kNThreadsPerRow;
423
- const int col_idx = tid % kNThreadsPerRow;
424
-
425
- float bias_val = params.bias_ptr == nullptr || chunk_c_id * kChunkSizeC + row_idx >= params.dim ? 0.f : float(reinterpret_cast<weight_t *>(params.bias_ptr)[chunk_c_id * kChunkSizeC + row_idx]);
426
- float weight_vals[kWidth] = {0};
427
- if (chunk_c_id * kChunkSizeC + row_idx < params.dim) {
428
- #pragma unroll
429
- for (int w = 0; w < kWidth; ++w) {
430
- weight_vals[w] = weight[row_idx * params.weight_c_stride + w * params.weight_width_stride];
431
- }
432
- }
433
- float dout_vals[kLPerThread + kWidth - 1];
434
- float x_vals[kWidth - 1 + kLPerThread + kWidth - 1];
435
- #pragma unroll
436
- for (int i = 0; i < kWidth - 1 + kLPerThread; ++i) {
437
- dout_vals[i] = float(dout_smem[col_idx * kLPerThread + i][row_idx]);
438
- x_vals[i] = float(x_smem[col_idx * kLPerThread + i][row_idx]);
439
- }
440
-
441
- int seq_idx_thread[kWidth - 1 + kLPerThread + kWidth - 1];
442
- if constexpr (kHasSeqIdx) {
443
- #pragma unroll
444
- for (int i = 0; i < kWidth - 1 + kLPerThread + kWidth - 1; ++i) {
445
- const int l_idx = chunk_l_id * kChunkSizeL + col_idx * kLPerThread + i - (kWidth - 1);
446
- seq_idx_thread[i] = l_idx >= 0 && l_idx < params.seqlen ? seq_idx[col_idx * kLPerThread + i - (kWidth - 1)] : -1;
447
- }
448
- }
449
-
450
- if constexpr (kSiluAct) { // Recompute the output
451
- #pragma unroll
452
- for (int i = kWidth - 1 + kLPerThread; i < kWidth - 1 + kLPerThread + kWidth - 1; ++i) {
453
- x_vals[i] = float(x_smem[col_idx * kLPerThread + i][row_idx]);
454
- }
455
- #pragma unroll
456
- for (int i = 0; i < kLPerThread + kWidth - 1; ++i) {
457
- float out_val = bias_val;
458
- const int seq_idx_cur = !kHasSeqIdx ? 0 : seq_idx_thread[i + kWidth - 1];
459
- #pragma unroll
460
- for (int w = 0; w < kWidth; ++w) {
461
- if constexpr (!kHasSeqIdx) {
462
- out_val += weight_vals[w] * x_vals[i + w];
463
- } else {
464
- out_val += seq_idx_thread[i + w] == seq_idx_cur ? weight_vals[w] * x_vals[i + w] : 0.f;
465
- }
466
- }
467
- float out_val_sigmoid = 1.f / (1.f + expf(-out_val));
468
- dout_vals[i] *= out_val_sigmoid * (1 + out_val * (1 - out_val_sigmoid));
469
- }
470
- }
471
-
472
- float dweight_vals[kWidth] = {0};
473
- SumOp<float> sum_op;
474
- #pragma unroll
475
- for (int w = 0; w < kWidth; ++w) {
476
- #pragma unroll
477
- for (int i = 0; i < kLPerThread; ++i) {
478
- if constexpr (!kHasSeqIdx) {
479
- dweight_vals[w] += x_vals[i + w] * dout_vals[i];
480
- } else {
481
- dweight_vals[w] += seq_idx_thread[i + w] == seq_idx_thread[kWidth - 1 + i] ? x_vals[i + w] * dout_vals[i] : 0.f;
482
- }
483
- }
484
- dweight_vals[w] = Allreduce<kNThreadsPerRow>::run(dweight_vals[w], sum_op);
485
- if (col_idx == 0 && chunk_c_id * kChunkSizeC + row_idx < params.dim) {
486
- atomicAdd(&reinterpret_cast<float *>(dweight)[row_idx * params.dweight_c_stride + w * params.dweight_width_stride], dweight_vals[w]);
487
- }
488
- }
489
-
490
- if (params.bias_ptr != nullptr) {
491
- float dbias_val = 0.f;
492
- for (int i = 0; i < kLPerThread; ++i) { dbias_val += dout_vals[i]; }
493
- dbias_val = Allreduce<kNThreadsPerRow>::run(dbias_val, sum_op);
494
- if (col_idx == 0 && chunk_c_id * kChunkSizeC + row_idx < params.dim) {
495
- atomicAdd(&reinterpret_cast<float *>(params.dbias_ptr)[chunk_c_id * kChunkSizeC + row_idx], dbias_val);
496
- }
497
- }
498
-
499
- float dx_vals[kLPerThread] = {0};
500
- #pragma unroll
501
- for (int i = 0; i < kLPerThread; ++i) {
502
- const int seq_idx_cur = !kHasSeqIdx ? 0 : seq_idx_thread[i + kWidth - 1];
503
- #pragma unroll
504
- for (int w = 0; w < kWidth; ++w) {
505
- if constexpr (!kHasSeqIdx) {
506
- dx_vals[i] += weight_vals[kWidth - 1 - w] * dout_vals[i + w];
507
- } else {
508
- dx_vals[i] += seq_idx_thread[kWidth - 1 + i + w] == seq_idx_cur ? weight_vals[kWidth - 1 - w] * dout_vals[i + w] : 0.f;
509
- }
510
- }
511
- // if (dfinal_states != nullptr) {
512
- if constexpr (kHasDfinalStates) {
513
- if (chunk_l_id * kChunkSizeL + col_idx * kLPerThread + i >= params.seqlen - kWidth + 1
514
- && chunk_l_id * kChunkSizeL + col_idx * kLPerThread + i < params.seqlen
515
- && chunk_c_id * kChunkSizeC + row_idx < params.dim) {
516
- dx_vals[i] += float(dfinal_states[((chunk_l_id * kChunkSizeL + col_idx * kLPerThread + i) - (params.seqlen - kWidth + 1)) * params.dfinal_states_l_stride + row_idx * params.dfinal_states_c_stride]);
517
- }
518
- }
519
- }
520
-
521
- float dxinit_vals[kWidth - 1] = {0};
522
- static_assert(kLPerThread >= kWidth - 1); // So only threads with col_idx == 0 need to handle dinitial_states
523
- if (dinitial_states != nullptr && col_idx == 0) {
524
- #pragma unroll
525
- for (int i = 0; i < kWidth - 1; ++i) {
526
- #pragma unroll
527
- for (int w = 0; w < kWidth; ++w) {
528
- dxinit_vals[i] += i + w - (kWidth - 1) >= 0 ? weight_vals[kWidth - 1 - w] * dout_vals[i + w - (kWidth - 1)] : 0.f;
529
- }
530
- // chunk_l_id must be 0 because dinitial_states != nullptr
531
- // if (dfinal_states != nullptr) {
532
- if constexpr (kHasDfinalStates) {
533
- if (i >= params.seqlen) {
534
- dxinit_vals[i] += float(dfinal_states[(i - params.seqlen) * params.dfinal_states_l_stride + row_idx * params.dfinal_states_c_stride]);
535
- }
536
- }
537
- }
538
- }
539
-
540
- __syncthreads();
541
- #pragma unroll
542
- for (int i = 0; i < kLPerThread; ++i) { x_smem[kWidth - 1 + col_idx * kLPerThread + i][row_idx] = dx_vals[i]; }
543
- if (dinitial_states != nullptr && col_idx == 0) {
544
- #pragma unroll
545
- for (int i = 0; i < kWidth - 1; ++i) { x_smem[i][row_idx] = dxinit_vals[i]; }
546
- }
547
- __syncthreads();
548
-
549
- #pragma unroll
550
- for (int l = 0; l < Ktraits::kNLoads; ++l) {
551
- input_t dx_vals_store[kNElts];
552
- reinterpret_cast<vec_t *>(dx_vals_store)[0] = reinterpret_cast<vec_t *>(x_smem[kWidth - 1 + l * kLPerLoad + l_idx])[c_idx];
553
- if (chunk_l_id * kChunkSizeL + l * kLPerLoad + l_idx < params.seqlen
554
- && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
555
- *reinterpret_cast<vec_t *>(dx + l * kLPerLoad * params.dx_l_stride) = reinterpret_cast<vec_t *>(dx_vals_store)[0];
556
- }
557
- }
558
- if (dinitial_states != nullptr
559
- && l_idx < kWidth - 1
560
- && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
561
- input_t dxinit_vals_store[kNElts];
562
- reinterpret_cast<vec_t *>(dxinit_vals_store)[0] = reinterpret_cast<vec_t *>(x_smem[l_idx])[c_idx];
563
- *reinterpret_cast<vec_t *>(dinitial_states) = reinterpret_cast<vec_t *>(dxinit_vals_store)[0];
564
- }
565
-
566
- }
567
-
568
- template<int kNThreads, int kWidth, typename input_t, typename weight_t>
569
- void causal_conv1d_channellast_bwd_launch(ConvParamsBwd &params, cudaStream_t stream) {
570
- BOOL_SWITCH(params.silu_activation, kSiluAct, [&] {
571
- BOOL_SWITCH(params.seq_idx_ptr != nullptr, kHasSeqIdx, [&] {
572
- BOOL_SWITCH(params.dfinal_states_ptr != nullptr, kHasDfinalStates, [&] {
573
- BOOL_SWITCH(params.seqlen <= 128, kChunkSizeL64, [&] {
574
- // kChunkSizeL = 128 is slightly faster than 64 when seqlen is larger
575
- static constexpr int kChunk = kChunkSizeL64 ? 64 : 128;
576
- using Ktraits = Causal_conv1d_channellast_bwd_kernel_traits<kNThreads, kWidth, kChunk, kSiluAct, true, input_t, weight_t>;
577
- // constexpr int kSmemSize = Ktraits::kSmemSize;
578
- constexpr int kChunkSizeL = Ktraits::kChunkSizeL;
579
- constexpr int kChunkSizeC = Ktraits::kNEltsPerRow;
580
- const int n_chunks_L = (params.seqlen + kChunkSizeL - 1) / kChunkSizeL;
581
- const int n_chunks_C = (params.dim + kChunkSizeC - 1) / kChunkSizeC;
582
- dim3 grid(params.batch, n_chunks_L, n_chunks_C);
583
- dim3 block(Ktraits::kNThreads);
584
- auto kernel = &causal_conv1d_channellast_bwd_kernel<Ktraits, kHasSeqIdx, kHasDfinalStates>;
585
- // if (kSmemSize >= 48 * 1024) {
586
- // C10_CUDA_CHECK(cudaFuncSetAttribute(
587
- // kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
588
- // }
589
- // kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params);
590
- kernel<<<grid, Ktraits::kNThreads, 0, stream>>>(params);
591
- C10_CUDA_KERNEL_LAUNCH_CHECK();
592
- });
593
- });
594
- });
595
- });
596
- }
597
-
598
- template<typename input_t, typename weight_t>
599
- void causal_conv1d_channellast_bwd_cuda(ConvParamsBwd &params, cudaStream_t stream) {
600
- if (params.width == 2) {
601
- causal_conv1d_channellast_bwd_launch<128, 2, input_t, weight_t>(params, stream);
602
- } else if (params.width == 3) {
603
- causal_conv1d_channellast_bwd_launch<128, 3, input_t, weight_t>(params, stream);
604
- } else if (params.width == 4) {
605
- causal_conv1d_channellast_bwd_launch<128, 4, input_t, weight_t>(params, stream);
606
- }
607
- }
608
-
609
- template void causal_conv1d_bwd_cuda<float, float>(ConvParamsBwd &params, cudaStream_t stream);
610
- template void causal_conv1d_bwd_cuda<at::Half, float>(ConvParamsBwd &params, cudaStream_t stream);
611
- template void causal_conv1d_bwd_cuda<at::BFloat16, float>(ConvParamsBwd &params, cudaStream_t stream);
612
- template void causal_conv1d_bwd_cuda<float, at::Half>(ConvParamsBwd &params, cudaStream_t stream);
613
- template void causal_conv1d_bwd_cuda<at::Half, at::Half>(ConvParamsBwd &params, cudaStream_t stream);
614
- template void causal_conv1d_bwd_cuda<at::BFloat16, at::Half>(ConvParamsBwd &params, cudaStream_t stream);
615
- template void causal_conv1d_bwd_cuda<float, at::BFloat16>(ConvParamsBwd &params, cudaStream_t stream);
616
- template void causal_conv1d_bwd_cuda<at::Half, at::BFloat16>(ConvParamsBwd &params, cudaStream_t stream);
617
- template void causal_conv1d_bwd_cuda<at::BFloat16, at::BFloat16>(ConvParamsBwd &params, cudaStream_t stream);
618
-
619
- template void causal_conv1d_channellast_bwd_cuda<float, float>(ConvParamsBwd &params, cudaStream_t stream);
620
- template void causal_conv1d_channellast_bwd_cuda<at::Half, float>(ConvParamsBwd &params, cudaStream_t stream);
621
- template void causal_conv1d_channellast_bwd_cuda<at::BFloat16, float>(ConvParamsBwd &params, cudaStream_t stream);
622
- template void causal_conv1d_channellast_bwd_cuda<float, at::Half>(ConvParamsBwd &params, cudaStream_t stream);
623
- template void causal_conv1d_channellast_bwd_cuda<at::Half, at::Half>(ConvParamsBwd &params, cudaStream_t stream);
624
- template void causal_conv1d_channellast_bwd_cuda<at::BFloat16, at::Half>(ConvParamsBwd &params, cudaStream_t stream);
625
- template void causal_conv1d_channellast_bwd_cuda<float, at::BFloat16>(ConvParamsBwd &params, cudaStream_t stream);
626
- template void causal_conv1d_channellast_bwd_cuda<at::Half, at::BFloat16>(ConvParamsBwd &params, cudaStream_t stream);
627
- template void causal_conv1d_channellast_bwd_cuda<at::BFloat16, at::BFloat16>(ConvParamsBwd &params, cudaStream_t stream);
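For orientation, the input gradient these backward kernels accumulate follows from the forward formula out[l] = bias + sum_w weight[w] * x[l - (width - 1) + w]: each dx[l] gathers the output gradients that consumed x[l]. A minimal host-side sketch of that math alone, assuming a single channel and plain float buffers — the function name is illustrative, and the SiLU and initial/final-state paths handled by the kernels above are omitted:

```cuda
// Illustrative single-channel reference of the input gradient (no SiLU, no states).
// dx[l] = sum_w weight[w] * dout[l + (width - 1) - w], clipped to valid output positions.
void causal_conv1d_bwd_dx_ref_1ch(const float *dout, const float *weight,
                                  float *dx, int seqlen, int width) {
    for (int l = 0; l < seqlen; ++l) {
        float acc = 0.f;
        for (int w = 0; w < width; ++w) {
            int o = l + (width - 1) - w;  // output position that used x[l] with weight[w]
            if (o < seqlen) { acc += weight[w] * dout[o]; }
        }
        dx[l] = acc;
    }
}
```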
 
causal-conv1d/causal_conv1d_common.h DELETED
@@ -1,98 +0,0 @@
1
- /******************************************************************************
2
- * Copyright (c) 2023, Tri Dao.
3
- ******************************************************************************/
4
-
5
- #pragma once
6
-
7
- #ifndef USE_ROCM
8
- #include <cuda_bf16.h>
9
-
10
- template<typename T>
11
- __device__ inline T shuffle_xor(T val, int offset) {
12
- return __shfl_xor_sync(uint32_t(-1), val, offset);
13
- }
14
-
15
- constexpr size_t custom_max(std::initializer_list<size_t> ilist)
16
- {
17
- return std::max(ilist);
18
- }
19
-
20
- template<typename T>
21
- constexpr T constexpr_min(T a, T b) {
22
- return std::min(a, b);
23
- }
24
-
25
- #else
26
- #include <hip/hip_bf16.h>
27
-
28
- template<typename T>
29
- __device__ inline T shuffle_xor(T val, int offset) {
30
- return __shfl_xor(val, offset);
31
- }
32
- constexpr size_t custom_max(std::initializer_list<size_t> ilist)
33
- {
34
- return *std::max_element(ilist.begin(), ilist.end());
35
- }
36
-
37
- template<typename T>
38
- constexpr T constexpr_min(T a, T b) {
39
- return a < b ? a : b;
40
- }
41
- #endif
42
- #include <cuda_fp16.h>
43
-
44
- ////////////////////////////////////////////////////////////////////////////////////////////////////
45
-
46
- template<int BYTES> struct BytesToType {};
47
-
48
- template<> struct BytesToType<16> {
49
- using Type = uint4;
50
- static_assert(sizeof(Type) == 16);
51
- };
52
-
53
- template<> struct BytesToType<8> {
54
- using Type = uint64_t;
55
- static_assert(sizeof(Type) == 8);
56
- };
57
-
58
- template<> struct BytesToType<4> {
59
- using Type = uint32_t;
60
- static_assert(sizeof(Type) == 4);
61
- };
62
-
63
- template<> struct BytesToType<2> {
64
- using Type = uint16_t;
65
- static_assert(sizeof(Type) == 2);
66
- };
67
-
68
- template<> struct BytesToType<1> {
69
- using Type = uint8_t;
70
- static_assert(sizeof(Type) == 1);
71
- };
72
-
73
- ////////////////////////////////////////////////////////////////////////////////////////////////////
74
-
75
- template<typename T>
76
- struct SumOp {
77
- __device__ inline T operator()(T const & x, T const & y) { return x + y; }
78
- };
79
-
80
- template<int THREADS>
81
- struct Allreduce {
82
- static_assert(THREADS == 32 || THREADS == 16 || THREADS == 8 || THREADS == 4);
83
- template<typename T, typename Operator>
84
- static __device__ inline T run(T x, Operator &op) {
85
- constexpr int OFFSET = THREADS / 2;
86
- x = op(x, shuffle_xor(x, OFFSET));
87
- return Allreduce<OFFSET>::run(x, op);
88
- }
89
- };
90
-
91
- template<>
92
- struct Allreduce<2> {
93
- template<typename T, typename Operator>
94
- static __device__ inline T run(T x, Operator &op) {
95
- x = op(x, shuffle_xor(x, 1));
96
- return x;
97
- }
98
- };
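The helpers in this header — shuffle_xor, SumOp, and Allreduce — implement a butterfly warp reduction. A minimal sketch of how they compose, assuming the header is available as causal_conv1d_common.h; the kernel name and launch shape are illustrative:

```cuda
// Illustrative warp-sum kernel built on the helpers above: each of the 32 lanes
// contributes one float, and the butterfly exchange leaves the total in every lane.
#include "causal_conv1d_common.h"

__global__ void warp_sum_demo(const float *in, float *out) {
    float val = in[threadIdx.x];
    SumOp<float> op;
    val = Allreduce<32>::run(val, op);   // XOR shuffles with offsets 16, 8, 4, 2, 1
    if (threadIdx.x == 0) { *out = val; }
}

// Launch with exactly one warp so every lane participates in the shuffles:
// warp_sum_demo<<<1, 32>>>(d_in, d_out);
```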
 
causal-conv1d/causal_conv1d_fwd.cu DELETED
@@ -1,399 +0,0 @@
1
- /******************************************************************************
2
- * Copyright (c) 2024, Tri Dao.
3
- ******************************************************************************/
4
-
5
- #include <c10/util/BFloat16.h>
6
- #include <c10/util/Half.h>
7
- #include <c10/cuda/CUDAException.h> // For C10_CUDA_CHECK and C10_CUDA_KERNEL_LAUNCH_CHECK
8
-
9
- #ifndef USE_ROCM
10
- #include <cub/block/block_load.cuh>
11
- #include <cub/block/block_store.cuh>
12
- #else
13
- #include <hipcub/hipcub.hpp>
14
- namespace cub = hipcub;
15
- #endif
16
-
17
- #include "causal_conv1d.h"
18
- #include "causal_conv1d_common.h"
19
- #include "static_switch.h"
20
-
21
- template<int kNThreads_, int kWidth_, bool kIsVecLoad_, typename input_t_, typename weight_t_>
22
- struct Causal_conv1d_fwd_kernel_traits {
23
- using input_t = input_t_;
24
- using weight_t = weight_t_;
25
- static constexpr int kNThreads = kNThreads_;
26
- static constexpr int kWidth = kWidth_;
27
- static constexpr int kNBytes = sizeof(input_t);
28
- static_assert(kNBytes == 2 || kNBytes == 4);
29
- static constexpr int kNElts = kNBytes == 4 ? 4 : 8;
30
- static_assert(kWidth <= kNElts);
31
- static constexpr bool kIsVecLoad = kIsVecLoad_;
32
- using vec_t = typename BytesToType<kNBytes * kNElts>::Type;
33
- using BlockLoadT = cub::BlockLoad<input_t, kNThreads, kNElts, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
34
- using BlockLoadVecT = cub::BlockLoad<vec_t, kNThreads, 1, cub::BLOCK_LOAD_DIRECT>;
35
- using BlockStoreT = cub::BlockStore<input_t, kNThreads, kNElts, cub::BLOCK_STORE_WARP_TRANSPOSE>;
36
- using BlockStoreVecT = cub::BlockStore<vec_t, kNThreads, 1, cub::BLOCK_STORE_DIRECT>;
37
- static constexpr int kSmemIOSize = kIsVecLoad
38
- ? 0
39
- : custom_max({sizeof(typename BlockLoadT::TempStorage), sizeof(typename BlockStoreT::TempStorage)});
40
- static constexpr int kSmemExchangeSize = kNThreads * kNBytes * kNElts;
41
- static constexpr int kSmemSize = kSmemIOSize + kSmemExchangeSize;
42
- };
43
-
44
- template<typename Ktraits>
45
- __global__ __launch_bounds__(Ktraits::kNThreads)
46
- void causal_conv1d_fwd_kernel(ConvParamsBase params) {
47
- constexpr int kWidth = Ktraits::kWidth;
48
- constexpr int kNThreads = Ktraits::kNThreads;
49
- constexpr int kNElts = Ktraits::kNElts;
50
- static constexpr bool kIsVecLoad = Ktraits::kIsVecLoad;
51
- using input_t = typename Ktraits::input_t;
52
- using vec_t = typename Ktraits::vec_t;
53
- using weight_t = typename Ktraits::weight_t;
54
-
55
- // Shared memory.
56
- extern __shared__ char smem_[];
57
- auto& smem_load = reinterpret_cast<typename Ktraits::BlockLoadT::TempStorage&>(smem_);
58
- auto& smem_load_vec = reinterpret_cast<typename Ktraits::BlockLoadVecT::TempStorage&>(smem_);
59
- auto& smem_store = reinterpret_cast<typename Ktraits::BlockStoreT::TempStorage&>(smem_);
60
- auto& smem_store_vec = reinterpret_cast<typename Ktraits::BlockStoreVecT::TempStorage&>(smem_);
61
- vec_t *smem_exchange = reinterpret_cast<vec_t *>(smem_ + Ktraits::kSmemIOSize);
62
-
63
- const int tidx = threadIdx.x;
64
- const int batch_id = blockIdx.x;
65
- const int channel_id = blockIdx.y;
66
- input_t *x = reinterpret_cast<input_t *>(params.x_ptr) + batch_id * params.x_batch_stride
67
- + channel_id * params.x_c_stride;
68
- weight_t *weight = reinterpret_cast<weight_t *>(params.weight_ptr) + channel_id * params.weight_c_stride;
69
- input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + batch_id * params.out_batch_stride
70
- + channel_id * params.out_c_stride;
71
- float bias_val = params.bias_ptr == nullptr ? 0.f : float(reinterpret_cast<weight_t *>(params.bias_ptr)[channel_id]);
72
-
73
- // Thread 0 will load the last elements of the previous chunk, so we initialize those to 0.
74
- if (tidx == 0) {
75
- input_t zeros[kNElts] = {0};
76
- smem_exchange[kNThreads - 1] = reinterpret_cast<vec_t *>(zeros)[0];
77
- }
78
-
79
- float weight_vals[kWidth];
80
- #pragma unroll
81
- for (int i = 0; i < kWidth; ++i) { weight_vals[i] = float(weight[i * params.weight_width_stride]); }
82
-
83
- constexpr int kChunkSize = kNThreads * kNElts;
84
- const int n_chunks = (params.seqlen + kChunkSize - 1) / kChunkSize;
85
- for (int chunk = 0; chunk < n_chunks; ++chunk) {
86
- input_t x_vals_load[2 * kNElts] = {0};
87
- if constexpr(kIsVecLoad) {
88
- typename Ktraits::BlockLoadVecT(smem_load_vec).Load(reinterpret_cast<vec_t*>(x), *reinterpret_cast<vec_t (*)[1]>(&x_vals_load[kNElts]), (params.seqlen - chunk * kChunkSize) / kNElts);
89
- } else {
90
- __syncthreads();
91
- typename Ktraits::BlockLoadT(smem_load).Load(x, *reinterpret_cast<input_t (*)[kNElts]>(&x_vals_load[kNElts]), params.seqlen - chunk * kChunkSize);
92
- }
93
- x += kChunkSize;
94
- __syncthreads();
95
- // Thread kNThreads - 1 doesn't write yet, so that thread 0 can read
96
- // the last elements of the previous chunk.
97
- if (tidx < kNThreads - 1) { smem_exchange[tidx] = reinterpret_cast<vec_t *>(x_vals_load)[1]; }
98
- __syncthreads();
99
- reinterpret_cast<vec_t *>(x_vals_load)[0] = smem_exchange[tidx > 0 ? tidx - 1 : kNThreads - 1];
100
- __syncthreads();
101
- // Now thread kNThreads - 1 can write the last elements of the current chunk.
102
- if (tidx == kNThreads - 1) { smem_exchange[tidx] = reinterpret_cast<vec_t *>(x_vals_load)[1]; }
103
-
104
- float x_vals[2 * kNElts];
105
- #pragma unroll
106
- for (int i = 0; i < 2 * kNElts; ++i) { x_vals[i] = float(x_vals_load[i]); }
107
-
108
- float out_vals[kNElts];
109
- #pragma unroll
110
- for (int i = 0; i < kNElts; ++i) {
111
- out_vals[i] = bias_val;
112
- #pragma unroll
113
- for (int w = 0; w < kWidth; ++w) {
114
- out_vals[i] += weight_vals[w] * x_vals[kNElts + i - (kWidth - w - 1)];
115
- }
116
- }
117
-
118
- if (params.silu_activation) {
119
- #pragma unroll
120
- for (int i = 0; i < kNElts; ++i) {
121
- out_vals[i] = out_vals[i] / (1 + expf(-out_vals[i]));
122
- }
123
- }
124
-
125
- input_t out_vals_store[kNElts];
126
- #pragma unroll
127
- for (int i = 0; i < kNElts; ++i) { out_vals_store[i] = out_vals[i]; }
128
- if constexpr(kIsVecLoad) {
129
- typename Ktraits::BlockStoreVecT(smem_store_vec).Store(reinterpret_cast<vec_t*>(out), reinterpret_cast<vec_t (&)[1]>(out_vals_store), (params.seqlen - chunk * kChunkSize) / kNElts);
130
- } else {
131
- typename Ktraits::BlockStoreT(smem_store).Store(out, out_vals_store, params.seqlen - chunk * kChunkSize);
132
- }
133
- out += kChunkSize;
134
- }
135
- }
136
-
137
- template<int kNThreads, int kWidth, typename input_t, typename weight_t>
138
- void causal_conv1d_fwd_launch(ConvParamsBase &params, cudaStream_t stream) {
139
- static constexpr int kNElts = sizeof(input_t) == 4 ? 4 : 8;
140
- BOOL_SWITCH(params.seqlen % kNElts == 0, kIsVecLoad, [&] {
141
- using Ktraits = Causal_conv1d_fwd_kernel_traits<kNThreads, kWidth, kIsVecLoad, input_t, weight_t>;
142
- constexpr int kSmemSize = Ktraits::kSmemSize;
143
- dim3 grid(params.batch, params.dim);
144
-
145
- auto kernel = &causal_conv1d_fwd_kernel<Ktraits>;
146
-
147
- if (kSmemSize >= 48 * 1024) {
148
- #ifndef USE_ROCM
149
- C10_CUDA_CHECK(cudaFuncSetAttribute(
150
- kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
151
- #else
152
- // There is a slight signature discrepancy in HIP and CUDA "FuncSetAttribute" function.
153
- C10_CUDA_CHECK(cudaFuncSetAttribute(
154
- (void *) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
155
- std::cerr << "Warning (causal_conv1d fwd launch): attempting to set maxDynamicSharedMemorySize on an AMD GPU which is currently a non-op (in ROCm versions <= 6.1). This might lead to undefined behavior. \n" << std::endl;
156
- #endif
157
- }
158
- kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params);
159
-
160
- C10_CUDA_KERNEL_LAUNCH_CHECK();
161
- });
162
- }
163
-
164
- template<typename input_t, typename weight_t>
165
- void causal_conv1d_fwd_cuda(ConvParamsBase &params, cudaStream_t stream) {
166
- if (params.width == 2) {
167
- causal_conv1d_fwd_launch<128, 2, input_t, weight_t>(params, stream);
168
- } else if (params.width == 3) {
169
- causal_conv1d_fwd_launch<128, 3, input_t, weight_t>(params, stream);
170
- } else if (params.width == 4) {
171
- causal_conv1d_fwd_launch<128, 4, input_t, weight_t>(params, stream);
172
- }
173
- }
174
-
175
- template<int kNThreads_, int kWidth_, int kChunkSizeL_, bool kIsVecLoad_, typename input_t_, typename weight_t_>
176
- struct Causal_conv1d_channellast_fwd_kernel_traits {
177
- // The cache line is 128 bytes, and we try to read 16 bytes per thread.
178
- // So we have 8 threads per "row", so 32 or 64 elements in the channel dimension.
179
- // That leaves 4 columns per warp, and so 16 columns per block (assuming each block has 128
180
- // threads). Each each load is 16 x 32|64 elements in the L x C dimensions.
181
- using input_t = input_t_;
182
- using weight_t = weight_t_;
183
- static constexpr int kNThreads = kNThreads_;
184
- static_assert(kNThreads % 32 == 0);
185
- static constexpr int kNWarps = kNThreads / 32;
186
- static constexpr int kWidth = kWidth_;
187
- static constexpr int kChunkSizeL = kChunkSizeL_;
188
- static constexpr int kNBytes = sizeof(input_t);
189
- static_assert(kNBytes == 2 || kNBytes == 4);
190
- static constexpr int kNElts = kNBytes == 4 ? 4 : 8;
191
- static constexpr int kNEltsPerRow = 128 / kNBytes;
192
- static constexpr int kNThreadsPerRow = kNEltsPerRow / kNElts; // Always 8 for now
193
- static_assert(kNThreadsPerRow * kNBytes * kNElts == 128);
194
- static constexpr int kNColsPerWarp = 32 / kNThreadsPerRow; // Always 4 for now
195
- static_assert(kNColsPerWarp * kNThreadsPerRow == 32);
196
- static constexpr int kNColsPerLoad = kNColsPerWarp * kNWarps;
197
- static constexpr int kNLoads = kChunkSizeL / kNColsPerLoad;
198
- static_assert(kNLoads * kNColsPerLoad == kChunkSizeL);
199
- static constexpr bool kIsVecLoad = kIsVecLoad_;
200
- using vec_t = typename BytesToType<kNBytes * kNElts>::Type;
201
- // using BlockLoadT = cub::BlockLoad<input_t, kNThreads, kNItems, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
202
- // using BlockStoreT = cub::BlockStore<input_t, kNThreads, kNItems, cub::BLOCK_STORE_WARP_TRANSPOSE>;
203
- // static constexpr int kSmemSize = std::max({sizeof(typename BlockLoadT::TempStorage),
204
- // sizeof(typename BlockStoreT::TempStorage)});
205
- // static constexpr int kSmemSize = kChunkSizeL * kNEltsPerRow * kNBytes;
206
- };
207
-
208
- template<typename Ktraits, bool kHasSeqIdx>
209
- __global__ __launch_bounds__(Ktraits::kNThreads)
210
- void causal_conv1d_channellast_fwd_kernel(ConvParamsBase params) {
211
- constexpr int kWidth = Ktraits::kWidth;
212
- constexpr int kNThreads = Ktraits::kNThreads;
213
- constexpr int kNElts = Ktraits::kNElts;
214
- constexpr int kNWarp = Ktraits::kNWarps;
215
- constexpr int kNThreadsPerC = Ktraits::kNThreadsPerRow;
216
- constexpr int kLPerLoad = Ktraits::kNColsPerLoad;
217
- constexpr int kChunkSizeL = Ktraits::kChunkSizeL;
218
- constexpr int kChunkSizeC = Ktraits::kNEltsPerRow;
219
- using input_t = typename Ktraits::input_t;
220
- using vec_t = typename Ktraits::vec_t;
221
- using weight_t = typename Ktraits::weight_t;
222
-
223
- // Shared memory.
224
- __shared__ input_t x_smem[kWidth - 1 + kChunkSizeL][kChunkSizeC + kNElts];
225
-
226
- const int batch_id = blockIdx.x;
227
- const int chunk_l_id = blockIdx.y;
228
- const int chunk_c_id = blockIdx.z;
229
- const int tid = threadIdx.x;
230
- const int l_idx = tid / kNThreadsPerC;
231
- const int c_idx = tid % kNThreadsPerC;
232
- input_t *x = reinterpret_cast<input_t *>(params.x_ptr) + batch_id * params.x_batch_stride
233
- + (chunk_l_id * kChunkSizeL + l_idx) * params.x_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts;
234
- weight_t *weight = reinterpret_cast<weight_t *>(params.weight_ptr)
235
- + chunk_c_id * kChunkSizeC * params.weight_c_stride;
236
- input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + batch_id * params.out_batch_stride
237
- + (chunk_l_id * kChunkSizeL + l_idx) * params.out_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts;
238
- int *seq_idx = !kHasSeqIdx ? nullptr : reinterpret_cast<int *>(params.seq_idx_ptr)
239
- + batch_id * params.seqlen + chunk_l_id * kChunkSizeL;
240
- input_t *initial_states = params.initial_states_ptr == nullptr || chunk_l_id > 0 ? nullptr
241
- : reinterpret_cast<input_t *>(params.initial_states_ptr) + batch_id * params.initial_states_batch_stride + l_idx * params.initial_states_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts;
242
- // The last L-chunk will also have enough info to write to final states, since it also contain a few x values
243
- // from the previous L-chunk.
244
- input_t *final_states = params.final_states_ptr == nullptr || chunk_l_id < gridDim.y - 1 ? nullptr
245
- : reinterpret_cast<input_t *>(params.final_states_ptr) + batch_id * params.final_states_batch_stride + l_idx * params.final_states_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts;
246
-
247
- #pragma unroll
248
- for (int l = 0; l < Ktraits::kNLoads; ++l) {
249
- input_t x_vals_load[kNElts] = {0};
250
- if (chunk_l_id * kChunkSizeL + l * kLPerLoad + l_idx < params.seqlen
251
- && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
252
- reinterpret_cast<vec_t *>(x_vals_load)[0] = *reinterpret_cast<vec_t *>(x + l * kLPerLoad * params.x_l_stride);
253
- }
254
- reinterpret_cast<vec_t *>(x_smem[kWidth - 1 + l * kLPerLoad + l_idx])[c_idx] = reinterpret_cast<vec_t *>(x_vals_load)[0];
255
- }
256
- // Load the elements from the previous chunk that are needed for convolution.
257
- if (l_idx < kWidth - 1) {
258
- input_t x_vals_load[kNElts] = {0};
259
- if (chunk_l_id * kChunkSizeL + l_idx - (kWidth - 1) >= 0
260
- && chunk_l_id * kChunkSizeL + l_idx - (kWidth - 1) < params.seqlen
261
- && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
262
- reinterpret_cast<vec_t *>(x_vals_load)[0] = *reinterpret_cast<vec_t *>(x - (kWidth - 1) * params.x_l_stride);
263
- } else if (initial_states != nullptr
264
- && chunk_l_id * kChunkSizeL + l_idx - (kWidth - 1) < 0
265
- && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
266
- reinterpret_cast<vec_t *>(x_vals_load)[0] = *reinterpret_cast<vec_t *>(initial_states);
267
- }
268
- reinterpret_cast<vec_t *>(x_smem[l_idx])[c_idx] = reinterpret_cast<vec_t *>(x_vals_load)[0];
269
- }
270
-
271
- __syncthreads();
272
-
273
- if (final_states != nullptr
274
- && l_idx < kWidth - 1
275
- && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
276
- // x_smem[0] contains element at index chunk_l_id * kChunkSizeL - (kWidth - 1)
277
- // So last few elements (index params.seqlen - kWidth + 1 + l_idx) are stored in x_smem[params.seqlen - kWidth + 1 + l_idx - (chunk_l_id * kChunkSizeL - kWidth + 1)][c_idx]
278
- *reinterpret_cast<vec_t *>(final_states) = reinterpret_cast<vec_t *>(x_smem[params.seqlen + l_idx - chunk_l_id * kChunkSizeL])[c_idx];
279
- }
280
-
281
- constexpr int kLPerThread = constexpr_min(kChunkSizeL * kChunkSizeC / kNThreads, kChunkSizeL);
282
- static_assert(kLPerThread * kNThreads == kChunkSizeL * kChunkSizeC);
283
- constexpr int kNThreadsPerRow = kChunkSizeL / kLPerThread;
284
- static_assert(kNThreadsPerRow * kLPerThread == kChunkSizeL);
285
- // kChunkSizeL, kLPerThread, kNThreadsPerRow should be powers of 2 for simplicity
286
- static_assert((kChunkSizeL & (kChunkSizeL - 1)) == 0);
287
- static_assert((kLPerThread & (kLPerThread - 1)) == 0);
288
- static_assert((kNThreadsPerRow & (kNThreadsPerRow - 1)) == 0);
289
- static_assert(kNThreadsPerRow <= 32);
290
-
291
- const int row_idx = tid / kNThreadsPerRow;
292
- const int col_idx = tid % kNThreadsPerRow;
293
-
294
- float bias_val = params.bias_ptr == nullptr || chunk_c_id * kChunkSizeC + row_idx >= params.dim ? 0.f : float(reinterpret_cast<weight_t *>(params.bias_ptr)[chunk_c_id * kChunkSizeC + row_idx]);
295
- float weight_vals[kWidth] = {0};
296
- if (chunk_c_id * kChunkSizeC + row_idx < params.dim) {
297
- #pragma unroll
298
- for (int w = 0; w < kWidth; ++w) {
299
- weight_vals[w] = weight[row_idx * params.weight_c_stride + w * params.weight_width_stride];
300
- }
301
- }
302
- float x_vals[kWidth - 1 + kLPerThread];
303
- #pragma unroll
304
- for (int i = 0; i < kWidth - 1 + kLPerThread; ++i) {
305
- x_vals[i] = float(x_smem[col_idx * kLPerThread + i][row_idx]);
306
- }
307
- int seq_idx_thread[kWidth - 1 + kLPerThread];
308
- if constexpr (kHasSeqIdx) {
309
- #pragma unroll
310
- for (int i = 0; i < kWidth - 1 + kLPerThread; ++i) {
311
- seq_idx_thread[i] = chunk_l_id * kChunkSizeL + col_idx * kLPerThread + i - (kWidth - 1) >= 0 ? seq_idx[col_idx * kLPerThread + i - (kWidth - 1)] : -1;
312
- }
313
- }
314
-
315
- float out_vals[kLPerThread];
316
- #pragma unroll
317
- for (int i = 0; i < kLPerThread; ++i) {
318
- out_vals[i] = bias_val;
319
- const int seq_idx_cur = !kHasSeqIdx ? 0 : seq_idx_thread[i + kWidth - 1];
320
- #pragma unroll
321
- for (int w = 0; w < kWidth; ++w) {
322
- if constexpr (!kHasSeqIdx) {
323
- out_vals[i] += weight_vals[w] * x_vals[i + w];
324
- } else {
325
- out_vals[i] += seq_idx_thread[i + w] == seq_idx_cur ? weight_vals[w] * x_vals[i + w] : 0.f;
326
- }
327
- }
328
- if (params.silu_activation) {out_vals[i] = out_vals[i] / (1 + expf(-out_vals[i])); }
329
- }
330
-
331
- __syncthreads();
332
- #pragma unroll
333
- for (int i = 0; i < kLPerThread; ++i) { x_smem[col_idx * kLPerThread + i][row_idx] = out_vals[i]; }
334
- __syncthreads();
335
-
336
- #pragma unroll
337
- for (int l = 0; l < Ktraits::kNLoads; ++l) {
338
- input_t out_vals_store[kNElts];
339
- reinterpret_cast<vec_t *>(out_vals_store)[0] = reinterpret_cast<vec_t *>(x_smem[l * kLPerLoad + l_idx])[c_idx];
340
- if (chunk_l_id * kChunkSizeL + l * kLPerLoad + l_idx < params.seqlen
341
- && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
342
- *reinterpret_cast<vec_t *>(out + l * kLPerLoad * params.out_l_stride) = reinterpret_cast<vec_t *>(out_vals_store)[0];
343
- }
344
- }
345
-
346
- }
347
-
348
- template<int kNThreads, int kWidth, typename input_t, typename weight_t>
349
- void causal_conv1d_channellast_fwd_launch(ConvParamsBase &params, cudaStream_t stream) {
350
- BOOL_SWITCH(params.seq_idx_ptr != nullptr, kHasSeqIdx, [&] {
351
- using Ktraits = Causal_conv1d_channellast_fwd_kernel_traits<kNThreads, kWidth, 64, true, input_t, weight_t>;
352
- // constexpr int kSmemSize = Ktraits::kSmemSize;
353
- constexpr int kChunkSizeL = Ktraits::kChunkSizeL;
354
- constexpr int kChunkSizeC = Ktraits::kNEltsPerRow;
355
- const int n_chunks_L = (params.seqlen + kChunkSizeL - 1) / kChunkSizeL;
356
- const int n_chunks_C = (params.dim + kChunkSizeC - 1) / kChunkSizeC;
357
- dim3 grid(params.batch, n_chunks_L, n_chunks_C);
358
- dim3 block(Ktraits::kNThreads);
359
- auto kernel = &causal_conv1d_channellast_fwd_kernel<Ktraits, kHasSeqIdx>;
360
- // if (kSmemSize >= 48 * 1024) {
361
- // C10_CUDA_CHECK(cudaFuncSetAttribute(
362
- // kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
363
- // }
364
- // kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params);
365
- kernel<<<grid, Ktraits::kNThreads, 0, stream>>>(params);
366
- C10_CUDA_KERNEL_LAUNCH_CHECK();
367
- });
368
- }
369
-
370
- template<typename input_t, typename weight_t>
371
- void causal_conv1d_channellast_fwd_cuda(ConvParamsBase &params, cudaStream_t stream) {
372
- if (params.width == 2) {
373
- causal_conv1d_channellast_fwd_launch<128, 2, input_t, weight_t>(params, stream);
374
- } else if (params.width == 3) {
375
- causal_conv1d_channellast_fwd_launch<128, 3, input_t, weight_t>(params, stream);
376
- } else if (params.width == 4) {
377
- causal_conv1d_channellast_fwd_launch<128, 4, input_t, weight_t>(params, stream);
378
- }
379
- }
380
-
381
- template void causal_conv1d_fwd_cuda<float, float>(ConvParamsBase &params, cudaStream_t stream);
382
- template void causal_conv1d_fwd_cuda<at::Half, float>(ConvParamsBase &params, cudaStream_t stream);
383
- template void causal_conv1d_fwd_cuda<at::BFloat16, float>(ConvParamsBase &params, cudaStream_t stream);
384
- template void causal_conv1d_fwd_cuda<float, at::Half>(ConvParamsBase &params, cudaStream_t stream);
385
- template void causal_conv1d_fwd_cuda<at::Half, at::Half>(ConvParamsBase &params, cudaStream_t stream);
386
- template void causal_conv1d_fwd_cuda<at::BFloat16, at::Half>(ConvParamsBase &params, cudaStream_t stream);
387
- template void causal_conv1d_fwd_cuda<float, at::BFloat16>(ConvParamsBase &params, cudaStream_t stream);
388
- template void causal_conv1d_fwd_cuda<at::Half, at::BFloat16>(ConvParamsBase &params, cudaStream_t stream);
389
- template void causal_conv1d_fwd_cuda<at::BFloat16, at::BFloat16>(ConvParamsBase &params, cudaStream_t stream);
390
-
391
- template void causal_conv1d_channellast_fwd_cuda<float, float>(ConvParamsBase &params, cudaStream_t stream);
392
- template void causal_conv1d_channellast_fwd_cuda<at::Half, float>(ConvParamsBase &params, cudaStream_t stream);
393
- template void causal_conv1d_channellast_fwd_cuda<at::BFloat16, float>(ConvParamsBase &params, cudaStream_t stream);
394
- template void causal_conv1d_channellast_fwd_cuda<float, at::Half>(ConvParamsBase &params, cudaStream_t stream);
395
- template void causal_conv1d_channellast_fwd_cuda<at::Half, at::Half>(ConvParamsBase &params, cudaStream_t stream);
396
- template void causal_conv1d_channellast_fwd_cuda<at::BFloat16, at::Half>(ConvParamsBase &params, cudaStream_t stream);
397
- template void causal_conv1d_channellast_fwd_cuda<float, at::BFloat16>(ConvParamsBase &params, cudaStream_t stream);
398
- template void causal_conv1d_channellast_fwd_cuda<at::Half, at::BFloat16>(ConvParamsBase &params, cudaStream_t stream);
399
- template void causal_conv1d_channellast_fwd_cuda<at::BFloat16, at::BFloat16>(ConvParamsBase &params, cudaStream_t stream);
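For reference, the forward kernels above compute, per channel, a left-padded causal convolution with an optional SiLU: out[l] = act(bias + sum_w weight[w] * x[l - (width - 1) + w]). A host-side sketch of that arithmetic, assuming a single channel and contiguous float buffers — the function name is illustrative, and none of the kernel's vectorization, shared-memory exchange, or chunking is reproduced:

```cuda
// Illustrative single-channel reference of the forward math; x is zero-padded on the left.
#include <cmath>

void causal_conv1d_fwd_ref_1ch(const float *x, const float *weight, float bias,
                               float *out, int seqlen, int width, bool silu) {
    for (int l = 0; l < seqlen; ++l) {
        float acc = bias;
        for (int w = 0; w < width; ++w) {
            int idx = l - (width - 1) + w;               // causal: current and past inputs only
            acc += weight[w] * (idx >= 0 ? x[idx] : 0.f);
        }
        out[l] = silu ? acc / (1.f + std::exp(-acc)) : acc;  // SiLU = y / (1 + exp(-y))
    }
}
```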
 
causal-conv1d/causal_conv1d_update.cu DELETED
@@ -1,137 +0,0 @@
1
- /******************************************************************************
2
- * Copyright (c) 2023, Tri Dao.
3
- ******************************************************************************/
4
-
5
- #include <c10/util/BFloat16.h>
6
- #include <c10/util/Half.h>
7
- #include <c10/cuda/CUDAException.h> // For C10_CUDA_CHECK and C10_CUDA_KERNEL_LAUNCH_CHECK
8
-
9
- #include "causal_conv1d.h"
10
- #include "causal_conv1d_common.h"
11
- #include "static_switch.h"
12
-
13
- template<int kNThreads_, int kWidth_, typename input_t_, typename weight_t_>
14
- struct Causal_conv1d_update_kernel_traits {
15
- using input_t = input_t_;
16
- using weight_t = weight_t_;
17
- static constexpr int kNThreads = kNThreads_;
18
- static constexpr int kWidth = kWidth_;
19
- static constexpr int kNBytes = sizeof(input_t);
20
- static_assert(kNBytes == 2 || kNBytes == 4);
21
- };
22
-
23
- template<typename Ktraits, bool kIsCircularBuffer>
24
- __global__ __launch_bounds__(Ktraits::kNThreads)
25
- void causal_conv1d_update_kernel(ConvParamsBase params) {
26
- constexpr int kWidth = Ktraits::kWidth;
27
- constexpr int kNThreads = Ktraits::kNThreads;
28
- using input_t = typename Ktraits::input_t;
29
- using weight_t = typename Ktraits::weight_t;
30
-
31
- const int tidx = threadIdx.x;
32
- const int batch_id = blockIdx.x;
33
- const int channel_id = blockIdx.y * kNThreads + tidx;
34
- if (channel_id >= params.dim) return;
35
-
36
- input_t *x = reinterpret_cast<input_t *>(params.x_ptr) + batch_id * params.x_batch_stride
37
- + channel_id * params.x_c_stride;
38
-
39
- // If params.conv_state_batch_indices is set, then the conv state is gathered from the conv state tensor
40
- // along the batch axis. Otherwise, the conv state coordinate is the same as the batch id.
41
- const int conv_state_batch_coord = params.conv_state_indices_ptr == nullptr
42
- ? batch_id
43
- : params.conv_state_indices_ptr[batch_id];
44
- input_t *conv_state = reinterpret_cast<input_t *>(params.conv_state_ptr)
45
- + conv_state_batch_coord * params.conv_state_batch_stride
46
- + channel_id * params.conv_state_c_stride;
47
- weight_t *weight = reinterpret_cast<weight_t *>(params.weight_ptr) + channel_id * params.weight_c_stride;
48
- input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + batch_id * params.out_batch_stride
49
- + channel_id * params.out_c_stride;
50
- float bias_val = params.bias_ptr == nullptr ? 0.f : float(reinterpret_cast<weight_t *>(params.bias_ptr)[channel_id]);
51
-
52
- int state_len = params.conv_state_len;
53
- int advance_len = params.seqlen;
54
- int cache_seqlen = kIsCircularBuffer ? params.cache_seqlens[batch_id] % state_len : 0;
55
- int update_idx = cache_seqlen - (kWidth - 1);
56
- update_idx = update_idx < 0 ? update_idx + state_len : update_idx;
57
-
58
- float weight_vals[kWidth] = {0};
59
- #pragma unroll
60
- for (int i = 0; i < kWidth; ++i) { weight_vals[i] = float(weight[i * params.weight_width_stride]); }
61
-
62
- float x_vals[kWidth] = {0};
63
- if constexpr (!kIsCircularBuffer) {
64
- #pragma unroll 2
65
- for (int i = 0; i < state_len - advance_len - (kWidth - 1); ++i) {
66
- conv_state[i * params.conv_state_l_stride] = conv_state[(i + advance_len) * params.conv_state_l_stride];
67
- }
68
- #pragma unroll
69
- for (int i = 0; i < kWidth - 1; ++i) {
70
- input_t state_val = conv_state[(state_len - (kWidth - 1) + i) * params.conv_state_l_stride];
71
- if (i < advance_len + (kWidth - 1) && state_len - advance_len - (kWidth - 1) + i >= 0) {
72
- conv_state[(state_len - advance_len - (kWidth - 1) + i) * params.conv_state_l_stride] = state_val;
73
- }
74
- x_vals[i] = float(state_val);
75
- }
76
- } else {
77
- #pragma unroll
78
- for (int i = 0; i < kWidth - 1; ++i, update_idx = update_idx + 1 >= state_len ? update_idx + 1 - state_len : update_idx + 1) {
79
- input_t state_val = conv_state[update_idx * params.conv_state_l_stride];
80
- x_vals[i] = float(state_val);
81
- }
82
- }
83
- #pragma unroll 2
84
- for (int i = 0; i < params.seqlen; ++i) {
85
- input_t x_val = x[i * params.x_l_stride];
86
- if constexpr (!kIsCircularBuffer) {
87
- if (i < advance_len && state_len - advance_len + i >= 0) {
88
- conv_state[(state_len - advance_len + i) * params.conv_state_l_stride] = x_val;
89
- }
90
- } else {
91
- conv_state[update_idx * params.conv_state_l_stride] = x_val;
92
- ++update_idx;
93
- update_idx = update_idx >= state_len ? update_idx - state_len : update_idx;
94
- }
95
- x_vals[kWidth - 1] = float(x_val);
96
- float out_val = bias_val;
97
- #pragma unroll
98
- for (int j = 0; j < kWidth; ++j) { out_val += weight_vals[j] * x_vals[j]; }
99
- if (params.silu_activation) { out_val = out_val / (1 + expf(-out_val)); }
100
- out[i * params.out_l_stride] = input_t(out_val);
101
- // Shift the input buffer by 1
102
- #pragma unroll
103
- for (int i = 0; i < kWidth - 1; ++i) { x_vals[i] = x_vals[i + 1]; }
104
- }
105
- }
106
-
107
- template<int kNThreads, int kWidth, typename input_t, typename weight_t>
108
- void causal_conv1d_update_launch(ConvParamsBase &params, cudaStream_t stream) {
109
- using Ktraits = Causal_conv1d_update_kernel_traits<kNThreads, kWidth, input_t, weight_t>;
110
- dim3 grid(params.batch, (params.dim + kNThreads - 1) / kNThreads);
111
- auto kernel = params.cache_seqlens == nullptr
112
- ? &causal_conv1d_update_kernel<Ktraits, false>
113
- : &causal_conv1d_update_kernel<Ktraits, true>;
114
- kernel<<<grid, Ktraits::kNThreads, 0, stream>>>(params);
115
- C10_CUDA_KERNEL_LAUNCH_CHECK();
116
- }
117
-
118
- template<typename input_t, typename weight_t>
119
- void causal_conv1d_update_cuda(ConvParamsBase &params, cudaStream_t stream) {
120
- if (params.width == 2) {
121
- causal_conv1d_update_launch<64, 2, input_t, weight_t>(params, stream);
122
- } else if (params.width == 3) {
123
- causal_conv1d_update_launch<64, 3, input_t, weight_t>(params, stream);
124
- } else if (params.width == 4) {
125
- causal_conv1d_update_launch<64, 4, input_t, weight_t>(params, stream);
126
- }
127
- }
128
-
129
- template void causal_conv1d_update_cuda<float, float>(ConvParamsBase &params, cudaStream_t stream);
130
- template void causal_conv1d_update_cuda<at::Half, float>(ConvParamsBase &params, cudaStream_t stream);
131
- template void causal_conv1d_update_cuda<at::BFloat16, float>(ConvParamsBase &params, cudaStream_t stream);
132
- template void causal_conv1d_update_cuda<float, at::Half>(ConvParamsBase &params, cudaStream_t stream);
133
- template void causal_conv1d_update_cuda<at::Half, at::Half>(ConvParamsBase &params, cudaStream_t stream);
134
- template void causal_conv1d_update_cuda<at::BFloat16, at::Half>(ConvParamsBase &params, cudaStream_t stream);
135
- template void causal_conv1d_update_cuda<float, at::BFloat16>(ConvParamsBase &params, cudaStream_t stream);
136
- template void causal_conv1d_update_cuda<at::Half, at::BFloat16>(ConvParamsBase &params, cudaStream_t stream);
137
- template void causal_conv1d_update_cuda<at::BFloat16, at::BFloat16>(ConvParamsBase &params, cudaStream_t stream);
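The update kernels keep a rolling window of the most recent inputs in conv_state and emit one output per new token. A host-side sketch of the non-circular path, assuming a single channel and state_len == width - 1; the general state shifting and the circular-buffer (cache_seqlens) case handled above are omitted, and the function name is illustrative:

```cuda
// Illustrative single-channel decode-time update: conv_state holds the width - 1
// most recent inputs, x holds the new tokens, out receives one value per token.
#include <cmath>

void causal_conv1d_update_ref_1ch(const float *x, int seqlen,
                                  float *conv_state,                 // length width - 1
                                  const float *weight, int width,
                                  float bias, bool silu, float *out) {
    for (int t = 0; t < seqlen; ++t) {
        float acc = bias;
        for (int w = 0; w < width - 1; ++w) { acc += weight[w] * conv_state[w]; }
        acc += weight[width - 1] * x[t];
        out[t] = silu ? acc / (1.f + std::exp(-acc)) : acc;
        // Roll the state: drop the oldest input, append the current one.
        for (int w = 0; w < width - 2; ++w) { conv_state[w] = conv_state[w + 1]; }
        conv_state[width - 2] = x[t];
    }
}
```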
 
causal-conv1d/static_switch.h DELETED
@@ -1,25 +0,0 @@
1
- // Inspired by https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h
2
- // and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h
3
-
4
- #pragma once
5
-
6
- /// @param COND - a boolean expression to switch by
7
- /// @param CONST_NAME - a name given for the constexpr bool variable.
8
- /// @param ... - code to execute for true and false
9
- ///
10
- /// Usage:
11
- /// ```
12
- /// BOOL_SWITCH(flag, BoolConst, [&] {
13
- /// some_function<BoolConst>(...);
14
- /// });
15
- /// ```
16
- #define BOOL_SWITCH(COND, CONST_NAME, ...) \
17
- [&] { \
18
- if (COND) { \
19
- static constexpr bool CONST_NAME = true; \
20
- return __VA_ARGS__(); \
21
- } else { \
22
- static constexpr bool CONST_NAME = false; \
23
- return __VA_ARGS__(); \
24
- } \
25
- }()
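As the doc comment above shows, BOOL_SWITCH turns a runtime flag into a constexpr template argument by instantiating both branches, which is how the launch functions in this kernel pick template specializations from runtime options. A small hypothetical use of the pattern (the function names are illustrative):

```cuda
// Hypothetical dispatcher: the runtime bool selects one of two compile-time instantiations.
#include "static_switch.h"

template <bool kHasBias>
void run_impl() {
    if constexpr (kHasBias) { /* bias path, compiled as its own instantiation */ }
    else                    { /* no-bias path */ }
}

void run(bool has_bias) {
    BOOL_SWITCH(has_bias, kHasBias, [&] {
        run_impl<kHasBias>();   // kHasBias is a constexpr bool inside the switch body
    });
}
```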
 
flake.lock DELETED
@@ -1,168 +0,0 @@
1
- {
2
- "nodes": {
3
- "flake-compat": {
4
- "locked": {
5
- "lastModified": 1747046372,
6
- "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
7
- "owner": "edolstra",
8
- "repo": "flake-compat",
9
- "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
10
- "type": "github"
11
- },
12
- "original": {
13
- "owner": "edolstra",
14
- "repo": "flake-compat",
15
- "type": "github"
16
- }
17
- },
18
- "flake-compat_2": {
19
- "locked": {
20
- "lastModified": 1747046372,
21
- "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
22
- "owner": "edolstra",
23
- "repo": "flake-compat",
24
- "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
25
- "type": "github"
26
- },
27
- "original": {
28
- "owner": "edolstra",
29
- "repo": "flake-compat",
30
- "type": "github"
31
- }
32
- },
33
- "flake-utils": {
34
- "inputs": {
35
- "systems": "systems"
36
- },
37
- "locked": {
38
- "lastModified": 1731533236,
39
- "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
40
- "owner": "numtide",
41
- "repo": "flake-utils",
42
- "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
43
- "type": "github"
44
- },
45
- "original": {
46
- "owner": "numtide",
47
- "repo": "flake-utils",
48
- "type": "github"
49
- }
50
- },
51
- "flake-utils_2": {
52
- "inputs": {
53
- "systems": "systems_2"
54
- },
55
- "locked": {
56
- "lastModified": 1731533236,
57
- "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
58
- "owner": "numtide",
59
- "repo": "flake-utils",
60
- "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
61
- "type": "github"
62
- },
63
- "original": {
64
- "owner": "numtide",
65
- "repo": "flake-utils",
66
- "type": "github"
67
- }
68
- },
69
- "hf-nix": {
70
- "inputs": {
71
- "flake-compat": "flake-compat_2",
72
- "flake-utils": "flake-utils_2",
73
- "nixpkgs": "nixpkgs"
74
- },
75
- "locked": {
76
- "lastModified": 1759493343,
77
- "narHash": "sha256-8fhl0gwMAnOkQbogPIVq+Fha+Yeq52FaRXfwF+F9Q+k=",
78
- "owner": "huggingface",
79
- "repo": "hf-nix",
80
- "rev": "b1fc3a18b52447a0f24bc6884418edc5e66082b9",
81
- "type": "github"
82
- },
83
- "original": {
84
- "owner": "huggingface",
85
- "repo": "hf-nix",
86
- "type": "github"
87
- }
88
- },
89
- "kernel-builder": {
90
- "inputs": {
91
- "flake-compat": "flake-compat",
92
- "flake-utils": "flake-utils",
93
- "hf-nix": "hf-nix",
94
- "nixpkgs": [
95
- "kernel-builder",
96
- "hf-nix",
97
- "nixpkgs"
98
- ]
99
- },
100
- "locked": {
101
- "lastModified": 1759516823,
102
- "narHash": "sha256-UJVvZHtS9c64Dm4iZRaOKWB+VHI7jzcazGH57KXWeg8=",
103
- "owner": "huggingface",
104
- "repo": "kernel-builder",
105
- "rev": "e13610a05f67b7296be9ead89ad172a0a088a1c3",
106
- "type": "github"
107
- },
108
- "original": {
109
- "owner": "huggingface",
110
- "repo": "kernel-builder",
111
- "type": "github"
112
- }
113
- },
114
- "nixpkgs": {
115
- "locked": {
116
- "lastModified": 1755963616,
117
- "narHash": "sha256-6yD0ww/S8n+U2uPYcJZ3DRURP8Kx036GRpR2uPNZroE=",
118
- "owner": "nixos",
119
- "repo": "nixpkgs",
120
- "rev": "73e96df7cff5783f45e21342a75a1540c4eddce4",
121
- "type": "github"
122
- },
123
- "original": {
124
- "owner": "nixos",
125
- "ref": "nixos-unstable-small",
126
- "repo": "nixpkgs",
127
- "type": "github"
128
- }
129
- },
130
- "root": {
131
- "inputs": {
132
- "kernel-builder": "kernel-builder"
133
- }
134
- },
135
- "systems": {
136
- "locked": {
137
- "lastModified": 1681028828,
138
- "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
139
- "owner": "nix-systems",
140
- "repo": "default",
141
- "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
142
- "type": "github"
143
- },
144
- "original": {
145
- "owner": "nix-systems",
146
- "repo": "default",
147
- "type": "github"
148
- }
149
- },
150
- "systems_2": {
151
- "locked": {
152
- "lastModified": 1681028828,
153
- "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
154
- "owner": "nix-systems",
155
- "repo": "default",
156
- "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
157
- "type": "github"
158
- },
159
- "original": {
160
- "owner": "nix-systems",
161
- "repo": "default",
162
- "type": "github"
163
- }
164
- }
165
- },
166
- "root": "root",
167
- "version": 7
168
- }
 
flake.nix DELETED
@@ -1,18 +0,0 @@
1
- {
2
- description = "Flake for causal-conv1d kernel";
3
-
4
- inputs = {
5
- kernel-builder.url = "github:huggingface/kernel-builder";
6
- };
7
-
8
- outputs =
9
- {
10
- self,
11
- kernel-builder,
12
- }:
13
- kernel-builder.lib.genFlakeOutputs {
14
- inherit self;
15
- path = ./.;
16
- pythonCheckInputs = pkgs: with pkgs; [ einops ];
17
- };
18
- }
 
tests/test_causal_conv1d.py DELETED
@@ -1,353 +0,0 @@
1
- # Copyright (C) 2024, Tri Dao.
2
-
3
- import math
4
-
5
- import torch
6
- import torch.nn.functional as F
7
-
8
- import pytest
9
-
10
- from einops import rearrange
11
-
12
-
13
- from causal_conv1d import causal_conv1d_fn, causal_conv1d_update, causal_conv1d_varlen_states
14
- from causal_conv1d.causal_conv1d_interface import causal_conv1d_ref
15
- from causal_conv1d.causal_conv1d_interface import causal_conv1d_update_ref
16
- from causal_conv1d.causal_conv1d_varlen import causal_conv1d_varlen_states_ref
17
-
18
-
19
- @pytest.mark.parametrize("return_final_states", [False, True])
20
- # @pytest.mark.parametrize("return_final_states", [True])
21
- @pytest.mark.parametrize("has_initial_states", [False, True])
22
- # @pytest.mark.parametrize("has_initial_states", [False])
23
- @pytest.mark.parametrize("channel_last", [False, True])
24
- # @pytest.mark.parametrize('channel_last', [True])
25
- @pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
26
- # @pytest.mark.parametrize('itype', [torch.float16])
27
- @pytest.mark.parametrize("silu_activation", [False, True])
28
- # @pytest.mark.parametrize('silu_activation', [True])
29
- @pytest.mark.parametrize("has_bias", [False, True])
30
- # @pytest.mark.parametrize('has_bias', [True])
31
- @pytest.mark.parametrize("width", [2, 3, 4])
32
- # @pytest.mark.parametrize('width', [3])
33
- @pytest.mark.parametrize(
34
- "seqlen", [1, 2, 8, 16, 32, 64, 128, 129, 130, 151, 256, 372, 512, 784, 1024, 1134, 2048, 4096]
35
- )
36
- # @pytest.mark.parametrize('seqlen', [8, 16, 32, 64, 128, 256, 512, 784, 1024, 2048, 4096])
37
- # @pytest.mark.parametrize('seqlen', [128])
38
- @pytest.mark.parametrize('dim', [64, 4096 + 32])
39
- # @pytest.mark.parametrize('dim', [64])
40
- def test_causal_conv1d(dim, seqlen, width, has_bias, silu_activation, itype, channel_last, has_initial_states, return_final_states):
41
- if not channel_last and (has_initial_states or return_final_states):
42
- pytest.skip("Only channel_last supports initial_states or return_final_states")
43
- device = "cuda"
44
- rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
45
- if itype == torch.bfloat16:
46
- rtol, atol = 1e-2, 5e-2
47
- rtolw, atolw = (1e-3, 1e-3)
48
- # set seed
49
- torch.random.manual_seed(0)
50
- batch = 2
51
- # batch = 1
52
- if not channel_last:
53
- x = torch.randn(batch, 4096 + dim + 64, seqlen, device=device, dtype=itype)[:, 4096:4096 + dim, :].requires_grad_()
54
- else:
55
- x = rearrange(
56
- torch.randn(batch, seqlen, 4096 + dim + 64, device=device, dtype=itype)[:, :, 4096:4096 + dim], "b s d -> b d s"
57
- ).requires_grad_()
58
- weight = torch.randn(dim, width, device=device, dtype=torch.float32, requires_grad=True)
59
- if has_bias:
60
- bias = torch.randn(dim, device=device, dtype=torch.float32, requires_grad=True)
61
- else:
62
- bias = None
63
- if has_initial_states:
64
- initial_states = torch.randn(batch, width - 1, dim, device=device, dtype=itype).transpose(1, 2).requires_grad_()
65
- else:
66
- initial_states = None
67
- x_ref = x.detach().clone().requires_grad_()
68
- weight_ref = weight.detach().clone().requires_grad_()
69
- bias_ref = bias.detach().clone().requires_grad_() if bias is not None else None
70
- initial_states_ref = initial_states.detach().clone().requires_grad_() if initial_states is not None else None
71
- activation = None if not silu_activation else "silu"
72
- out = causal_conv1d_fn(x, weight, bias, initial_states=initial_states, return_final_states=return_final_states,
73
- activation=activation)
74
- out_ref = causal_conv1d_ref(x_ref, weight_ref, bias_ref, initial_states=initial_states_ref, return_final_states=return_final_states, activation=activation)
75
- if return_final_states:
76
- out, final_states = out
77
- out_ref, final_states_ref = out_ref
78
- print(f"Final states max diff: {(final_states - final_states_ref).abs().max().item()}")
79
- print(f"Final states mean diff: {(final_states - final_states_ref).abs().mean().item()}")
80
- assert torch.allclose(final_states, final_states_ref, rtol=rtol, atol=atol)
81
-
82
- print(f"Output max diff: {(out - out_ref).abs().max().item()}")
83
- print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
84
- assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
85
-
86
- if return_final_states:
87
- out += F.sigmoid(final_states).sum(dim=-1, keepdim=True)
88
- out_ref += F.sigmoid(final_states_ref).sum(dim=-1, keepdim=True)
89
-
90
- g = torch.randn_like(out)
91
- out.backward(g)
92
- out_ref.backward(g)
93
-
94
- print(f"dx max diff: {(x.grad - x_ref.grad).abs().max().item()}")
95
- print(f"dweight max diff: {(weight.grad - weight_ref.grad).abs().max().item()}")
96
- if has_bias:
97
- print(f"dbias max diff: {(bias.grad - bias_ref.grad).abs().max().item()}")
98
- if has_initial_states:
99
- print(f"dinitial_states max diff: {(initial_states.grad - initial_states_ref.grad).abs().max().item()}")
100
-
101
- assert torch.allclose(x.grad, x_ref.grad.to(dtype=itype), rtol=rtol, atol=atol)
102
- assert torch.allclose(weight.grad, weight_ref.grad, rtol=rtolw, atol=atolw)
103
- if has_bias:
104
- assert torch.allclose(bias.grad, bias_ref.grad, rtol=rtolw, atol=atolw)
105
- if has_initial_states:
106
- assert torch.allclose(initial_states.grad, initial_states_ref.grad.to(dtype=itype), rtol=rtol, atol=atol)
107
-
108
-
109
- @pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
110
- # @pytest.mark.parametrize('itype', [torch.float16])
111
- @pytest.mark.parametrize("silu_activation", [False, True])
112
- # @pytest.mark.parametrize('silu_activation', [True])
113
- @pytest.mark.parametrize("has_bias", [False, True])
114
- # @pytest.mark.parametrize('has_bias', [True])
115
- @pytest.mark.parametrize("has_cache_seqlens", [False, True])
116
- # @pytest.mark.parametrize('has_cache_seqlens', [True])
117
- @pytest.mark.parametrize("seqlen", [1, 4, 5])
118
- # @pytest.mark.parametrize('seqlen', [4])
119
- @pytest.mark.parametrize("width", [2, 3, 4])
120
- # @pytest.mark.parametrize('width', [4])
121
- @pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096])
122
- # @pytest.mark.parametrize("dim", [2048])
123
- def test_causal_conv1d_update(dim, width, seqlen, has_cache_seqlens, has_bias, silu_activation, itype):
124
- device = "cuda"
125
- rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
126
- if itype == torch.bfloat16:
127
- rtol, atol = 1e-2, 5e-2
128
- rtolw, atolw = (1e-3, 1e-3)
129
- # set seed
130
- torch.random.manual_seed(0)
131
- batch = 64
132
- # batch = 1
133
- # dim = 64
134
- x = torch.randn(batch, seqlen, dim, device=device, dtype=itype).transpose(-1, -2)
135
- state_len = torch.randint(width - 1, width + 10, (1,)).item()
136
- conv_state = torch.randn(batch, state_len, dim, device=device, dtype=itype).transpose(-1, -2)
137
- weight = torch.randn(dim, width, device=device, dtype=torch.float32, requires_grad=True)
138
- if has_bias:
139
- bias = torch.randn(dim, device=device, dtype=torch.float32, requires_grad=True)
140
- else:
141
- bias = None
142
- conv_state_ref = conv_state.detach().clone()
143
- activation = None if not silu_activation else "silu"
144
- cache_seqlens = (torch.randint(0, 1024, (batch,), dtype=torch.int32, device=device)
145
- if has_cache_seqlens else None)
146
- out = causal_conv1d_update(x, conv_state, weight, bias, activation=activation, cache_seqlens=cache_seqlens)
147
- out_ref = causal_conv1d_update_ref(x, conv_state_ref, weight, bias, activation=activation, cache_seqlens=cache_seqlens)
148
-
149
- print(f"Output max diff: {(out - out_ref).abs().max().item()}")
150
- print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
151
- assert torch.equal(conv_state, conv_state_ref)
152
- assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
153
-
154
- @pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
155
- # @pytest.mark.parametrize('itype', [torch.float16])
156
- @pytest.mark.parametrize("silu_activation", [False, True])
157
- # @pytest.mark.parametrize('silu_activation', [True])
158
- @pytest.mark.parametrize("has_bias", [False, True])
159
- # @pytest.mark.parametrize('has_bias', [True])
160
- @pytest.mark.parametrize("has_cache_seqlens", [False, True])
161
- # @pytest.mark.parametrize('has_cache_seqlens', [True])
162
- @pytest.mark.parametrize("seqlen", [1, 4, 5])
163
- # @pytest.mark.parametrize('seqlen', [4])
164
- @pytest.mark.parametrize("width", [2, 3, 4])
165
- # @pytest.mark.parametrize('width', [4])
166
- @pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096])
167
- # @pytest.mark.parametrize("dim", [2048])
168
- def test_causal_conv1d_update_with_batch_gather(dim, width, seqlen, has_cache_seqlens, has_bias, silu_activation, itype):
169
- device = "cuda"
170
- rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
171
- if itype == torch.bfloat16:
172
- rtol, atol = 1e-2, 5e-2
173
- rtolw, atolw = (1e-3, 1e-3)
174
- # set seed
175
- torch.random.manual_seed(0)
176
- batch = 64
177
- # batch = 1
178
- # dim = 64
179
- x = torch.randn(batch, seqlen, dim, device=device, dtype=itype).transpose(-1, -2)
180
- state_len = torch.randint(width - 1, width + 10, (1,)).item()
181
-
182
- total_entries = 10 * batch
183
- conv_state = torch.randn(total_entries, state_len, dim, device=device, dtype=itype).transpose(-1, -2)
184
- conv_state_indices = torch.randperm(total_entries)[:batch].to(dtype=torch.int32, device=device)
185
-
186
- weight = torch.randn(dim, width, device=device, dtype=torch.float32, requires_grad=True)
187
- if has_bias:
188
- bias = torch.randn(dim, device=device, dtype=torch.float32, requires_grad=True)
189
- else:
190
- bias = None
191
- conv_state_ref = conv_state[conv_state_indices, :].detach().clone()
192
- activation = None if not silu_activation else "silu"
193
- cache_seqlens = (torch.randint(0, 1024, (batch,), dtype=torch.int32, device=device)
194
- if has_cache_seqlens else None)
195
- out = causal_conv1d_update(x, conv_state, weight, bias, activation=activation,
196
- cache_seqlens=cache_seqlens, conv_state_indices=conv_state_indices)
197
- out_ref = causal_conv1d_update_ref(x, conv_state_ref, weight, bias, activation=activation, cache_seqlens=cache_seqlens)
198
-
199
- print(f"Output max diff: {(out - out_ref).abs().max().item()}")
200
- print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
201
- assert torch.equal(conv_state[conv_state_indices, :], conv_state_ref)
202
- assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
203
-
204
-
205
- @pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
206
- # @pytest.mark.parametrize('itype', [torch.float16])
207
- @pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096])
208
- # @pytest.mark.parametrize("dim", [2048])
209
- def test_causal_conv1d_get_states(dim, itype):
210
- device = "cuda"
211
- # set seed
212
- torch.random.manual_seed(0)
213
- seqlens = torch.randint(1, 32, (100,), device=device)
214
- total_seqlen = seqlens.sum().item()
215
- x = torch.randn(total_seqlen, dim, device=device, dtype=itype)
216
- cu_seqlens = F.pad(seqlens.cumsum(0), (1, 0))
217
- state_len = 20
218
- out = causal_conv1d_varlen_states(x, cu_seqlens, state_len)
219
- out_ref = causal_conv1d_varlen_states_ref(x, cu_seqlens, state_len)
220
- assert torch.equal(out, out_ref)
221
-
222
-
223
- # @pytest.mark.parametrize("channel_last", [False, True])
224
- @pytest.mark.parametrize('channel_last', [True])
225
- # @pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
226
- @pytest.mark.parametrize('itype', [torch.bfloat16])
227
- # @pytest.mark.parametrize("silu_activation", [False, True])
228
- @pytest.mark.parametrize('silu_activation', [True])
229
- # @pytest.mark.parametrize("has_bias", [False, True])
230
- @pytest.mark.parametrize('has_bias', [True])
231
- # @pytest.mark.parametrize("width", [2, 3, 4])
232
- @pytest.mark.parametrize('width', [4])
233
- @pytest.mark.parametrize(
234
- # "seqlen", [8, 16, 32, 64, 128, 151, 256, 372, 512, 784, 1024, 1134, 2048, 4096]
235
- "seqlen", [2048]
236
- )
237
- # @pytest.mark.parametrize('seqlen', [8, 16, 32, 64, 128, 256, 512, 784, 1024, 2048, 4096])
238
- # @pytest.mark.parametrize('seqlen', [128])
239
- def test_causal_conv1d_race_condition(seqlen, width, has_bias, silu_activation, itype, channel_last):
240
- device = "cuda"
241
- # set seed
242
- torch.random.manual_seed(0)
243
- batch = 2
244
- # batch = 1
245
- dim = 4096 + 32 # Try dim not divisible by 64
246
- # dim = 64
247
- if not channel_last:
248
- x = torch.randn(batch, 4096 + dim + 64, seqlen, device=device, dtype=itype)[:, 4096:4096 + dim, :].requires_grad_()
249
- else:
250
- x = rearrange(
251
- torch.randn(batch, seqlen, 4096 + dim + 64, device=device, dtype=itype)[:, :, 4096:4096 + dim], "b s d -> b d s"
252
- ).requires_grad_()
253
- weight = torch.randn(dim, width, device=device, dtype=torch.float32, requires_grad=True)
254
- if has_bias:
255
- bias = torch.randn(dim, device=device, dtype=torch.float32, requires_grad=True)
256
- else:
257
- bias = None
258
- activation = None if not silu_activation else "silu"
259
- out0 = causal_conv1d_fn(x, weight, bias, activation=activation)
260
- g = torch.randn_like(out0)
261
- dx0, dw0, db0 = torch.autograd.grad(out0, (x, weight, bias), g)
262
- dw_atol = 1e-4
263
- db_atol = 1e-4
264
-
265
- for i in range(10000):
266
- out = causal_conv1d_fn(x, weight, bias, activation=activation)
267
- dx, dw, db = torch.autograd.grad(out, (x, weight, bias), g)
268
- dw_equal = torch.allclose(dw, dw0, atol=dw_atol)
269
- # if not dw_equal:
270
- # breakpoint()
271
- if has_bias:
272
- db_equal = torch.allclose(db, db0, atol=db_atol)
273
- # if not db_equal:
274
- # breakpoint()
275
- assert torch.equal(out, out0)
276
- assert torch.equal(dx, dx0)
277
- assert dw_equal
278
- if has_bias:
279
- assert dw_equal
280
-
281
-
282
- @pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
283
- # @pytest.mark.parametrize('itype', [torch.float16])
284
- @pytest.mark.parametrize("silu_activation", [False, True])
285
- # @pytest.mark.parametrize('silu_activation', [False])
286
- @pytest.mark.parametrize("has_bias", [False, True])
287
- # @pytest.mark.parametrize('has_bias', [False])
288
- @pytest.mark.parametrize("width", [2, 3, 4])
289
- # @pytest.mark.parametrize('width', [2])
290
- @pytest.mark.parametrize(
291
- "seqlen", [8, 16, 32, 64, 128, 151, 256, 372, 512, 784, 1024, 1134, 2048, 4096]
292
- )
293
- # @pytest.mark.parametrize('seqlen', [8, 16, 32, 64, 128, 256, 512, 784, 1024, 2048, 4096])
294
- # @pytest.mark.parametrize('seqlen', [2048])
295
- @pytest.mark.parametrize('dim', [64, 4096 + 32])
296
- # @pytest.mark.parametrize('dim', [64])
297
- def test_causal_conv1d_varlen(dim, seqlen, width, has_bias, silu_activation, itype):
298
- device = "cuda"
299
- rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
300
- if itype == torch.bfloat16:
301
- rtol, atol = 1e-2, 5e-2
302
- rtolw, atolw = (1e-3, 1e-3)
303
- # set seed
304
- torch.random.manual_seed(seqlen + dim + width)
305
- batch = 3
306
- seqlens = []
307
- for b in range(batch):
308
- nsplits = torch.randint(1, 5, (1,)).item()
309
- eos_pos = torch.randperm(seqlen - 1)[:nsplits].sort().values
310
- seqlens.append(torch.diff(torch.cat([torch.tensor([-1]), eos_pos, torch.tensor([seqlen - 1])])).tolist())
311
- assert sum(seqlens[-1]) == seqlen
312
- assert all(s > 0 for s in seqlens[-1])
313
- # Only support channel_last
314
- x = rearrange(
315
- torch.randn(batch, seqlen, 4096 + dim + 64, device=device, dtype=itype)[:, :, 4096:4096 + dim], "b s d -> b d s"
316
- ).requires_grad_()
317
- weight = torch.randn(dim, width, device=device, dtype=torch.float32, requires_grad=True)
318
- if has_bias:
319
- bias = torch.randn(dim, device=device, dtype=torch.float32, requires_grad=True)
320
- else:
321
- bias = None
322
- seq_idx = torch.stack([torch.cat([torch.full((s,), i, dtype=torch.int32, device=device) for i, s in enumerate(sl)], dim=0)
323
- for sl in seqlens], dim=0)
324
- x_ref = x.detach().clone().requires_grad_()
325
- weight_ref = weight.detach().clone().requires_grad_()
326
- bias_ref = bias.detach().clone().requires_grad_() if bias is not None else None
327
- activation = None if not silu_activation else "silu"
328
- out = causal_conv1d_fn(x, weight, bias, seq_idx=seq_idx, activation=activation)
329
- out_ref = []
330
- for b in range(batch):
331
- out_ref_b = []
332
- for x_s in torch.split(x_ref[[b]], seqlens[b], dim=2):
333
- out_ref_b.append(causal_conv1d_ref(x_s, weight_ref, bias_ref, activation=activation))
334
- out_ref.append(torch.cat(out_ref_b, dim=2))
335
- out_ref = torch.cat(out_ref, dim=0)
336
-
337
- print(f"Output max diff: {(out - out_ref).abs().max().item()}")
338
- print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
339
- assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
340
-
341
- g = torch.randn_like(out)
342
- out_ref.backward(g)
343
- out.backward(g)
344
-
345
- print(f"dx max diff: {(x.grad - x_ref.grad).abs().max().item()}")
346
- print(f"dweight max diff: {(weight.grad - weight_ref.grad).abs().max().item()}")
347
- if has_bias:
348
- print(f"dbias max diff: {(bias.grad - bias_ref.grad).abs().max().item()}")
349
-
350
- assert torch.allclose(x.grad, x_ref.grad.to(dtype=itype), rtol=rtol, atol=atol)
351
- assert torch.allclose(weight.grad, weight_ref.grad, rtol=rtolw, atol=atolw)
352
- if has_bias:
353
- assert torch.allclose(bias.grad, bias_ref.grad, rtol=rtolw, atol=atolw)
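
The varlen test above packs several independent sequences into each batch row and tells the kernel where the boundaries lie through `seq_idx`. The sketch below distills that packing pattern into a minimal usage example; it assumes a local build is importable as `causal_conv1d` and that `causal_conv1d_ref` is reachable from `causal_conv1d.causal_conv1d_interface`, as in this diff.

import torch

# Assumed import paths for a local build of this (now removed) extension.
from causal_conv1d import causal_conv1d_fn
from causal_conv1d.causal_conv1d_interface import causal_conv1d_ref

device = "cuda"
dim, width = 64, 4
lengths = [5, 3, 8]                 # three sequences packed into one batch row
seqlen = sum(lengths)

# Channel-last layout (stride(1) == 1) is required when seq_idx is used.
x = torch.randn(1, seqlen, dim, device=device).transpose(1, 2)
weight = torch.randn(dim, width, device=device)
bias = torch.randn(dim, device=device)

# seq_idx labels each position with the index of the packed sequence it belongs to.
seq_idx = torch.cat(
    [torch.full((n,), i, dtype=torch.int32, device=device) for i, n in enumerate(lengths)]
).unsqueeze(0)

out = causal_conv1d_fn(x, weight, bias, seq_idx=seq_idx, activation="silu")

# Reference: run each packed sequence independently and concatenate along time.
out_ref = torch.cat(
    [causal_conv1d_ref(x_s, weight, bias, activation="silu")
     for x_s in torch.split(x, lengths, dim=2)],
    dim=2,
)
print((out - out_ref).abs().max())
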
torch-ext/causal_conv1d/__init__.py DELETED
@@ -1,4 +0,0 @@
1
- from .causal_conv1d_interface import causal_conv1d_fn, causal_conv1d_update
2
- from .causal_conv1d_varlen import causal_conv1d_varlen_states
3
-
4
- __all__ = ["causal_conv1d_fn", "causal_conv1d_update", "causal_conv1d_varlen_states"]
 
 
 
 
 
torch-ext/causal_conv1d/causal_conv1d_interface.py DELETED
@@ -1,242 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao.
2
-
3
- import torch
4
- import torch.nn.functional as F
5
-
6
- from .cpp_functions import causal_conv1d_fwd_function, causal_conv1d_bwd_function, causal_conv1d_update_function
7
-
8
-
9
- class CausalConv1dFn(torch.autograd.Function):
10
- @staticmethod
11
- def forward(
12
- ctx,
13
- x,
14
- weight,
15
- bias=None,
16
- seq_idx=None,
17
- initial_states=None,
18
- return_final_states=False,
19
- final_states_out=None,
20
- activation=None,
21
- ):
22
- if activation not in [None, "silu", "swish"]:
23
- raise NotImplementedError("activation must be None, silu, or swish")
24
- if x.stride(2) != 1 and x.stride(1) != 1:
25
- x = x.contiguous()
26
- bias = bias.contiguous() if bias is not None else None
27
- if seq_idx is not None:
28
- assert (
29
- initial_states is None
30
- ), "initial_states must be None if seq_idx is not None"
31
- assert (
32
- not return_final_states
33
- ), "If seq_idx is not None, we don't return final_states_out"
34
- seq_idx = seq_idx.contiguous() if seq_idx is not None else None
35
- if initial_states is not None and (
36
- initial_states.stride(2) != 1 and initial_states.stride(1) != 1
37
- ):
38
- initial_states = initial_states.contiguous()
39
- if return_final_states:
40
- assert (
41
- x.stride(1) == 1
42
- ), "Only channel-last layout support returning final_states_out"
43
- if final_states_out is not None:
44
- assert (
45
- final_states_out.stride(2) == 1 or final_states_out.stride(1) == 1
46
- )
47
- else:
48
- batch, dim, seqlen = x.shape
49
- width = weight.shape[1]
50
- final_states_out = torch.empty(
51
- batch, width - 1, dim, device=x.device, dtype=x.dtype
52
- ).transpose(1, 2)
53
- else:
54
- final_states_out = None
55
- ctx.activation = activation in ["silu", "swish"]
56
- out = causal_conv1d_fwd_function(
57
- x, weight, bias, seq_idx, initial_states, final_states_out, ctx.activation
58
- )
59
- ctx.save_for_backward(x, weight, bias, seq_idx, initial_states)
60
- ctx.return_final_states = return_final_states
61
- ctx.return_dinitial_states = (
62
- initial_states is not None and initial_states.requires_grad
63
- )
64
- return out if not return_final_states else (out, final_states_out)
65
-
66
- @staticmethod
67
- def backward(ctx, dout, *args):
68
- x, weight, bias, seq_idx, initial_states = ctx.saved_tensors
69
- dfinal_states = args[0] if ctx.return_final_states else None
70
- if dout.stride(2) != 1 and dout.stride(1) != 1:
71
- dout = dout.contiguous()
72
- # The kernel supports passing in a pre-allocated dx (e.g., in case we want to fuse the
73
- # backward of conv1d with the backward of chunk).
74
- # Here we just pass in None and dx will be allocated in the C++ code.
75
- dx, dweight, dbias, dinitial_states = causal_conv1d_bwd_function(
76
- x,
77
- weight,
78
- bias,
79
- dout,
80
- seq_idx,
81
- initial_states,
82
- dfinal_states,
83
- None,
84
- ctx.return_dinitial_states,
85
- ctx.activation,
86
- )
87
- return (
88
- dx,
89
- dweight,
90
- dbias if bias is not None else None,
91
- None,
92
- dinitial_states if initial_states is not None else None,
93
- None,
94
- None,
95
- None,
96
- )
97
-
98
-
99
- def causal_conv1d_fn(
100
- x,
101
- weight,
102
- bias=None,
103
- seq_idx=None,
104
- initial_states=None,
105
- return_final_states=False,
106
- final_states_out=None,
107
- activation=None,
108
- ):
109
- """
110
- x: (batch, dim, seqlen)
111
- weight: (dim, width)
112
- bias: (dim,)
113
- seq_idx: (batch, seqlen)
114
- initial_states: (batch, dim, width - 1)
115
- final_states_out: (batch, dim, width - 1), to be written to
116
- activation: either None or "silu" or "swish"
117
-
118
- out: (batch, dim, seqlen)
119
- """
120
- return CausalConv1dFn.apply(
121
- x,
122
- weight,
123
- bias,
124
- seq_idx,
125
- initial_states,
126
- return_final_states,
127
- final_states_out,
128
- activation,
129
- )
130
-
131
-
132
- def causal_conv1d_ref(
133
- x,
134
- weight,
135
- bias=None,
136
- initial_states=None,
137
- return_final_states=False,
138
- final_states_out=None,
139
- activation=None,
140
- ):
141
- """
142
- x: (batch, dim, seqlen)
143
- weight: (dim, width)
144
- bias: (dim,)
145
- initial_states: (batch, dim, width - 1)
146
- final_states_out: (batch, dim, width - 1)
147
-
148
- out: (batch, dim, seqlen)
149
- """
150
- if activation not in [None, "silu", "swish"]:
151
- raise NotImplementedError("activation must be None, silu, or swish")
152
- dtype_in = x.dtype
153
- x = x.to(weight.dtype)
154
- seqlen = x.shape[-1]
155
- dim, width = weight.shape
156
- if initial_states is None:
157
- out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)
158
- else:
159
- x = torch.cat([initial_states, x], dim=-1)
160
- out = F.conv1d(x, weight.unsqueeze(1), bias, padding=0, groups=dim)
161
- out = out[..., :seqlen]
162
- if return_final_states:
163
- final_states = F.pad(x, (width - 1 - x.shape[-1], 0)).to(
164
- dtype_in
165
- ) # (batch, dim, width - 1)
166
- if final_states_out is not None:
167
- final_states_out.copy_(final_states)
168
- else:
169
- final_states_out = final_states
170
- out = (out if activation is None else F.silu(out)).to(dtype=dtype_in)
171
- return out if not return_final_states else (out, final_states_out)
172
-
173
-
174
- def causal_conv1d_update(x, conv_state, weight, bias=None, activation=None, cache_seqlens=None, conv_state_indices=None):
175
- """
176
- x: (batch, dim) or (batch, dim, seqlen)
177
- conv_state: (batch, dim, state_len), where state_len >= width - 1
178
- weight: (dim, width)
179
- bias: (dim,)
180
- cache_seqlens: (batch,), dtype int32.
181
- If not None, the conv_state is treated as a circular buffer.
182
- The conv_state will be updated by copying x to the conv_state starting at the index
183
- @cache_seqlens % state_len.
184
- conv_state_indices: (batch,), dtype int32
185
- If not None, the conv_state is a larger tensor along the batch dim,
186
- and we are selecting the batch coords specified by conv_state_indices.
187
- Useful for a continuous batching scenario.
188
-
189
- out: (batch, dim) or (batch, dim, seqlen)
190
- """
191
- if activation not in [None, "silu", "swish"]:
192
- raise NotImplementedError("activation must be None, silu, or swish")
193
- activation = activation in ["silu", "swish"]
194
- unsqueeze = x.dim() == 2
195
- if unsqueeze:
196
- x = x.unsqueeze(-1)
197
- out = causal_conv1d_update_function(
198
- x, conv_state, weight, bias, activation, cache_seqlens, conv_state_indices
199
- )
200
- if unsqueeze:
201
- out = out.squeeze(-1)
202
- return out
203
-
204
-
205
- def causal_conv1d_update_ref(x, conv_state, weight, bias=None, activation=None, cache_seqlens=None):
206
- """
207
- x: (batch, dim) or (batch, dim, seqlen)
208
- conv_state: (batch, dim, state_len), where state_len >= width - 1
209
- weight: (dim, width)
210
- bias: (dim,)
211
- cache_seqlens: (batch,), dtype int32.
212
- If not None, the conv_state is treated as a circular buffer.
213
- The conv_state will be updated by copying x to the conv_state starting at the index
214
- @cache_seqlens % state_len before performing the convolution.
215
-
216
- out: (batch, dim) or (batch, dim, seqlen)
217
- """
218
- if activation not in [None, "silu", "swish"]:
219
- raise NotImplementedError("activation must be None, silu, or swish")
220
- dtype_in = x.dtype
221
- unsqueeze = x.dim() == 2
222
- if unsqueeze:
223
- x = x.unsqueeze(-1)
224
- batch, dim, seqlen = x.shape
225
- width = weight.shape[1]
226
- state_len = conv_state.shape[-1]
227
- assert conv_state.shape == (batch, dim, state_len)
228
- assert weight.shape == (dim, width)
229
- if cache_seqlens is None:
230
- x_new = torch.cat([conv_state, x], dim=-1).to(weight.dtype) # (batch, dim, state_len + seqlen)
231
- conv_state.copy_(x_new[:, :, -state_len:])
232
- else:
233
- width_idx = torch.arange(-(width - 1), 0, dtype=torch.long, device=x.device).unsqueeze(0) + cache_seqlens.unsqueeze(1)
234
- width_idx = torch.remainder(width_idx, state_len).unsqueeze(1).expand(-1, dim, -1)
235
- x_new = torch.cat([conv_state.gather(2, width_idx), x], dim=-1).to(weight.dtype)
236
- copy_idx = torch.arange(seqlen, dtype=torch.long, device=x.device).unsqueeze(0) + cache_seqlens.unsqueeze(1)
237
- copy_idx = torch.remainder(copy_idx, state_len).unsqueeze(1).expand(-1, dim, -1)
238
- conv_state.scatter_(2, copy_idx, x)
239
- out = F.conv1d(x_new, weight.unsqueeze(1), bias, padding=0, groups=dim)[:, :, -seqlen:]
240
- if unsqueeze:
241
- out = out.squeeze(-1)
242
- return (out if activation is None else F.silu(out)).to(dtype=dtype_in)
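
Taken together, `causal_conv1d_fn` covers the prefill pass (optionally emitting the final convolution state) and `causal_conv1d_update` advances that state one token at a time during decoding. Below is a minimal prefill-then-decode sketch, assuming a local build importable as `causal_conv1d`; the tensor sizes are illustrative only.

import torch

from causal_conv1d import causal_conv1d_fn, causal_conv1d_update

device, dtype = "cuda", torch.float16
batch, dim, width, seqlen = 2, 64, 4, 16

weight = torch.randn(dim, width, device=device, dtype=dtype)
bias = torch.randn(dim, device=device, dtype=dtype)

# Prefill: channel-last layout is required when asking for the final states.
x = torch.randn(batch, seqlen, dim, device=device, dtype=dtype).transpose(1, 2)
out, final_states = causal_conv1d_fn(
    x, weight, bias, return_final_states=True, activation="silu"
)  # out: (batch, dim, seqlen), final_states: (batch, dim, width - 1)

# Decode: keep a rolling conv_state holding the last (width - 1) inputs.
conv_state = torch.zeros(batch, dim, width - 1, device=device, dtype=dtype)
conv_state.copy_(final_states)

x_new = torch.randn(batch, dim, device=device, dtype=dtype)  # one new token per sequence
y = causal_conv1d_update(x_new, conv_state, weight, bias, activation="silu")
print(y.shape)  # (batch, dim); conv_state was updated in place
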
torch-ext/causal_conv1d/causal_conv1d_varlen.py DELETED
@@ -1,86 +0,0 @@
1
- import torch
2
- from torch import Tensor
3
-
4
- import triton
5
- import triton.language as tl
6
-
7
-
8
- @triton.jit
9
- def _causal_conv1d_varlen_states(
10
- X,
11
- CU_SEQLENS,
12
- STATES,
13
- state_len,
14
- dim,
15
- stride_x_seqlen, stride_x_dim,
16
- stride_states_batch, stride_states_seqlen, stride_states_dim,
17
- BLOCK_M: tl.constexpr,
18
- BLOCK_N: tl.constexpr
19
- ):
20
- batch_idx = tl.program_id(2)
21
- STATES += batch_idx * stride_states_batch
22
- end_idx = tl.load(CU_SEQLENS + batch_idx + 1)
23
- start_idx = tl.maximum(tl.load(CU_SEQLENS + batch_idx), end_idx - state_len)
24
- rows = end_idx - (tl.program_id(1) + 1) * BLOCK_M + tl.arange(0, BLOCK_M)
25
- cols = tl.program_id(0) * BLOCK_N + tl.arange(0, BLOCK_N)
26
- x = tl.load(X + rows[:, None] * stride_x_seqlen + cols[None, :] * stride_x_dim,
27
- mask=(rows[:, None] >= start_idx) & (cols[None, :] < dim),
28
- other=0)
29
- rows_states = state_len - (tl.program_id(1) + 1) * BLOCK_M + tl.arange(0, BLOCK_M)
30
- tl.store(STATES + rows_states[:, None] * stride_states_seqlen + cols[None, :] * stride_states_dim,
31
- x,
32
- mask=(rows_states[:, None] >= 0) & (cols[None, :] < dim))
33
-
34
-
35
- def causal_conv1d_varlen_states(x: Tensor, cu_seqlens: Tensor, state_len: int) -> Tensor:
36
- """
37
- Forward pass only, does not support backward pass.
38
- Parameters:
39
- x: (total_tokens, dim)
40
- cu_seqlens: (batch + 1), must already be sorted. The cumulative sum of the sequence lengths, starting from 0.
41
- state_len: int. For each cu_seqlens, how many elements from x should be copied to the state.
42
- If some of those elements belong to a different sequence, the value of the states will be zero.
43
- Return:
44
- states: (batch, dim, state_len)
45
- """
46
- _, dim = x.shape
47
- batch = cu_seqlens.shape[0] - 1
48
- cu_seqlens = cu_seqlens.contiguous()
49
- states = torch.empty(batch, state_len, dim, dtype=x.dtype, device=x.device).transpose(1, 2)
50
- BLOCK_M = min(triton.next_power_of_2(state_len), 16)
51
- BLOCK_N = min(triton.next_power_of_2(dim), 256)
52
- grid = (triton.cdiv(dim, BLOCK_N), triton.cdiv(state_len, BLOCK_M), batch)
53
- with torch.cuda.device(x.device.index):
54
- _causal_conv1d_varlen_states[grid](
55
- x,
56
- cu_seqlens,
57
- states,
58
- state_len,
59
- dim,
60
- x.stride(0), x.stride(1),
61
- states.stride(0), states.stride(2), states.stride(1),
62
- BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N
63
- )
64
- return states
65
-
66
-
67
- def causal_conv1d_varlen_states_ref(x: Tensor, cu_seqlens: Tensor, state_len: int) -> Tensor:
68
- """
69
- Forward pass only, does not support backward pass.
70
- Parameters:
71
- x: (total_tokens, dim)
72
- cu_seqlens: (batch + 1), must already be sorted. The cumulative sum of the sequence lengths, starting from 0.
73
- state_len: int. For each cu_seqlens, how many elements from x should be copied to the state.
74
- If some of those elements belong to a different sequence, the value of the states will be zero.
75
- Return:
76
- states: (batch, dim, state_len)
77
- """
78
- _, dim = x.shape
79
- batch = cu_seqlens.shape[0] - 1
80
- cu_seqlens = cu_seqlens.contiguous()
81
- states = torch.zeros(batch, state_len, dim, dtype=x.dtype, device=x.device).transpose(1, 2)
82
- for i in range(batch):
83
- end_idx = cu_seqlens[i + 1]
84
- start_idx = torch.maximum(cu_seqlens[i], end_idx - state_len)
85
- states[i, :, -(end_idx - start_idx):] = x[start_idx:end_idx].T
86
- return states
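
`causal_conv1d_varlen_states` extracts, for each sequence in a packed `(total_tokens, dim)` buffer, its last `state_len` tokens (zero-padded on the left when a sequence is shorter), which is exactly the conv state needed to continue those sequences with `causal_conv1d_update`. A short usage sketch, checked against the reference above; the import paths assume a local build of this package.

import torch

from causal_conv1d import causal_conv1d_varlen_states
from causal_conv1d.causal_conv1d_varlen import causal_conv1d_varlen_states_ref

device = "cuda"
dim, state_len = 64, 3               # state_len is typically width - 1
lengths = [2, 7, 5]                  # per-sequence token counts in the packed buffer

x = torch.randn(sum(lengths), dim, device=device)
# Cumulative sequence lengths, starting from 0: [0, 2, 9, 14].
cu_seqlens = torch.tensor(
    [0] + torch.tensor(lengths).cumsum(0).tolist(), device=device, dtype=torch.int32
)

states = causal_conv1d_varlen_states(x, cu_seqlens, state_len)        # (batch, dim, state_len)
states_ref = causal_conv1d_varlen_states_ref(x, cu_seqlens, state_len)
print(torch.equal(states, states_ref), states.shape)
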
torch-ext/causal_conv1d/cpp_functions.py DELETED
@@ -1,96 +0,0 @@
1
- # Copyright (c) 2024, Tri Dao.
2
-
3
- import torch
4
-
5
- from ._ops import ops
6
-
7
- def causal_conv1d_fwd_function(
8
- x: torch.Tensor,
9
- weight: torch.Tensor,
10
- bias: torch.Tensor | None,
11
- seq_idx: torch.Tensor | None,
12
- initial_states: torch.Tensor | None,
13
- final_states_out: torch.Tensor | None,
14
- silu_activation: bool,
15
- ) -> torch.Tensor:
16
- out = torch.empty_like(x)
17
- ops.causal_conv1d_fwd(
18
- x=x,
19
- weight=weight,
20
- bias=bias,
21
- seq_idx=seq_idx,
22
- initial_states=initial_states,
23
- out=out,
24
- final_states_out=final_states_out,
25
- silu_activation=silu_activation,
26
- )
27
- return out
28
-
29
-
30
- def causal_conv1d_bwd_function(
31
- x: torch.Tensor,
32
- weight: torch.Tensor,
33
- bias: torch.Tensor | None,
34
- dout: torch.Tensor,
35
- seq_idx: torch.Tensor | None,
36
- initial_states: torch.Tensor | None,
37
- dfinal_states: torch.Tensor | None,
38
- dx: torch.Tensor | None,
39
- return_dinitial_states: bool,
40
- silu_activation: bool,
41
- ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
42
- batch_size, dim = x.size()[:2]
43
- width = weight.size(-1)
44
-
45
- if dx is None:
46
- dx = torch.empty_like(x)
47
- dweight = torch.zeros_like(weight, dtype=torch.float32)
48
- dbias = None
49
- if bias is not None:
50
- dbias = torch.zeros_like(bias, dtype=torch.float32)
51
- dinitial_states = None
52
- if return_dinitial_states:
53
- dinitial_states = torch.empty(batch_size, width - 1, dim, device=x.device, dtype=x.dtype).transpose(1, 2)
54
-
55
- ops.causal_conv1d_bwd(
56
- x=x,
57
- weight=weight,
58
- bias=bias,
59
- dout=dout,
60
- seq_idx=seq_idx,
61
- initial_states=initial_states,
62
- dfinal_states=dfinal_states,
63
- dx=dx,
64
- dweight=dweight,
65
- dbias=dbias,
66
- dinitial_states=dinitial_states,
67
- silu_activation=silu_activation,
68
- )
69
-
70
- dweight = dweight.type_as(weight)
71
- if dbias is not None:
72
- dbias = dbias.type_as(bias)
73
- return dx, dweight, dbias, dinitial_states
74
-
75
-
76
- def causal_conv1d_update_function(
77
- x: torch.Tensor,
78
- conv_state: torch.Tensor,
79
- weight: torch.Tensor,
80
- bias: torch.Tensor | None,
81
- silu_activation: bool,
82
- cache_seqlens: torch.Tensor | None,
83
- conv_state_indices: torch.Tensor | None,
84
- ) -> torch.Tensor:
85
- out = torch.empty_like(x)
86
- ops.causal_conv1d_update(
87
- x=x,
88
- conv_state=conv_state,
89
- weight=weight,
90
- bias=bias,
91
- out=out,
92
- silu_activation=silu_activation,
93
- cache_seqlens=cache_seqlens,
94
- conv_state_indices=conv_state_indices,
95
- )
96
- return out
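
The wrappers above follow an out-parameter convention: Python allocates the outputs (`out`, and for the backward `dx`, `dweight`, `dbias`, `dinitial_states`) and the C++ ops fill them in. A minimal sketch that calls the forward wrapper directly and compares it against the pure-PyTorch reference from `causal_conv1d_interface.py`; the import paths are assumptions for a local build.

import torch

from causal_conv1d.cpp_functions import causal_conv1d_fwd_function
from causal_conv1d.causal_conv1d_interface import causal_conv1d_ref

device = "cuda"
batch, dim, seqlen, width = 2, 64, 32, 4

x = torch.randn(batch, dim, seqlen, device=device)   # contiguous: stride(2) == 1
weight = torch.randn(dim, width, device=device)
bias = torch.randn(dim, device=device)

# Forward only: no seq_idx, no initial/final states, SiLU activation enabled.
out = causal_conv1d_fwd_function(
    x, weight, bias,
    seq_idx=None, initial_states=None, final_states_out=None,
    silu_activation=True,
)
out_ref = causal_conv1d_ref(x, weight, bias, activation="silu")
print((out - out_ref).abs().max())
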
torch-ext/pytorch_shim.h DELETED
@@ -1,105 +0,0 @@
1
- #pragma once
2
-
3
- #include <torch/library.h>
4
-
5
- /**
6
- * Unfortunately, the type signatures of the flash_attn ops are not compatible
7
- * with the PyTorch library bindings. To get around that we use
8
- * `make_pytorch_shim` which creates a lambda that exposes the API using
9
- * PyTorch-compatible types, then converts them to the types
10
- * expected by the flash_attn ops. This shim allows us to make minimal changes
11
- * to `flash_api.cpp` making it easier to synchronize with upstream changes.
12
- *
13
- * The `pytorch_library_compatible_type` struct is used to map from the
14
- * flash_attn ops types to a PyTorch library compatible one. The main issue is
15
- * that the following types are not supported by PyTorch library bindings:
16
- * - `int`
17
- * - `float`
18
- * - `std::optional<T> &`
19
- * - `std::optional<const at::Tensor> &`
20
- * So we convert them to (respectively):
21
- * - `int64_t`
22
- * - `double`
23
- * - `const std::optional<T>&`
24
- * - `const std::optional<at::Tensor>&`
25
- */
26
-
27
- template<typename T>
28
- struct pytorch_library_compatible_type {
29
- using type = T;
30
- static T convert_from_type(T arg) { return arg; }
31
- };
32
-
33
- template<typename T>
34
- using pytorch_library_compatible_type_t = \
35
- typename pytorch_library_compatible_type<T>::type;
36
-
37
- template<typename T>
38
- T convert_from_pytorch_compatible_type(pytorch_library_compatible_type_t<T> arg)
39
- { return pytorch_library_compatible_type<T>::convert_from_type(arg); }
40
-
41
- // Map `std::optional<T> &` -> `const std::optional<T>&`
42
- // (NOTE: this is a bit unsafe but none of the ops in flash_attn mutate
43
- // the optional container)
44
- template<typename T>
45
- struct pytorch_library_compatible_type<std::optional<T> &> {
46
- using type = const std::optional<T>&;
47
- static std::optional<T>& convert_from_type(const std::optional<T> &arg) {
48
- return const_cast<std::optional<T>&>(arg);
49
- }
50
- };
51
-
52
- // Map `std::optional<T>` ->
53
- // `std::optional<pytorch_library_compatible_type_t<T>>`
54
- // (NOTE: tested for `std::optional<int>` -> `std::optional<int64_t>`)
55
- template<typename T>
56
- struct pytorch_library_compatible_type<std::optional<T>> {
57
- using type = std::optional<pytorch_library_compatible_type_t<T>>;
58
- static std::optional<pytorch_library_compatible_type_t<T>> convert_from_type(std::optional<T> arg) {
59
- return arg;
60
- }
61
- };
62
-
63
- // Map `std::optional<const at::Tensor>&` -> `const std::optional<at::Tensor>&`
64
- template<>
65
- struct pytorch_library_compatible_type<std::optional<const at::Tensor> &> {
66
- using type = const std::optional<at::Tensor>&;
67
- static std::optional<const at::Tensor>& convert_from_type(
68
- const std::optional<at::Tensor> &arg) {
69
- return const_cast<std::optional<const at::Tensor>&>(
70
- reinterpret_cast<const std::optional<const at::Tensor>&>(arg));
71
- }
72
- };
73
-
74
- // Map `int` -> `int64_t`
75
- template<> struct pytorch_library_compatible_type<int> {
76
- using type = int64_t;
77
- static int convert_from_type(int64_t arg) {
78
- TORCH_CHECK(arg <= std::numeric_limits<int>::max(),
79
- "int64_t value is too large to be converted to int");
80
- TORCH_CHECK(arg >= std::numeric_limits<int>::min(),
81
- "int64_t value is too small to be converted to int");
82
- return arg;
83
- }
84
- };
85
-
86
- // Map `float` -> `double`
87
- template<> struct pytorch_library_compatible_type<float> {
88
- using type = double;
89
- static float convert_from_type(double arg) {
90
- TORCH_CHECK(std::abs(arg) <= std::numeric_limits<float>::max(),
91
- "double value is too large to be converted to float");
92
- return arg;
93
- }
94
- };
95
-
96
- //
97
- // Shim Utils
98
- //
99
-
100
- template <typename Ret, typename... Args>
101
- auto make_pytorch_shim(Ret(*fun)(Args... args)){
102
- return [fun](pytorch_library_compatible_type_t<Args>... args) {
103
- return fun(convert_from_pytorch_compatible_type<Args>(args)...);
104
- };
105
- }
torch-ext/torch_binding.cpp DELETED
@@ -1,32 +0,0 @@
1
- #include <torch/library.h>
2
-
3
- #include "registration.h"
4
-
5
- #include "pytorch_shim.h"
6
- #include "torch_binding.h"
7
-
8
- TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
9
- ops.def(
10
- "causal_conv1d_fwd("
11
- " Tensor x, Tensor weight, Tensor? bias, Tensor? seq_idx,"
12
- " Tensor? initial_states, Tensor! out, Tensor!? final_states_out,"
13
- " bool silu_activation) -> ()");
14
- ops.impl("causal_conv1d_fwd", torch::kCUDA, make_pytorch_shim(&causal_conv1d_fwd));
15
-
16
- ops.def(
17
- "causal_conv1d_bwd("
18
- " Tensor x, Tensor weight, Tensor? bias, Tensor! dout,"
19
- " Tensor? seq_idx, Tensor? initial_states, Tensor? dfinal_states,"
20
- " Tensor! dx, Tensor! dweight, Tensor!? dbias,"
21
- " Tensor!? dinitial_states, bool silu_activation) -> ()");
22
- ops.impl("causal_conv1d_bwd", torch::kCUDA, make_pytorch_shim(&causal_conv1d_bwd));
23
-
24
- ops.def(
25
- "causal_conv1d_update("
26
- " Tensor x, Tensor conv_state, Tensor weight, Tensor? bias,"
27
- " Tensor! out, bool silu_activation, Tensor? cache_seqlens,"
28
- " Tensor? conv_state_indices) -> ()");
29
- ops.impl("causal_conv1d_update", torch::kCUDA, make_pytorch_shim(&causal_conv1d_update));
30
- }
31
-
32
- REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
torch-ext/torch_binding.h DELETED
@@ -1,39 +0,0 @@
1
- #pragma once
2
-
3
- #include <torch/torch.h>
4
-
5
- void
6
- causal_conv1d_fwd(const at::Tensor &x,
7
- const at::Tensor &weight,
8
- const c10::optional<at::Tensor> &bias_,
9
- const c10::optional<at::Tensor> &seq_idx_,
10
- const c10::optional<at::Tensor> &initial_states_,
11
- at::Tensor &out,
12
- c10::optional<at::Tensor> &final_states_out_,
13
- bool silu_activation);
14
-
15
- void
16
- causal_conv1d_bwd(const at::Tensor &x,
17
- const at::Tensor &weight,
18
- const c10::optional<at::Tensor> &bias_,
19
- at::Tensor &dout,
20
- const c10::optional<at::Tensor> &seq_idx_,
21
- const c10::optional<at::Tensor> &initial_states_,
22
- const c10::optional<at::Tensor> &dfinal_states_,
23
- at::Tensor &dx,
24
- at::Tensor &dweight,
25
- c10::optional<at::Tensor> &dbias_,
26
- c10::optional<at::Tensor> &dinitial_states_,
27
- bool silu_activation);
28
-
29
- void
30
- causal_conv1d_update(const at::Tensor &x,
31
- const at::Tensor &conv_state,
32
- const at::Tensor &weight,
33
- const c10::optional<at::Tensor> &bias_,
34
- at::Tensor &out,
35
- bool silu_activation,
36
- const c10::optional<at::Tensor> &cache_seqlens_,
37
- const c10::optional<at::Tensor> &conv_state_indices_
38
- );
39
-