Skip to content

vllm.model_executor.layers.fused_moe.fused_moe_method_base

logger module-attribute

logger = init_logger(__name__)

FusedMoEMethodBase

Bases: QuantizeMethodBase

Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
class FusedMoEMethodBase(QuantizeMethodBase):
    """Abstract base class for fused-MoE quantization/compute methods.

    Subclasses implement weight creation and the forward ``apply`` pass for a
    particular quantization scheme, and may opt in to modular-kernel (MK)
    execution paths via the hooks below.
    """

    def __init__(self, moe: FusedMoEConfig):
        super().__init__()
        # Static MoE layer configuration for this method instance.
        self.moe: FusedMoEConfig = moe
        # Quantization config; None until populated (see
        # get_fused_moe_quant_config, which subclasses must implement).
        self.moe_quant_config: FusedMoEQuantConfig | None = None

    @property
    def supports_mk_interally(self) -> bool:
        """
        Returns True if this method supports using modular kernels (MK)
        internally for MoE operations, False otherwise.

        This method should be overridden by subclasses that support
        modular kernels internally.

        NOTE: the name misspells "internally" but is kept for backward
        compatibility; prefer ``supports_mk_internally`` in new code.
        """
        return False

    @property
    def supports_mk_internally(self) -> bool:
        """Correctly-spelled alias for ``supports_mk_interally``.

        Delegates so that subclasses overriding the legacy name are
        still honored through this accessor.
        """
        return self.supports_mk_interally

    @abstractmethod
    def create_weights(
        self,
        layer: torch.nn.Module,
        num_experts: int,
        hidden_size: int,
        intermediate_size_per_partition: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        """Create and register this method's weight parameters on `layer`."""
        raise NotImplementedError

    def uses_weight_scale_2_pattern(self) -> bool:
        """
        Returns True if this quantization method uses 'weight_scale_2' pattern
        for per-tensor weight scales (e.g., FP4 variants), False otherwise.

        This method should be overridden by subclasses that use the
        'weight_scale_2' pattern instead of the standard 'weight_scale' pattern.
        """
        return False

    def maybe_make_prepare_finalize(
        self,
        routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
    ) -> FusedMoEPrepareAndFinalize | None:
        """Build a prepare/finalize object for all2all dispatch, if available.

        Delegates to ``all2all_utils.maybe_make_prepare_finalize`` with this
        method's MoE and quantization configs; may return None.
        """
        from .all2all_utils import maybe_make_prepare_finalize

        return maybe_make_prepare_finalize(
            self.moe, self.moe_quant_config, routing_tables
        )

    def select_gemm_impl(
        self,
        prepare_finalize: FusedMoEPrepareAndFinalize,
        layer: torch.nn.Module,
    ) -> FusedMoEPermuteExpertsUnpermute:
        """Select the GEMM implementation matching `prepare_finalize`.

        Raises:
            NotImplementedError: always, unless overridden by a subclass that
                supports modular-kernel execution.
        """
        # based on the all2all implementation, select the appropriate
        # gemm implementation
        raise NotImplementedError(
            f"{self.__class__.__name__} must select appropriate gemm "
            "implementation based on the prepare_finalize"
        )

    def prepare_dp_allgather_tensor(
        self,
        layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
    ) -> tuple[torch.Tensor, list[torch.Tensor]]:
        """Hook to prepare tensors and extra tensors for DP allgather + EP dispatch."""
        raise NotImplementedError(
            "Method 'prepare_dp_allgather_tensor' is not implemented in "
            f"{self.__class__.__name__}."
        )

    @abstractmethod
    def get_fused_moe_quant_config(
        self, layer: torch.nn.Module
    ) -> FusedMoEQuantConfig | None:
        """Return the quantization config for `layer`, or None if unquantized."""
        raise NotImplementedError

    @property
    def topk_indices_dtype(self) -> torch.dtype | None:
        """Preferred dtype for top-k expert indices; None means no preference."""
        return None

    @property
    def supports_eplb(self) -> bool:
        """Whether this method supports expert-parallel load balancing (EPLB)."""
        return False

    @property
    def allow_inplace(self) -> bool:
        """Whether `apply` may write its result into the input tensor."""
        return False

    @property
    def method_name(self) -> str:
        """Human-readable identifier for this method (the class name)."""
        return self.__class__.__name__

    @abstractmethod
    def apply(
        self,
        layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
        router: FusedMoERouter,
        x: torch.Tensor,
        router_logits: torch.Tensor,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        """Run the fused-MoE forward pass for `x` using `router_logits`."""
        raise NotImplementedError

allow_inplace property

allow_inplace: bool

method_name property

method_name: str

moe instance-attribute

moe: FusedMoEConfig = moe

moe_quant_config instance-attribute

moe_quant_config: FusedMoEQuantConfig | None = None

supports_eplb property

supports_eplb: bool

supports_mk_interally property

supports_mk_interally: bool

Returns True if this method supports using modular kernels (MK) internally for MoE operations, False otherwise.

This method should be overridden by subclasses that support modular kernels internally.

topk_indices_dtype property

topk_indices_dtype: dtype | None

__init__

__init__(moe: FusedMoEConfig)
Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
def __init__(self, moe: FusedMoEConfig):
    super().__init__()
    self.moe: FusedMoEConfig = moe
    self.moe_quant_config: FusedMoEQuantConfig | None = None

apply abstractmethod

apply(
    layer: FusedMoE,
    router: FusedMoERouter,
    x: Tensor,
    router_logits: Tensor,
) -> Tensor | tuple[Tensor, Tensor]
Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
@abstractmethod
def apply(
    self,
    layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
    router: FusedMoERouter,
    x: torch.Tensor,
    router_logits: torch.Tensor,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
    raise NotImplementedError

create_weights abstractmethod

create_weights(
    layer: Module,
    num_experts: int,
    hidden_size: int,
    intermediate_size_per_partition: int,
    params_dtype: dtype,
    **extra_weight_attrs,
)
Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
@abstractmethod
def create_weights(
    self,
    layer: torch.nn.Module,
    num_experts: int,
    hidden_size: int,
    intermediate_size_per_partition: int,
    params_dtype: torch.dtype,
    **extra_weight_attrs,
):
    raise NotImplementedError

get_fused_moe_quant_config abstractmethod

get_fused_moe_quant_config(
    layer: Module,
) -> FusedMoEQuantConfig | None
Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
@abstractmethod
def get_fused_moe_quant_config(
    self, layer: torch.nn.Module
) -> FusedMoEQuantConfig | None:
    raise NotImplementedError

maybe_make_prepare_finalize

maybe_make_prepare_finalize(
    routing_tables: tuple[Tensor, Tensor, Tensor]
    | None = None,
) -> FusedMoEPrepareAndFinalize | None
Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
def maybe_make_prepare_finalize(
    self,
    routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
) -> FusedMoEPrepareAndFinalize | None:
    from .all2all_utils import maybe_make_prepare_finalize

    return maybe_make_prepare_finalize(
        self.moe, self.moe_quant_config, routing_tables
    )

prepare_dp_allgather_tensor

prepare_dp_allgather_tensor(
    layer: FusedMoE,
    hidden_states: Tensor,
    router_logits: Tensor,
) -> tuple[Tensor, list[Tensor]]

Hook to prepare tensors and extra tensors for DP allgather + EP dispatch.

Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
def prepare_dp_allgather_tensor(
    self,
    layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
    hidden_states: torch.Tensor,
    router_logits: torch.Tensor,
) -> tuple[torch.Tensor, list[torch.Tensor]]:
    """Hook to prepare tensors and extra tensors for DP allgather + EP dispatch."""
    raise NotImplementedError(
        "Method 'prepare_dp_allgather_tensor' is not implemented in "
        f"{self.__class__.__name__}."
    )

select_gemm_impl

select_gemm_impl(
    prepare_finalize: FusedMoEPrepareAndFinalize,
    layer: Module,
) -> FusedMoEPermuteExpertsUnpermute
Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
def select_gemm_impl(
    self,
    prepare_finalize: FusedMoEPrepareAndFinalize,
    layer: torch.nn.Module,
) -> FusedMoEPermuteExpertsUnpermute:
    # based on the all2all implementation, select the appropriate
    # gemm implementation
    raise NotImplementedError(
        f"{self.__class__.__name__} must select appropriate gemm "
        "implementation based on the prepare_finalize"
    )

uses_weight_scale_2_pattern

uses_weight_scale_2_pattern() -> bool

Returns True if this quantization method uses 'weight_scale_2' pattern for per-tensor weight scales (e.g., FP4 variants), False otherwise.

This method should be overridden by subclasses that use the 'weight_scale_2' pattern instead of the standard 'weight_scale' pattern.

Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
def uses_weight_scale_2_pattern(self) -> bool:
    """
    Returns True if this quantization method uses 'weight_scale_2' pattern
    for per-tensor weight scales (e.g., FP4 variants), False otherwise.

    This method should be overridden by subclasses that use the
    'weight_scale_2' pattern instead of the standard 'weight_scale' pattern.
    """
    return False