Skip to content

vllm.model_executor.layers.fused_moe.fused_moe_method_base

logger module-attribute

logger = init_logger(__name__)

FusedMoEMethodBase

Bases: QuantizeMethodBase

Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
class FusedMoEMethodBase(QuantizeMethodBase):
    """Abstract base class for fused-MoE quantization/compute methods.

    Subclasses implement weight creation and the forward ``apply`` pass for a
    particular quantization scheme, and may opt in to modular-kernel (MK)
    execution paths via the hooks below.
    """

    def __init__(self, moe: FusedMoEConfig):
        super().__init__()
        # Static MoE layer configuration for this method instance.
        self.moe: FusedMoEConfig = moe
        # Quantization config; None until populated (see
        # get_fused_moe_quant_config, which subclasses must implement).
        self.moe_quant_config: FusedMoEQuantConfig | None = None

    @property
    def supports_mk_interally(self) -> bool:
        """
        Returns True if this method supports using modular kernels (MK)
        internally for MoE operations, False otherwise.

        This method should be overridden by subclasses that support
        modular kernels internally.

        NOTE: the name misspells "internally" but is kept for backward
        compatibility; prefer ``supports_mk_internally`` in new code.
        """
        return False

    @property
    def supports_mk_internally(self) -> bool:
        """Correctly-spelled alias for ``supports_mk_interally``.

        Delegates so that subclasses overriding the legacy name are
        still honored through this accessor.
        """
        return self.supports_mk_interally

    @abstractmethod
    def create_weights(
        self,
        layer: torch.nn.Module,
        num_experts: int,
        hidden_size: int,
        intermediate_size_per_partition: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        """Create and register this method's weight parameters on `layer`."""
        raise NotImplementedError

    def uses_weight_scale_2_pattern(self) -> bool:
        """
        Returns True if this quantization method uses 'weight_scale_2' pattern
        for per-tensor weight scales (e.g., FP4 variants), False otherwise.

        This method should be overridden by subclasses that use the
        'weight_scale_2' pattern instead of the standard 'weight_scale' pattern.
        """
        return False

    def maybe_make_prepare_finalize(
        self,
        routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
    ) -> FusedMoEPrepareAndFinalize | None:
        """Build a prepare/finalize object for all2all dispatch, if available.

        Delegates to ``all2all_utils.maybe_make_prepare_finalize`` with this
        method's MoE and quantization configs; may return None.
        """
        from .all2all_utils import maybe_make_prepare_finalize

        return maybe_make_prepare_finalize(
            self.moe, self.moe_quant_config, routing_tables
        )

    def select_gemm_impl(
        self,
        prepare_finalize: FusedMoEPrepareAndFinalize,
        layer: torch.nn.Module,
    ) -> FusedMoEPermuteExpertsUnpermute:
        """Select the GEMM implementation matching `prepare_finalize`.

        Raises:
            NotImplementedError: always, unless overridden by a subclass that
                supports modular-kernel execution.
        """
        # based on the all2all implementation, select the appropriate
        # gemm implementation
        raise NotImplementedError(
            f"{self.__class__.__name__} must select appropriate gemm "
            "implementation based on the prepare_finalize"
        )

    def prepare_dp_allgather_tensor(
        self,
        layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
    ) -> tuple[torch.Tensor, list[torch.Tensor]]:
        """Hook to prepare tensors and extra tensors for DP allgather + EP dispatch."""
        raise NotImplementedError(
            "Method 'prepare_dp_allgather_tensor' is not implemented in "
            f"{self.__class__.__name__}."
        )

    @abstractmethod
    def get_fused_moe_quant_config(
        self, layer: torch.nn.Module
    ) -> FusedMoEQuantConfig | None:
        """Return the quantization config for `layer`, or None if unquantized."""
        raise NotImplementedError

    @property
    def topk_indices_dtype(self) -> torch.dtype | None:
        """Preferred dtype for top-k expert indices; None means no preference."""
        return None

    @property
    def supports_eplb(self) -> bool:
        """Whether this method supports expert-parallel load balancing (EPLB)."""
        return False

    @property
    def allow_inplace(self) -> bool:
        """Whether `apply` may write its result into the input tensor."""
        return False

    @property
    def method_name(self) -> str:
        """Human-readable identifier for this method (the class name)."""
        return self.__class__.__name__

    @abstractmethod
    def apply(
        self,
        layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
        router: FusedMoERouter,
        x: torch.Tensor,
        router_logits: torch.Tensor,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        """Run the fused-MoE forward pass for `x` using `router_logits`."""
        raise NotImplementedError

allow_inplace property

allow_inplace: bool

method_name property

method_name: str

moe instance-attribute

moe: FusedMoEConfig = moe

moe_quant_config instance-attribute

moe_quant_config: FusedMoEQuantConfig | None = None

supports_eplb property

supports_eplb: bool

supports_mk_interally property

supports_mk_interally: bool

Returns True if this method supports using modular kernels (MK) internally for MoE operations, False otherwise.

This method should be overridden by subclasses that support modular kernels internally.

topk_indices_dtype property

topk_indices_dtype: dtype | None

__init__

__init__(moe: FusedMoEConfig)
Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
def __init__(self, moe: FusedMoEConfig):
    super().__init__()
    self.moe: FusedMoEConfig = moe
    self.moe_quant_config: FusedMoEQuantConfig | None = None

apply abstractmethod

apply(
    layer: FusedMoE,
    router: FusedMoERouter,
    x: Tensor,
    router_logits: Tensor,
) -> Tensor | tuple[Tensor, Tensor]
Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
@abstractmethod
def apply(
    self,
    layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
    router: FusedMoERouter,
    x: torch.Tensor,
    router_logits: torch.Tensor,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
    raise NotImplementedError

create_weights abstractmethod

create_weights(
    layer: Module,
    num_experts: int,
    hidden_size: int,
    intermediate_size_per_partition: int,
    params_dtype: dtype,
    **extra_weight_attrs,
)
Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
@abstractmethod
def create_weights(
    self,
    layer: torch.nn.Module,
    num_experts: int,
    hidden_size: int,
    intermediate_size_per_partition: int,
    params_dtype: torch.dtype,
    **extra_weight_attrs,
):
    raise NotImplementedError

get_fused_moe_quant_config abstractmethod

get_fused_moe_quant_config(
    layer: Module,
) -> FusedMoEQuantConfig | None
Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
@abstractmethod
def get_fused_moe_quant_config(
    self, layer: torch.nn.Module
) -> FusedMoEQuantConfig | None:
    raise NotImplementedError

maybe_make_prepare_finalize

maybe_make_prepare_finalize(
    routing_tables: tuple[Tensor, Tensor, Tensor]
    | None = None,
) -> FusedMoEPrepareAndFinalize | None
Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
def maybe_make_prepare_finalize(
    self,
    routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
) -> FusedMoEPrepareAndFinalize | None:
    from .all2all_utils import maybe_make_prepare_finalize

    return maybe_make_prepare_finalize(
        self.moe, self.moe_quant_config, routing_tables
    )

prepare_dp_allgather_tensor

prepare_dp_allgather_tensor(
    layer: FusedMoE,
    hidden_states: Tensor,
    router_logits: Tensor,
) -> tuple[Tensor, list[Tensor]]

Hook to prepare tensors and extra tensors for DP allgather + EP dispatch.

Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
def prepare_dp_allgather_tensor(
    self,
    layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
    hidden_states: torch.Tensor,
    router_logits: torch.Tensor,
) -> tuple[torch.Tensor, list[torch.Tensor]]:
    """Hook to prepare tensors and extra tensors for DP allgather + EP dispatch."""
    raise NotImplementedError(
        "Method 'prepare_dp_allgather_tensor' is not implemented in "
        f"{self.__class__.__name__}."
    )

select_gemm_impl

select_gemm_impl(
    prepare_finalize: FusedMoEPrepareAndFinalize,
    layer: Module,
) -> FusedMoEPermuteExpertsUnpermute
Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
def select_gemm_impl(
    self,
    prepare_finalize: FusedMoEPrepareAndFinalize,
    layer: torch.nn.Module,
) -> FusedMoEPermuteExpertsUnpermute:
    # based on the all2all implementation, select the appropriate
    # gemm implementation
    raise NotImplementedError(
        f"{self.__class__.__name__} must select appropriate gemm "
        "implementation based on the prepare_finalize"
    )

uses_weight_scale_2_pattern

uses_weight_scale_2_pattern() -> bool

Returns True if this quantization method uses 'weight_scale_2' pattern for per-tensor weight scales (e.g., FP4 variants), False otherwise.

This method should be overridden by subclasses that use the 'weight_scale_2' pattern instead of the standard 'weight_scale' pattern.

Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
def uses_weight_scale_2_pattern(self) -> bool:
    """
    Returns True if this quantization method uses 'weight_scale_2' pattern
    for per-tensor weight scales (e.g., FP4 variants), False otherwise.

    This method should be overridden by subclasses that use the
    'weight_scale_2' pattern instead of the standard 'weight_scale' pattern.
    """
    return False