use cache

jiahanc · jiahanc · commit 951f1a28ae14 · 2025-11-06T19:45:09.000-08:00
Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
@@ -657,6 +657,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             from flashinfer.fused_moe.core import (
                 convert_to_block_layout,
                 get_w2_permute_indices_with_cache,
+                _maybe_get_cached_w3_w1_permute_indices,
             )
 
             # Swap halves to arrange as [w3; w1] (kernel expectation)
@@ -668,25 +669,25 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             # Reorder rows of W1 for fused gated activation
             w13_weights_bf16_shuffled = []
             w2_weights_bf16_shuffled = []
-            for i in range(self.moe.num_experts):
-                permute_indices = get_w2_permute_indices_with_cache(
+            for i in range(layer.local_num_experts):
+                permute_indices = _maybe_get_cached_w3_w1_permute_indices(
                     self._cache_permute_indices,
-                    layer.w13_weight.data[i].clone().view(torch.uint8),
+                    layer.w13_weight.data[i].view(torch.uint8),
                     epilogue_tile_m,
                 )
                 tmp_weights1 = (
-                    layer.w13_weight.data[i]
+                    layer.w13_weight.data[i].clone()
                     .view(torch.uint8)[permute_indices.to(layer.w13_weight.data.device)]
                     .contiguous()
                 )
 
                 permute_indices = get_w2_permute_indices_with_cache(
                     self._cache_permute_indices,
-                    layer.w2_weight.data[i].clone().view(torch.uint8),
+                    layer.w2_weight.data[i].view(torch.uint8),
                     epilogue_tile_m,
                 )
                 tmp_weights2 = (
-                    layer.w2_weight.data[i]
+                    layer.w2_weight.data[i].clone()
                     .view(torch.uint8)[permute_indices.to(layer.w2_weight.data.device)]
                     .contiguous()
                 )
@@ -1508,7 +1509,6 @@ def __init__(
                 )
             else:
                 self.routing_method_type = RoutingMethodType.TopK
-
         self.moe_config: FusedMoEConfig = FusedMoEConfig(
             num_experts=self.global_num_experts,
             experts_per_token=top_k,