Commit 1fdba5c

add configs to specify passes in compiler

1 parent 4caa379 · commit 1fdba5c

8 files changed: +248 −66 lines
torchtitan/experiments/compiler_toolkit/README.md (11 additions, 0 deletions)

@@ -29,7 +29,18 @@ NGPU=4 CONFIG_FILE=./torchtitan/models/deepseek_v3/train_configs/debug_model.tom
 NGPU=8 CONFIG_FILE=./torchtitan/models/llama3/train_configs/debug_model.toml ./run_train.sh --model.name compiler_toolkit.llama3 --parallelism.data_parallel_shard_degree=2 --parallelism.tensor_parallel_degree=4
 ```

+**SimpleFSDP + TP + auto-bucketing**
+```shell
+NGPU=8 CONFIG_FILE=./torchtitan/models/llama3/train_configs/debug_model.toml ./run_train.sh --model.name compiler_toolkit.llama3 --parallelism.data_parallel_shard_degree=2 --parallelism.tensor_parallel_degree=4 --job.custom_config_module=torchtitan.experiments.compiler_toolkit.job_config --compile.passes autobucketing_reordering
+```
+
 **SimpleFSDP + TP + FlexAttention**
 ```shell
 NGPU=8 CONFIG_FILE=./torchtitan/models/llama3/train_configs/debug_model.toml ./run_train.sh --model.name compiler_toolkit.llama3 --parallelism.data_parallel_shard_degree=2 --parallelism.tensor_parallel_degree=4 --model.flavor=debugmodel_flex_attn
 ```
+
+**SimpleFSDP + TP + FlexAttention + auto-bucketing + regional-inductor**
+
+```shell
+NGPU=8 CONFIG_FILE=./torchtitan/models/llama3/train_configs/debug_model.toml ./run_train.sh --model.name compiler_toolkit.llama3 --parallelism.data_parallel_shard_degree=2 --parallelism.tensor_parallel_degree=4 --job.custom_config_module=torchtitan.experiments.compiler_toolkit.job_config --compile.passes autobucketing_reordering,regional_inductor
+```

torchtitan/experiments/compiler_toolkit/common_utils.py (10 additions, 0 deletions)

@@ -53,3 +53,13 @@ def register_blockmask_pytree_node():
         flatten_with_keys_fn=BlockMask._flatten_with_keys,
         serialized_type_name="torch.nn.attention.flex_attention.BlockMask",
     )
+
+
+def validate_flex_attention_annotation(joint_with_descriptors):
+    """Verify user annotations show up in the graph."""
+    for node in joint_with_descriptors.graph_module.graph.nodes:
+        if node.target in {
+            torch.ops.higher_order.flex_attention,
+            torch.ops.higher_order.flex_attention_backward,
+        }:
+            assert "compile_with_inductor" in node.meta.get("custom", {})

torchtitan/experiments/compiler_toolkit/deepseek_v3/parallelize.py (21 additions, 22 deletions)

@@ -17,37 +17,19 @@
     disable_compile,
     parallelize_inputs,
     register_blockmask_pytree_node,
+    validate_flex_attention_annotation,
 )

 from torchtitan.experiments.compiler_toolkit.graph_utils import (
     CompiledModule,
+    get_compiler_passes_from_config,
     joint_graph_builder,
+    make_compiler_with_passes,
 )

 from torchtitan.experiments.simple_fsdp.deepseek_v3.parallelize import (
     parallelize_deepseekv3 as simple_fsdp_parallelize_deepseekv3,
 )
-from torchtitan.tools.logging import logger
-
-
-def compiler(name: str, gm: torch.fx.GraphModule, example_inputs):
-    logger.info(f"{name} before compiler:")
-    logger.info(gm.print_readable(print_output=False))
-
-    # TODO: regional_inductor should work with deepseek_v3
-    # gm = regional_inductor(gm, example_inputs)
-
-    logger.info(f"{name} after compiler:")
-    logger.info(gm.print_readable(print_output=False))
-    return gm
-
-
-def fw_compiler(gm: torch.fx.GraphModule, example_inputs) -> None:
-    return compiler("fwd_gm", gm, example_inputs)
-
-
-def bw_compiler(gm: torch.fx.GraphModule, example_inputs) -> None:
-    return compiler("bwd_gm", gm, example_inputs)


 def annotate_deepseekv3() -> None:
@@ -75,7 +57,17 @@ def parallelize_deepseekv3(
     parallel_dims: ParallelDims,
     job_config: JobConfig,
 ) -> CompiledModule:
+    """
+    Parallelize and compile a DeepSeek v3 model with optional custom compiler passes.
+
+    Args:
+        model: The model to parallelize
+        parallel_dims: Parallel dimensions configuration
+        job_config: Job configuration

+    Returns:
+        CompiledModule wrapping the parallelized and compiled model
+    """
     annotate_deepseekv3()

     register_blockmask_pytree_node()
@@ -84,11 +76,18 @@ def parallelize_deepseekv3(
     with disable_compile(job_config):
         model = simple_fsdp_parallelize_deepseekv3(model, parallel_dims, job_config)

+    # Get compiler passes from config
+    compiler_passes = get_compiler_passes_from_config(job_config)
+
+    # Create compilers with specified passes (defaults to no passes)
+    fw_compiler, bw_compiler = make_compiler_with_passes(compiler_passes)
+
+    # Create custom joint_graph_builder with deepseekv3-specific compilers
     deepseekv3_joint_graph_builder = functools.partial(
         joint_graph_builder,
         fw_compiler=fw_compiler,
         bw_compiler=bw_compiler,
-        joint_custom_pass=None,
+        joint_custom_pass=validate_flex_attention_annotation,
     )

     # TODO: CompiledModule should take sample input as well, so that we can

torchtitan/experiments/compiler_toolkit/graph_utils.py (87 additions, 1 deletion)

@@ -5,7 +5,7 @@
 # LICENSE file in the root directory of this source tree.

 import contextlib
-from typing import Callable, Optional
+from typing import Callable, List, Optional

 import torch
 from torch._dynamo.functional_export import dynamo_graph_capture_for_export
@@ -16,6 +16,7 @@
 )
 from torch._guards import tracing, TracingContext
 from torch.distributed.tensor import DTensor
+from torchtitan.config import JobConfig
 from torchtitan.distributed import ParallelDims
 from torchtitan.tools.logging import logger

@@ -180,3 +181,88 @@ def forward(self, *args, **kwargs):
         # calling the line below returns control to torchtitan's runner
         # letting it call the backward, and optimizer.
         return self.joint_graph_module(args, kwargs)
+
+
+# Default compiler pass configuration - no passes by default
+DEFAULT_COMPILER_PASSES = []
+
+
+def compiler(
+    name: str,
+    gm: torch.fx.GraphModule,
+    example_inputs,
+    passes: List[Callable] = None,
+):
+    """
+    Compile a graph module by applying a sequence of compiler passes.
+
+    Args:
+        name: Name for logging purposes
+        gm: The graph module to compile
+        example_inputs: Example inputs for the graph module
+        passes: List of compiler pass functions to apply. Each function should take
+            (gm, example_inputs) and return a transformed gm. If None, uses
+            DEFAULT_COMPILER_PASSES.
+    """
+    if passes is None:
+        passes = DEFAULT_COMPILER_PASSES
+
+    logger.info(f"{name} before compiler:")
+    logger.info(gm.print_readable(print_output=False))
+
+    for pass_fn in passes:
+        logger.info(f"Applying pass: {pass_fn.__name__}")
+        gm = pass_fn(gm, example_inputs)
+
+    logger.info(f"{name} after compiler:")
+    logger.info(gm.print_readable(print_output=False))
+    return gm
+
+
+def make_compiler_with_passes(passes: List[Callable] = None):
+    """
+    Create forward and backward compilers with specified passes.
+
+    Args:
+        passes: List of compiler pass functions to apply. If None, uses DEFAULT_COMPILER_PASSES.
+
+    Returns:
+        Tuple of (fw_compiler, bw_compiler) functions
+    """
+
+    def fw_compiler(gm: torch.fx.GraphModule, example_inputs) -> None:
+        return compiler("fwd_gm", gm, example_inputs, passes=passes)
+
+    def bw_compiler(gm: torch.fx.GraphModule, example_inputs) -> None:
+        return compiler("bwd_gm", gm, example_inputs, passes=passes)
+
+    return fw_compiler, bw_compiler
+
+
+def get_compiler_passes_from_config(job_config: JobConfig):
+    """
+    Extract and validate compiler passes from job config.
+
+    Args:
+        job_config: Job configuration containing compile.passes
+
+    Returns:
+        List of compiler pass functions
+    """
+    from torchtitan.experiments.compiler_toolkit.passes import AVAILABLE_PASSES
+
+    pass_names = getattr(job_config.compile, "passes", [])
+    compiler_passes = []
+
+    for pass_name in pass_names:
+        if pass_name not in AVAILABLE_PASSES:
+            raise ValueError(
+                f"Unknown compiler pass: {pass_name}. "
+                f"Available passes: {list(AVAILABLE_PASSES.keys())}"
+            )
+        compiler_passes.append(AVAILABLE_PASSES[pass_name])
+
+    if pass_names:
+        logger.info(f"Using compiler passes from config: {pass_names}")
+
+    return compiler_passes
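For reference, a minimal sketch of the calling convention these helpers expect, mirroring what the llama3 and deepseek_v3 `parallelize` functions below do; the no-op pass and the `my_joint_graph_builder` name are illustrative only and not part of this commit.

```python
# Minimal sketch of the (gm, example_inputs) -> gm pass contract and of wiring
# the resulting compilers into joint_graph_builder.
import functools

import torch

from torchtitan.experiments.compiler_toolkit.graph_utils import (
    joint_graph_builder,
    make_compiler_with_passes,
)


def noop_pass(gm: torch.fx.GraphModule, example_inputs) -> torch.fx.GraphModule:
    # A pass takes (gm, example_inputs) and returns a (possibly transformed) gm.
    return gm


# Compilers that apply the pass to both forward and backward graphs.
fw_compiler, bw_compiler = make_compiler_with_passes([noop_pass])

# Model-specific joint graph builder, as the parallelize functions construct it.
my_joint_graph_builder = functools.partial(
    joint_graph_builder,
    fw_compiler=fw_compiler,
    bw_compiler=bw_compiler,
)
```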
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
from dataclasses import dataclass, field
8+
9+
10+
@dataclass
11+
class Compile:
12+
"""
13+
List of compiler pass names to apply in the compiler toolkit workflow.
14+
By default, no passes are applied.
15+
Example: --compile.passes autobucketing_reordering,regional_inductor
16+
"""
17+
18+
passes: list[str] = field(default_factory=list)
19+
20+
21+
@dataclass
22+
class JobConfig:
23+
compile: Compile = field(default_factory=Compile)
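This custom `JobConfig` extension is activated via `--job.custom_config_module=torchtitan.experiments.compiler_toolkit.job_config` (see the README above), after which `job_config.compile.passes` is the list that `get_compiler_passes_from_config` reads. A small sketch of the resulting defaults; constructing the extension dataclasses directly here is for illustration only, since torchtitan normally merges them into its own job config.

```python
# Illustration only: default and explicit values of the Compile extension.
from torchtitan.experiments.compiler_toolkit.job_config import Compile, JobConfig

cfg = JobConfig()
assert cfg.compile.passes == []  # no passes by default

cfg = JobConfig(
    compile=Compile(passes=["autobucketing_reordering", "regional_inductor"])
)
# get_compiler_passes_from_config(cfg) would map each name through AVAILABLE_PASSES.
```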

torchtitan/experiments/compiler_toolkit/llama3/parallelize.py (19 additions, 43 deletions)

@@ -8,9 +8,6 @@
 import functools

 import torch
-from torch._inductor.fx_passes.overlap_scheduling import schedule_overlap_bucketing
-
-from torch.fx.passes.regional_inductor import regional_inductor
 from torch.fx.traceback import annotate_fn

 from torchtitan.config import JobConfig
@@ -19,56 +16,19 @@
     disable_compile,
     parallelize_inputs,
     register_blockmask_pytree_node,
+    validate_flex_attention_annotation,
 )

 from torchtitan.experiments.compiler_toolkit.graph_utils import (
     CompiledModule,
+    get_compiler_passes_from_config,
     joint_graph_builder,
+    make_compiler_with_passes,
 )
 from torchtitan.experiments.simple_fsdp.llama3.parallelize import (
     parallelize_llama as simple_fsdp_parallelize_llama,
 )

-from torchtitan.tools.logging import logger
-
-
-# TODO: support passing configs into schedule_overlap_bucketing
-def autobucketing_reordering_pass(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
-    schedule_overlap_bucketing(gm, collective_bucketing=True)
-    gm.recompile()
-    return gm
-
-
-def compiler(name: str, gm: torch.fx.GraphModule, example_inputs):
-    logger.info(f"{name} before compiler:")
-    logger.info(gm.print_readable(print_output=False))
-
-    gm = autobucketing_reordering_pass(gm)
-
-    gm = regional_inductor(gm, example_inputs)
-
-    logger.info(f"{name} after compiler:")
-    logger.info(gm.print_readable(print_output=False))
-    return gm
-
-
-def fw_compiler(gm: torch.fx.GraphModule, example_inputs) -> None:
-    return compiler("fwd_gm", gm, example_inputs)
-
-
-def bw_compiler(gm: torch.fx.GraphModule, example_inputs) -> None:
-    return compiler("bwd_gm", gm, example_inputs)
-
-
-def validate_flex_attention_annotation(joint_with_descriptors):
-    """Verify user annotations show up in the graph."""
-    for node in joint_with_descriptors.graph_module.graph.nodes:
-        if node.target in {
-            torch.ops.higher_order.flex_attention,
-            torch.ops.higher_order.flex_attention_backward,
-        }:
-            assert "compile_with_inductor" in node.meta.get("custom", {})
-

 def annotate_llama() -> None:
     from torchtitan.models.attention import FlexAttentionWrapper
@@ -84,7 +44,17 @@ def parallelize_llama(
     parallel_dims: ParallelDims,
     job_config: JobConfig,
 ) -> CompiledModule:
+    """
+    Parallelize and compile a Llama model with optional custom compiler passes.
+
+    Args:
+        model: The model to parallelize
+        parallel_dims: Parallel dimensions configuration
+        job_config: Job configuration

+    Returns:
+        CompiledModule wrapping the parallelized and compiled model
+    """
     annotate_llama()

     register_blockmask_pytree_node()
@@ -93,6 +63,12 @@ def parallelize_llama(
     with disable_compile(job_config):
         model = simple_fsdp_parallelize_llama(model, parallel_dims, job_config)

+    # Get compiler passes from config
+    compiler_passes = get_compiler_passes_from_config(job_config)
+
+    # Create compilers with specified passes (defaults to no passes)
+    fw_compiler, bw_compiler = make_compiler_with_passes(compiler_passes)
+
     # Create custom joint_graph_builder with llama-specific compilers and validation
     llama_joint_graph_builder = functools.partial(
         joint_graph_builder,
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
"""
8+
Compiler passes for the compiler toolkit.
9+
10+
This module provides various compiler passes that can be applied to graph modules
11+
during compilation. Passes can be selected and configured via job config.
12+
"""
13+
14+
import torch
15+
from torch._inductor.fx_passes.overlap_scheduling import schedule_overlap_bucketing
16+
from torch.fx.passes.regional_inductor import regional_inductor
17+
18+
19+
def autobucketing_reordering_pass(
20+
gm: torch.fx.GraphModule, example_inputs=None
21+
) -> torch.fx.GraphModule:
22+
"""
23+
Apply autobucketing and reordering optimization.
24+
25+
This pass applies schedule_overlap_bucketing with collective_bucketing enabled
26+
to optimize communication patterns in distributed training.
27+
"""
28+
schedule_overlap_bucketing(gm, collective_bucketing=True)
29+
gm.recompile()
30+
return gm
31+
32+
33+
def regional_inductor_pass(
34+
gm: torch.fx.GraphModule, example_inputs
35+
) -> torch.fx.GraphModule:
36+
"""
37+
Apply regional inductor compilation based on user annotation.
38+
"""
39+
return regional_inductor(gm, example_inputs)
40+
41+
42+
# Registry mapping pass names to pass functions
43+
AVAILABLE_PASSES = {
44+
"autobucketing_reordering": autobucketing_reordering_pass,
45+
"regional_inductor": regional_inductor_pass,
46+
}
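Since `AVAILABLE_PASSES` is a plain name-to-function registry and every entry follows the same `(gm, example_inputs) -> gm` contract, additional passes can be registered alongside the built-in ones. A sketch under that assumption; `my_debug_pass` and its behavior are hypothetical and not part of this commit.

```python
# Hypothetical extension (not part of this commit): register an extra pass so it
# could be selected with --compile.passes my_debug_pass.
import torch

from torchtitan.experiments.compiler_toolkit.passes import AVAILABLE_PASSES


def my_debug_pass(gm: torch.fx.GraphModule, example_inputs) -> torch.fx.GraphModule:
    # Example-only pass: report the node count and return the graph unchanged.
    print(f"graph has {len(list(gm.graph.nodes))} nodes")
    return gm


AVAILABLE_PASSES["my_debug_pass"] = my_debug_pass
```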
